bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#24603: [PATCHv5 09/11] Implement Turkic dotless and dotted i casing


From: Michal Nazarewicz
Subject: bug#24603: [PATCHv5 09/11] Implement Turkic dotless and dotted i casing rules (bug#24603)
Date: Thu, 9 Mar 2017 22:51:48 +0100

Implement part of Unicode special handling rules for Azerbaijani and
Turkish languages, namely ‘i’ is paired with ‘İ’ while ‘ı’ is paired
with ‘I’.

* src/casefiddle.c (enum case_action): Add ‘CASE_NO_ACTION’ enum value
which is used when inside a word while CASE_CAPITALIZE_UP is requested.
(struct casing_context): Add SPECIAL_TR enum value to the special flag.
(prepare_casing_context): Set SPECIAL_TR special flag if buffer is in
Turkish or Azerbaijani.
(maybe_case_greek): Extract handling of sigma from case_character to
a new function.
(maybe_case_turkic): New function handling Turkic dotted and dotless
‘i’ casing.
(case_character_impl): Extract flag normalisation to a new function:
(normalise_flag): New function.
(case_single_character): Update after above changes.
(case_character): Rename to:
(case_characters): Make use of maybe_case_greek and maybe_case_turkic.
(do_casify_multibyte_string, do_casify_multibyte_region): Update to use
renamed case_characters.

* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test
cases for dotless and dotted i’s.
---
 src/casefiddle.c             | 305 ++++++++++++++++++++++++++++++-------------
 test/src/casefiddle-tests.el |  42 +++++-
 2 files changed, 255 insertions(+), 92 deletions(-)

diff --git a/src/casefiddle.c b/src/casefiddle.c
index d59684c7b8e..4785ebaddc4 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -30,7 +30,8 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #include "keymap.h"
 
 /* Order must match order in unidata-gen-table-special-casing. */
-enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
+enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP,
+                 /* Only for internal use: */ CASE_NO_ACTION};
 
 /* State for casing individual characters.  */
 struct casing_context {
@@ -73,7 +74,10 @@ struct casing_context {
     SPECIAL_NL,
     /* As above and the previous character was upcased ‘i’ so if we now see ‘j’
        it needs to be upcased as well. */
-    SPECIAL_NL_UPCASE_J
+    SPECIAL_NL_UPCASE_J,
+
+    /* Handle Azerbaijani and Turkish dotted and dotless i. */
+    SPECIAL_TR
   } special;
 };
 
@@ -108,32 +112,62 @@ prepare_casing_context (struct casing_context *ctx,
     case ('n' << 8) | 'l':  /* Dutch */
       if ((int) flag >= (int) CASE_CAPITALIZE)
        ctx->special = SPECIAL_NL;
+      break;
+    case ('t' << 8) | 'r':  /* Turkish */
+    case ('a' << 8) | 'z':  /* Azerbaijani */
+      ctx->special = SPECIAL_TR;
     }
 }
 
+/* Normalise CTX->flag and return CASE_UP, CASE_DOWN, CASE_CAPITALIZE or
+   CASE_NO_ACTION.  The latter is returned if CTX->flag is CASE_CAPITALIZE_UP
+   and we are inside of a word. */
+static enum case_action
+normalise_flag (struct casing_context *ctx)
+{
+  /* Normalise flag to CASE_UP, CASE_DOWN, CASE_CAPITALIZE or CASE_NO_ACTION. */
+  switch (ctx->flag) {
+  case CASE_CAPITALIZE:
+    return ctx->inword ? CASE_DOWN : CASE_CAPITALIZE;
+  case CASE_CAPITALIZE_UP:
+    return ctx->inword ? CASE_NO_ACTION : CASE_CAPITALIZE;
+  default:
+    return ctx->flag;
+  }
+}
+
 struct casing_str_buf {
   unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6];
   unsigned char len_chars;
   unsigned char len_bytes;
 };
 
-/* Based on CTX, case character CH.  If BUF is NULL, return cased character.
-   Otherwise, if BUF is non-NULL, save result in it and return whether the
-   character has been changed.
+#define RES_NOT_TOUCHED -2
+#define RES_NO_CHANGE   -1
+#define RES_CHANGED      0
+
+/* Based on CTX and FLAG, case character CH.  If BUF is NULL, return cased
+   character.  Otherwise, if BUF is non-NULL, save result in it and return
+   RES_CHANGED if the character changed or RES_NO_CHANGE if it didn’t.
+
+   FLAG may be one of CASE_UP, CASE_DOWN, CASE_CAPITALIZE (title-case if
+   possible, upper-case otherwise) or CASE_NO_ACTION.  CTX->inword is not taken
+   into account when interpreting FLAG (it may be taken into account for other
+   decisions though).
 
    Since meaning of return value depends on arguments, it’s more convenient to
-   use case_single_character or case_character instead. */
+   use case_single_character or case_characters instead. */
 static int
 case_character_impl (struct casing_str_buf *buf,
-                    struct casing_context *ctx, int ch)
+                    struct casing_context *ctx, enum case_action flag, int ch)
 {
-  enum case_action flag;
   Lisp_Object prop;
-  bool was_inword;
   int cased;
 
   /* Handle Dutch ij.  Note that SPECIAL_NL and SPECIAL_NL_UPCASE_J implies 
that
-     ctx->flag ≥ CASE_CAPITALIZE. */
+     ctx->flag ≥ CASE_CAPITALIZE.  This has to be handled prior to flag ==
+     CASE_NO_ACTION check or otherwise we won’t handle ctx->flag ==
+     CASE_CAPITALIZE_UP correctly. */
   if (ctx->special == SPECIAL_NL && ch == 'i' && !ctx->inword)
     {
       ctx->special = SPECIAL_NL_UPCASE_J;
@@ -152,18 +186,10 @@ case_character_impl (struct casing_str_buf *buf,
     }
 
   /* Update inword state */
-  was_inword = ctx->inword;
   ctx->inword = SYNTAX (ch) == Sword &&
-    (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+    (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
 
-  /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
-  if (ctx->flag == CASE_CAPITALIZE)
-    flag = (enum case_action)((int)ctx->flag - was_inword);
-  else if (ctx->flag != CASE_CAPITALIZE_UP)
-    flag = ctx->flag;
-  else if (!was_inword)
-    flag = CASE_CAPITALIZE;
-  else
+  if (flag == CASE_NO_ACTION)
     {
       cased = ch;
       goto done;
@@ -199,7 +225,7 @@ case_character_impl (struct casing_str_buf *buf,
             buf->len_chars = str->size;
             buf->len_bytes = STRING_BYTES(str);
             memcpy(buf->data, str->data, buf->len_bytes);
-            return 1;
+            return 0;
           }
         }
       }
@@ -220,7 +246,20 @@ case_character_impl (struct casing_str_buf *buf,
     return cased;
   buf->len_chars = 1;
   buf->len_bytes = CHAR_STRING (cased, buf->data);
-  return cased != ch;
+  return cased == ch ? -1 : 0;
+}
+
+/* Based on CTX, case character CH accordingly.  Update CTX as necessary.
+   Return cased character.
+
+   Special casing rules (such as upcase(fi) = FI) are not handled.  For
+   characters whose casing results in multiple code points, the character is
+   returned unchanged. */
+static inline int
+case_single_character (struct casing_context *ctx, int ch)
+{
+  enum case_action flag = normalise_flag (ctx);
+  return case_character_impl (NULL, ctx, flag, ch);
 }
 
 /* In Greek, lower case sigma has two forms: one when used in the middle and 
one
@@ -233,48 +272,126 @@ case_character_impl (struct casing_str_buf *buf,
 #define CAPITAL_SIGMA     0x03A3
 #define SMALL_SIGMA       0x03C3
 #define SMALL_FINAL_SIGMA 0x03C2
-
-/* Based on CTX, case character CH accordingly.  Update CTX as necessary.
-   Return cased character.
 
-   Special casing rules (such as upcase(fi) = FI) are not handled.  For
-   characters whose casing results in multiple code points, the character is
-   returned unchanged. */
-static inline int
-case_single_character (struct casing_context *ctx, int ch)
-{
-  return case_character_impl (NULL, ctx, ch);
+/* Save in BUF result of casing character CH if Greek casing rules apply.
+
+   If not-NULL, NEXT points to the next character in the cased string.  If NULL,
+   it is assumed current character is the last one being cased.  This is used to
+   apply some rules which depend on the succeeding state.
+
+   FLAG is a normalised flag (as returned by normalise_flag function).
+
+   Return -2 (RES_NOT_TOUCHED) if Greek rules did not apply, no changes were
+   made and other casing rules should be tried.  Otherwise, meaning of return
+   values is the same as in case_characters function. */
+static int
+maybe_case_greek (struct casing_str_buf *buf, struct casing_context *ctx,
+                 enum case_action flag, int ch, const char *next) {
+  if (flag != CASE_DOWN || ch != CAPITAL_SIGMA)
+    return RES_NOT_TOUCHED;
+
+  ch = (ctx->inword && (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+    ? SMALL_FINAL_SIGMA : SMALL_SIGMA;
+  buf->len_bytes = CHAR_STRING (ch, buf->data);
+  buf->len_chars = 1;
+  ctx->inword = true;
+  return RES_CHANGED;
 }
+
+/* Azerbaijani and Turkish have dotless and dotted i.  The upper case of i is
+   İ while the lower case of I is ı. */
+
+#define CAPITAL_DOTTED_I    0x130
+#define SMALL_DOTLESS_I     0x131
+#define COMBINING_DOT_ABOVE 0x307
 
-/* Save in BUF result of casing character CH.  Return whether casing changed 
the
-   character.
+/* Save in BUF result of casing character CH if Turkic casing rules apply.
 
    If not-NULL, NEXT points to the next character in the cased string.  If 
NULL,
    it is assumed current character is the last one being cased.  This is used 
to
    apply some rules which depend on proceeding state.
 
-   This is like case_single_character but also handles one-to-many casing
-   rules. */
-static bool
-case_character (struct casing_str_buf *buf, struct casing_context *ctx,
-               int ch, const unsigned char *next)
+   FLAG is a normalised flag (as returned by normalise_flag function).
+
+   Return -2 (RES_NOT_TOUCHED) if Turkic rules did not apply, no changes were
+   made and other casing rules should be tried.  Otherwise, meaning of return
+   values is the same as in case_characters function. */
+static int
+maybe_case_turkic (struct casing_str_buf *buf, struct casing_context *ctx,
+                  enum case_action flag, int ch, const char *next) {
+  bool dot_above = false;
+  int cased = ch;
+
+  if (flag == CASE_NO_ACTION || ctx->special != SPECIAL_TR)
+    return RES_NOT_TOUCHED;
+
+  switch (ch) {
+  case 'I':
+    if (flag == CASE_DOWN)
+      {
+       dot_above = next && STRING_CHAR (next) == COMBINING_DOT_ABOVE;
+       cased = dot_above ? 'i' : SMALL_DOTLESS_I;
+      }
+    break;
+
+  case 'i':
+    if (flag == CASE_UP || flag == CASE_CAPITALIZE)
+      cased = CAPITAL_DOTTED_I;
+    break;
+
+  case CAPITAL_DOTTED_I:
+    if (flag == CASE_DOWN)
+      cased = 'i';
+    break;
+
+  case SMALL_DOTLESS_I:
+    if (flag == CASE_UP || flag == CASE_CAPITALIZE)
+      cased = 'I';
+    break;
+
+  default:
+    return RES_NOT_TOUCHED;
+  }
+
+  ctx->inword = true;
+  buf->len_chars = 1;
+  buf->len_bytes = CHAR_STRING (cased, buf->data);
+  if (dot_above)
+    return CHAR_BYTES (COMBINING_DOT_ABOVE);
+  else
+    return ch == cased ? RES_NO_CHANGE : RES_CHANGED;
+}
+
+/* Save in BUF result of casing character CH.
+
+   If not-NULL, NEXT points to the next character in the cased string.  If NULL,
+   it is assumed current character is the last one being cased.  This is used to
+   apply some rules which depend on the succeeding state.
+
+   Return:
+   • -1 (RES_NO_CHANGE) if character has not been changed,
+   • 0 (RES_CHANGED) if the character has changed or
+   • a positive number if the character CH and the one following it (pointed to
+     by NEXT) map to the character saved in BUF.  The returned value is the
+     length in bytes of the next character.
+
+   This is like case_single_character but also handles many-to-many rules. */
+static int
+case_characters (struct casing_str_buf *buf, struct casing_context *ctx,
+                int ch, const unsigned char *next)
 {
-  bool changed, was_inword;
+  enum case_action flag = normalise_flag (ctx);
+  int ret;
 
-  was_inword = ctx->inword;
-  changed = case_character_impl (buf, ctx, ch);
+  ret = maybe_case_turkic (buf, ctx, flag, ch, next);
+  if (ret != RES_NOT_TOUCHED)
+    return ret;
 
-  /* If we have just down-cased a capital sigma and the next character no 
longer
-     has a word syntax (i.e. current character is end of word), use final
-     sigma. */
-  if (was_inword && ch == CAPITAL_SIGMA && changed &&
-      (!next || SYNTAX (STRING_CHAR (next)) != Sword))
-    {
-      buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
-      buf->len_chars = 1;
-    }
+  ret = maybe_case_greek (buf, ctx, flag, ch, next);
+  if (ret != RES_NOT_TOUCHED)
+    return ret;
 
-  return changed;
+  return case_character_impl (buf, ctx, flag, ch);
 }
 
 static Lisp_Object
@@ -321,7 +438,6 @@ do_casify_multibyte_string (struct casing_context *ctx, 
Lisp_Object obj)
   typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 
1];
 
   ptrdiff_t size = SCHARS (obj), n;
-  int ch;
   USE_SAFE_ALLOCA;
   if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
       INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
@@ -331,12 +447,17 @@ do_casify_multibyte_string (struct casing_context *ctx, 
Lisp_Object obj)
 
   const unsigned char *src = SDATA (obj);
 
-  for (n = 0; size; --size)
+  n = 0;
+  while (size)
     {
+      int ch, ret;
       if (dst_end - o < sizeof(struct casing_str_buf))
        string_overflow ();
       ch = STRING_CHAR_ADVANCE (src);
-      case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
+      ret = case_characters ((void *)o, ctx, ch, size > 1 ? src : NULL);
+      if (ret > RES_CHANGED)
+       src += ret;
+      size -= ret > RES_CHANGED ? 2 : 1;
       n += ((struct casing_str_buf *)o)->len_chars;
       o += ((struct casing_str_buf *)o)->len_bytes;
     }
@@ -487,44 +608,50 @@ do_casify_multibyte_region (struct casing_context *ctx,
   ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
   ptrdiff_t opoint = PT, added = 0;
   struct casing_str_buf buf;
-  bool changed;
-  int ch, len;
+  int ch, len_bytes, len_chars, ret;
 
-  for (; size; --size)
+  while (size)
     {
-      ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
-      changed = case_character (
+      ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len_bytes);
+      ret = case_characters (
          &buf, ctx, ch,
-         size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
-
-      if (!changed)
-       {
-         pos_byte += len;
-         ++pos;
-         continue;
-       }
+         size > 1 ? BYTE_POS_ADDR (pos_byte + len_bytes) : NULL);
+      len_chars = 1;
 
-      last = pos + buf.len_chars;
-      if (first < 0)
-       first = pos;
-
-      if (buf.len_chars == 1 && buf.len_bytes == len)
-       memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
-      else
-       {
-         /* Replace one character with the other(s), keeping text
-            properties the same.  */
-         replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
-                          (const char *) buf.data, buf.len_chars,
-                          buf.len_bytes,
-                          0);
-         added += (ptrdiff_t) buf.len_chars - 1;
-         if (opoint > pos)
-           opoint += (ptrdiff_t) buf.len_chars - 1;
-       }
-
-      pos_byte += buf.len_bytes;
-      pos += buf.len_chars;
+      switch (ret) {
+      default:
+       len_chars += 1;
+       /* FALL THROUGH */
+
+      case RES_CHANGED:
+       len_bytes += ret;
+       len_chars = ret ? 2 : 1;
+
+       last = pos + buf.len_chars;
+       if (first < 0)
+         first = pos;
+
+       if (ret == 0 && buf.len_chars == 1 && buf.len_bytes == len_bytes)
+         memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len_bytes);
+       else
+         {
+           /* Replace one character with the other(s), keeping text
+              properties the same.  */
+           replace_range_2 (pos, pos_byte, pos + len_chars, pos_byte + len_bytes,
+                            (const char *) buf.data, buf.len_chars,
+                            buf.len_bytes,
+                            0);
+           added += buf.len_chars - len_chars;
+           if (opoint > pos)
+             opoint += buf.len_chars - len_chars;
+         }
+
+       /* FALL THROUGH */
+      case RES_NO_CHANGE:
+       size -= len_chars;
+       pos += buf.len_chars;
+       pos_byte += buf.len_bytes;
+      }
     }
 
   if (PT != opoint)
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 5e38a97d256..ce1bb18dd40 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -180,8 +180,8 @@ casefiddle-tests--test-casing
   (should-not
    (with-temp-buffer
      (casefiddle-tests--test-casing
-      ;; input     upper     lower    capitalize up-initials
-      '(("Foo baR" "FOO BAR" "foo bar" "Foo Bar" "Foo BaR")
+      ;; input     upper     lower    capitalize up-initials  [locale]
+      `(("Foo baR" "FOO BAR" "foo bar" "Foo Bar" "Foo BaR")
         ("Ⅷ ⅷ" "Ⅷ Ⅷ" "ⅷ ⅷ" "Ⅷ Ⅷ" "Ⅷ Ⅷ")
         ;; "DžUNGLA" is an unfortunate result but it’s really best we can
         ;; do while still being consistent.  Hopefully, users only ever
@@ -205,7 +205,43 @@ casefiddle-tests--test-casing
 
         ;; Dutch 'ij' is capitalised as single digraph.
         ("ijsland" "IJSLAND" "ijsland" "Ijsland" "Ijsland")
-        ("ijsland" "IJSLAND" "ijsland" "IJsland" "IJsland" "nl"))))))
+        ("ijsland" "IJSLAND" "ijsland" "IJsland" "IJsland" "nl")
+
+        ;; There is a language-independent special casing rule which
+        ;; converts İ into i followed by combining dot above that’s why we
+        ;; get the weird \u0307.  Conceptually, it converts i with
+        ;; a soft-dot into an i with a hard-dot so it makes some dose of
+        ;; sense.
+        ("İstanbul" "İSTANBUL" "i\u0307stanbul" "İstanbul" "İstanbul")
+        ("İstanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" "tr")
+        ("İstanbul" "İSTANBUL" "istanbul" "İstanbul" "İstanbul" "az")
+        (,(decode-coding-string "istanbul" 'no-conversion-multibyte) ; make it multibyte
+         "ISTANBUL" "istanbul" "Istanbul" "Istanbul")
+        (,(decode-coding-string "istanbul" 'no-conversion-multibyte)
+         "İSTANBUL" "istanbul" "İstanbul" "İstanbul" "tr")
+        (,(decode-coding-string "istanbul" 'no-conversion-multibyte)
+         "İSTANBUL" "istanbul" "İstanbul" "İstanbul" "az")
+        (,(decode-coding-string "Irmak" 'no-conversion-multibyte)
+         "IRMAK" "irmak" "Irmak" "Irmak")
+        (,(decode-coding-string "Irmak" 'no-conversion-multibyte)
+         "IRMAK" "ırmak" "Irmak" "Irmak" "tr")
+        (,(decode-coding-string "Irmak" 'no-conversion-multibyte)
+         "IRMAK" "ırmak" "Irmak" "Irmak" "az")
+        ;; FIXME: We explicitly exclude ı→I mapping from the case tables
+        ;; in characters.el which is why instead of:
+        ;;("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak")
+        ;; we actually get:
+        ("ırmak" "ıRMAK" "ırmak" "Irmak" "Irmak")
+        ;; ‘But wait,’ you ask, ‘why does capitalisation work?’  This is
+        ;; because those bypass case-table and use character’s Unicode
+        ;; titlecase property.
+        ("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak" "tr")
+        ("ırmak" "IRMAK" "ırmak" "Irmak" "Irmak" "az")
+        ;; And for some combining dot above removal.
+        ("I\u0307si\u0307s" "I\u0307Sİ\u0307S" "isi\u0307s"
+         "I\u0307si\u0307s" "I\u0307si\u0307s" "tr")
+        ("I\u0307sI\u0307s" "I\u0307SI\u0307S" "isis"
+         "I\u0307sis" "I\u0307sI\u0307s" "tr"))))))
 
 (ert-deftest casefiddle-tests-casing-byte8 ()
   (should-not
-- 
2.12.0.246.ga2ecc84866-goog






reply via email to

[Prev in Thread] Current Thread [Next in Thread]