emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] Unicode Lisp reader escapes


From: Oliver Scholz
Subject: Re: [PATCH] Unicode Lisp reader escapes
Date: Fri, 05 May 2006 19:23:36 +0200
User-agent: Gnus/5.11 (Gnus v5.11) Emacs/22.0.50 (gnu/linux)

For what it's worth, I just tried the attached little stress test on
an updated C port of `decode-char' in order to check whether it
returns equivalent results. It does. (Well, except intentional
differences like that `ucs_to_internal' throws an error where
`decode-char' returns nil.)

Basically the test runs through all positive integers up to MAX_CHAR
and inserts an alist into a temp buffer with each car being the
integer and each cdr being a character in the \u syntax (e.g.
`?\u3b1'). It then reads that alist again and checks whether
`decode-char' on its car is `eq' to its cdr. I tried it with and
without `utf-translate-cjk-mode' and with and without
`utf-fragment-on-decoding'. Since all tests succeed, ucs_to_internal
and `decode-char' are functionally equivalent on all supported
characters.

The test: 

Attachment: ucs-test.el
Description: application/emacs-lisp

The updated patch: 
Index: src/lread.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/lread.c,v
retrieving revision 1.350
diff -u -r1.350 lread.c
--- src/lread.c 27 Feb 2006 02:04:35 -0000      1.350
+++ src/lread.c 5 May 2006 17:09:37 -0000
@@ -87,6 +87,9 @@
 Lisp_Object Qbackquote, Qcomma, Qcomma_at, Qcomma_dot, Qfunction;
 Lisp_Object Qinhibit_file_name_operation;
 Lisp_Object Qeval_buffer_list, Veval_buffer_list;
+Lisp_Object Qutf_translate_cjk_mode, Qutf_translate_cjk_lang_env, 
Qutf_translate_cjk_load_tables;
+Lisp_Object Qutf_subst_table_for_decode, Qtranslation_hash_table;
+Lisp_Object Qutf_translation_table_for_decode, Qtranslation_table;
 
 extern Lisp_Object Qevent_symbol_element_mask;
 extern Lisp_Object Qfile_exists_p;
@@ -1731,6 +1734,110 @@
   return str[0];
 }
 
+
+#define READ_HEX_ESCAPE(i, c)                                         \
+  while (1)                                                           \
+    {                                                                 \
+      c = READCHAR;                                                   \
+      if (c >= '0' && c <= '9')                                       \
+        {                                                             \
+          i *= 16;                                                    \
+          i += c - '0';                                               \
+        }                                                             \
+      else if ((c >= 'a' && c <= 'f')                                 \
+               || (c >= 'A' && c <= 'F'))                             \
+        {                                                             \
+          i *= 16;                                                    \
+          if (c >= 'a' && c <= 'f')                                   \
+            i += c - 'a' + 10;                                        \
+          else                                                        \
+            i += c - 'A' + 10;                                        \
+        }                                                             \
+      else                                                            \
+        {                                                             \
+          UNREAD (c);                                                 \
+          break;                                                      \
+        }                                                             \
+    }
+
+
+
+/* Return the internal character coresponding to an UCS code point.*/
+
+int
+ucs_to_internal (ucs)
+     int ucs;
+{
+  int c = 0;
+  Lisp_Object tmp_char;
+
+  if (! EQ (Qnil, SYMBOL_VALUE (Qutf_translate_cjk_mode)))
+    /* cf. `utf-lookup-subst-table-for-decode' */
+    {
+      Lisp_Object hash;
+      
+      if (EQ (Qnil, SYMBOL_VALUE (Qutf_translate_cjk_lang_env)))
+        call0 (Qutf_translate_cjk_load_tables);
+
+      hash = Fget (Qutf_subst_table_for_decode, Qtranslation_hash_table);
+
+      if (HASH_TABLE_P (hash))
+        {
+          tmp_char = Fgethash (make_number (ucs), hash, Qnil);
+          if (! EQ (Qnil, tmp_char))
+            {
+              CHECK_NUMBER (tmp_char);
+              c = XFASTINT (tmp_char);
+            }
+        }
+    }
+
+  if (c)
+    /* We found the character already in the translation hash table.
+       Do nothing. */
+    ;
+  else if (ucs < 160)
+    c = ucs;
+  else if (ucs < 256)
+    c = MAKE_CHAR (charset_latin_iso8859_1, ucs, 0);
+  else if (ucs < 0x2500)
+    {
+      ucs -= 0x0100;
+      c = MAKE_CHAR (charset_mule_unicode_0100_24ff,
+                     ((ucs / 96) + 32),
+                     ((ucs % 96) + 32));
+    }
+    else if (ucs < 0x3400)
+    {
+      ucs -= 0x2500;
+      c = MAKE_CHAR (charset_mule_unicode_2500_33ff,
+                     ((ucs / 96) + 32),
+                     ((ucs % 96) + 32));
+    }
+    else if ((ucs >= 0xE000) && (ucs < 0x10000))
+      {
+        ucs -= 0xE000;
+        c = MAKE_CHAR (charset_mule_unicode_e000_ffff,
+                       ((ucs / 96) + 32),
+                       ((ucs % 96) + 32));
+      }
+  
+  if (c || ucs == 0) /* U+0000 is also a valid character. */
+    {
+      Lisp_Object vect = Fget (Qutf_translation_table_for_decode,
+                               Qtranslation_table);
+      if (CHAR_TABLE_P (vect))
+        {
+          tmp_char = Faref (vect, make_number (c));
+          if (! EQ (Qnil, tmp_char))
+            return XFASTINT (tmp_char);
+        }
+      return c;
+    }
+  else error ("Invalid or unsupported UCS character: %x", ucs);
+}
+
+      
 /* Read a \-escape sequence, assuming we already read the `\'.
    If the escape sequence forces unibyte, store 1 into *BYTEREP.
    If the escape sequence forces multibyte, store 2 into *BYTEREP.
@@ -1879,34 +1986,23 @@
       /* A hex escape, as in ANSI C.  */
       {
        int i = 0;
-       while (1)
-         {
-           c = READCHAR;
-           if (c >= '0' && c <= '9')
-             {
-               i *= 16;
-               i += c - '0';
-             }
-           else if ((c >= 'a' && c <= 'f')
-                    || (c >= 'A' && c <= 'F'))
-             {
-               i *= 16;
-               if (c >= 'a' && c <= 'f')
-                 i += c - 'a' + 10;
-               else
-                 i += c - 'A' + 10;
-             }
-           else
-             {
-               UNREAD (c);
-               break;
-             }
-         }
-
+        READ_HEX_ESCAPE (i, c);
        *byterep = 2;
        return i;
       }
 
+    case 'u':
+      /* A hexadecimal reference to an UCS character. */
+      {
+        int i = 0;
+        
+        READ_HEX_ESCAPE (i, c);
+        *byterep = 2;
+
+        return ucs_to_internal (i);
+
+      }
+
     default:
       if (BASE_LEADING_CODE_P (c))
        c = read_multibyte (c, readcharfun);
@@ -4121,6 +4217,27 @@
 
   Vloads_in_progress = Qnil;
   staticpro (&Vloads_in_progress);
+
+  Qutf_translate_cjk_mode = intern ("utf-translate-cjk-mode");
+  staticpro (&Qutf_translate_cjk_mode);
+  
+  Qutf_translate_cjk_lang_env = intern ("utf-translate-cjk-lang-env");
+  staticpro (&Qutf_translate_cjk_lang_env);
+  
+  Qutf_translate_cjk_load_tables = intern ("utf-translate-cjk-load-tables");
+  staticpro (&Qutf_translate_cjk_load_tables);
+  
+  Qutf_subst_table_for_decode = intern ("utf-subst-table-for-decode");
+  staticpro (&Qutf_subst_table_for_decode);
+  
+  Qtranslation_hash_table = intern ("translation-hash-table");
+  staticpro (&Qutf_subst_table_for_decode);
+
+  Qutf_translation_table_for_decode = intern 
("utf-translation-table-for-decode");
+  staticpro (&Qutf_translation_table_for_decode);
+  
+  Qtranslation_table = intern ("translation-table");
+  staticpro (&Qtranslation_table);
 }
 
 /* arch-tag: a0d02733-0f96-4844-a659-9fd53c4f414d

    Oliver
-- 
16 Floréal an 214 de la Révolution
Liberté, Egalité, Fraternité!

reply via email to

[Prev in Thread] Current Thread [Next in Thread]