Re: [PATCH] Unicode Lisp reader escapes

emacs-devel
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH] Unicode Lisp reader escapes

From:	Oliver Scholz
Subject:	Re: [PATCH] Unicode Lisp reader escapes
Date:	Thu, 04 May 2006 10:23:46 +0200
User-agent:	Gnus/5.11 (Gnus v5.11) Emacs/22.0.50 (gnu/linux)
For what it's worth, I made a stab at implementing \u analogous to
\x---including a port of the core functionality of `decode-char' to C.

As for the current discussion: I regard both e.g. \u3b1 and
(decode-char 'ucs #x3b1) as a means to say "Give me that abstract
character---the greek letter alpha---I don't care about your internal
encoding, *just use your defaults*, but give me that character." So,
effectively the respective functions should deal with fragmentation
and the like. It would matter, for instance, if the fontset specifies
different glyphs for the same abstract character depending on the
charsets.

But I see Eli's point. Ideally, the conversion (to ISO 8859-X)
wouldn't take place when reading the string, but when it is
displayed/inserted into a buffer. Logically, because that's when the
difference between abstract character and internal representation
should become effective. Practically, because: if the user loads a
library containing strings with \u escapes (or `decode-char'
expressions eval'ed at load-time) and *then* customises the value of
`utf-fragment-on-decoding', the change won't affect those characters.
However, I believe that this is a minor obscurity rather than a bug; I
don't believe that anybody would be seriously bitten by it.

    Oliver

Here's the patch, only slightly tested:

Index: src/lread.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/lread.c,v
retrieving revision 1.350
diff -u -r1.350 lread.c
--- src/lread.c 27 Feb 2006 02:04:35 -0000      1.350
+++ src/lread.c 4 May 2006 08:00:53 -0000
@@ -1731,6 +1731,102 @@
   return str[0];
 }
 
+/* Accumulate hex digits from READCHAR into I; UNREAD the first non-hex char.  */
+#define READ_HEX_ESCAPE(i, c)                                         \
+  while (1)                                                           \
+    {                                                                 \
+      c = READCHAR;                                                   \
+      if (c >= '0' && c <= '9')                                       \
+        {                                                             \
+          i *= 16;                                                    \
+          i += c - '0';                                               \
+        }                                                             \
+      else if ((c >= 'a' && c <= 'f')                                 \
+               || (c >= 'A' && c <= 'F'))                             \
+        {                                                             \
+          i *= 16;                                                    \
+          if (c >= 'a' && c <= 'f')                                   \
+            i += c - 'a' + 10;                                        \
+          else                                                        \
+            i += c - 'A' + 10;                                        \
+        }                                                             \
+      else                                                            \
+        {                                                             \
+          UNREAD (c);                                                 \
+          break;                                                      \
+        }                                                             \
+    }
+
+/* Return the internal character corresponding to the UCS code point UCS,
+   or signal an error if there is none.  NOTE(review): C == 0 doubles as
+   "not found", so UCS 0 also signals an error -- confirm this is intended.  */
+
+int
+ucs_to_internal (ucs)
+     int ucs;
+{
+  int c = 0;
+  Lisp_Object tmp_char;
+
+  if (! EQ (Qnil, SYMBOL_VALUE (intern ("utf-translate-cjk-mode"))))
+    /* cf. `utf-lookup-subst-table-for-decode' */
+    {
+      if (EQ (Qnil, SYMBOL_VALUE (intern ("utf-translate-cjk-lang-env"))))
+        call0 (intern ("utf-translate-cjk-load-tables"));
+      tmp_char = Fgethash (make_number (ucs),
+                           Fget (intern ("utf-subst-table-for-decode"),
+                                 intern ("translation-hash-table")),
+                           Qnil);
+      if (! EQ (Qnil, tmp_char))
+        {
+          CHECK_NUMBER (tmp_char);
+          c = XFASTINT (tmp_char);
+        }
+    }
+
+  if (c)
+    /* We found the character already in the translation hash table.
+       Do nothing. */
+    ;
+  else if (ucs < 160)
+    c = ucs;
+  else if (ucs < 256)
+    c = MAKE_CHAR (charset_latin_iso8859_1, ucs, 0);
+  else if (ucs < 0x2500)
+    {
+      ucs -= 0x0100;
+      c = MAKE_CHAR (charset_mule_unicode_0100_24ff,
+                     ((ucs / 96) + 32),
+                     ((ucs % 96) + 32));
+    }
+    else if (ucs < 0x3400)
+    {
+      ucs -= 0x2500;
+      c = MAKE_CHAR (charset_mule_unicode_2500_33ff,
+                     ((ucs / 96) + 32),
+                     ((ucs % 96) + 32));
+    }
+    else if ((ucs >= 0xE000) && (ucs < 0x10000))
+      {
+        ucs -= 0xE000;
+        c = MAKE_CHAR (charset_mule_unicode_e000_ffff,
+                       ((ucs / 96) + 32),
+                       ((ucs % 96) + 32));
+      }
+  
+  if (c)
+    {
+      Lisp_Object vect = Fget (intern ("utf-translation-table-for-decode"),
+                               intern ("translation-table"));
+      tmp_char = Faref (vect, make_number (c));
+      if (! EQ (Qnil, tmp_char))
+        return XFASTINT (tmp_char);
+      return c;
+    }
+  else error ("Invalid or unsupported UCS character: %x", ucs);
+}
+
+      
 /* Read a \-escape sequence, assuming we already read the `\'.
    If the escape sequence forces unibyte, store 1 into *BYTEREP.
    If the escape sequence forces multibyte, store 2 into *BYTEREP.
@@ -1879,34 +1975,24 @@
       /* A hex escape, as in ANSI C.  */
       {
        int i = 0;
-       while (1)
-         {
-           c = READCHAR;
-           if (c >= '0' && c <= '9')
-             {
-               i *= 16;
-               i += c - '0';
-             }
-           else if ((c >= 'a' && c <= 'f')
-                    || (c >= 'A' && c <= 'F'))
-             {
-               i *= 16;
-               if (c >= 'a' && c <= 'f')
-                 i += c - 'a' + 10;
-               else
-                 i += c - 'A' + 10;
-             }
-           else
-             {
-               UNREAD (c);
-               break;
-             }
-         }
-
+        READ_HEX_ESCAPE (i, c);
        *byterep = 2;
        return i;
       }
 
+    case 'u':
+      /* A hexadecimal reference to a UCS character. */
+      {
+        int i = 0;
+        Lisp_Object lisp_char;
+        
+        READ_HEX_ESCAPE (i, c);
+        *byterep = 2;
+
+        return ucs_to_internal (i);
+
+      }
+
     default:
       if (BASE_LEADING_CODE_P (c))
        c = read_multibyte (c, readcharfun);

    
-- 
15 Floréal an 214 de la Révolution
Liberté, Egalité, Fraternité!
[Prev in Thread]
Current Thread
[Next in Thread]
Re: [PATCH] Unicode Lisp reader escapes, (continued)
Prev by Date: Re: German reference card update
Next by Date: Re: Why should interactive search results raise errors?
Previous by thread: Re: [PATCH] Unicode Lisp reader escapes
Next by thread: Re: [PATCH] Unicode Lisp reader escapes
Index(es):
- Date
- Thread