[Emacs-diffs] master de7d5f3 1/5: Implement named character escapes, sim

emacs-diffs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] master de7d5f3 1/5: Implement named character escapes, sim

From:	Paul Eggert
Subject:	[Emacs-diffs] master de7d5f3 1/5: Implement named character escapes, similar to Perl
Date:	Fri, 22 Apr 2016 02:29:45 +0000

branch: master
commit de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73
Author: Philipp Stephani <address@hidden>
Commit: Paul Eggert <address@hidden>

    Implement named character escapes, similar to Perl
    
    * lread.c (init_character_names): New function.
    (read_escape): Read Perl-style named character escape sequences.
    (syms_of_lread): Initialize new variable 'character_names'.
    * test/src/lread-tests.el (lread-char-empty-name): Add test file
    for src/lread.c.
---
 src/lread.c             |   96 +++++++++++++++++++++++++++++++++++++++++++++++
 test/src/lread-tests.el |   54 ++++++++++++++++++++++++++
 2 files changed, 150 insertions(+)

diff --git a/src/lread.c b/src/lread.c
index fedfcb8..9fa46a8 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -43,6 +43,7 @@ along with GNU Emacs.  If not, see 
<http://www.gnu.org/licenses/>.  */
 #include "systime.h"
 #include "termhooks.h"
 #include "blockinput.h"
+#include <c-ctype.h>
 
 #ifdef MSDOS
 #include "msdos.h"
@@ -2149,6 +2150,36 @@ grow_read_buffer (void)
                         MAX_MULTIBYTE_LENGTH, -1, 1);
 }
 
+/* Hash table that maps Unicode character names to code points.  */
+static Lisp_Object character_names;
+
+/* Length of the longest Unicode character name, in bytes. */
+static ptrdiff_t max_character_name_length;
+
+/* Initializes `character_names' and `max_character_name_length'.
+   Called by `read_escape'.  */
+void init_character_names ()
+{
+  character_names = CALLN (Fmake_hash_table,
+                           QCtest, Qequal,
+                           /* Currently around 100,000 Unicode
+                              characters are defined.  */
+                           QCsize, make_natnum (100000));
+  const Lisp_Object get_property =
+    Fsymbol_function (intern_c_string ("get-char-code-property"));
+  ptrdiff_t length = 0;
+  for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
+    {
+      const Lisp_Object code = make_natnum (i);
+      const Lisp_Object name = call2 (get_property, code, Qname);
+      if (NILP (name)) continue;
+      CHECK_STRING (name);
+      length = max (length, SBYTES (name));
+      Fputhash (name, code, character_names);
+    }
+  max_character_name_length = length;
+}
+
 /* Read a \-escape sequence, assuming we already read the `\'.
    If the escape sequence forces unibyte, return eight-bit char.  */
 
@@ -2356,6 +2387,68 @@ read_escape (Lisp_Object readcharfun, bool stringp)
        return i;
       }
 
+    case 'N':
+      /* Named character.  */
+      {
+        c = READCHAR;
+        if (c != '{')
+          invalid_syntax ("Expected opening brace after \\N");
+        if (NILP (character_names))
+          init_character_names ();
+        USE_SAFE_ALLOCA;
+        char *name = SAFE_ALLOCA (max_character_name_length + 1);
+        bool whitespace = false;
+        ptrdiff_t length = 0;
+        while (true)
+          {
+            c = READCHAR;
+            if (c < 0)
+              end_of_file_error ();
+            if (c == '}')
+              break;
+            if (! c_isascii (c))
+              xsignal1 (Qinvalid_read_syntax,
+                        CALLN (Fformat,
+                               build_pure_c_string ("Non-ASCII character 
U+%04X"
+                                                    " in character name"),
+                               make_natnum (c)));
+            /* We treat multiple adjacent whitespace characters as a
+               single space character.  This makes it easier to use
+               character names in e.g. multi-line strings.  */
+            if (c_isspace (c))
+              {
+                if (! whitespace)
+                  {
+                    whitespace = true;
+                    name[length++] = ' ';
+                  }
+              }
+            else
+              {
+                whitespace = false;
+                name[length++] = c;
+              }
+            if (length >= max_character_name_length)
+              invalid_syntax ("Character name too long");
+          }
+        if (length == 0)
+          invalid_syntax ("Empty character name");
+        name[length] = 0;
+        const Lisp_Object lisp_name = make_unibyte_string (name, length);
+        const Lisp_Object code =
+          (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
+          /* Code point as U+N, where N is between 1 and 8 hexadecimal
+             digits.  */
+          string_to_number (name + 2, 16, false) :
+          Fgethash (lisp_name, character_names, Qnil);
+        SAFE_FREE ();
+        if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
+          xsignal1 (Qinvalid_read_syntax,
+                    CALLN (Fformat,
+                           build_pure_c_string ("\\N{%s}"), lisp_name));
+        return XINT (code);
+      }
+
     default:
       return c;
     }
@@ -4744,4 +4837,7 @@ that are loaded before your customizations are read!  */);
   DEFSYM (Qweakness, "weakness");
   DEFSYM (Qrehash_size, "rehash-size");
   DEFSYM (Qrehash_threshold, "rehash-threshold");
+
+  character_names = Qnil;
+  staticpro (&character_names);
 }
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
new file mode 100644
index 0000000..1f87334
--- /dev/null
+++ b/test/src/lread-tests.el
@@ -0,0 +1,54 @@
+;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2016  Google Inc.
+
+;; Author: Philipp Stephani <address@hidden>
+
+;; This file is part of GNU Emacs.
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; Unit tests for code in src/lread.c.
+
+;;; Code:
+
+(ert-deftest lread-char-number ()
+  (should (equal ?\N{U+A817} #xA817)))
+
+(ert-deftest lread-char-name ()
+  (should (equal ?\N{SYLOTI  NAGRI LETTER
+                 DHO}
+                 #xA817)))
+
+(ert-deftest lread-char-invalid-number ()
+  (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax))
+
+(ert-deftest lread-char-invalid-name ()
+  (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax)
+
+(ert-deftest lread-char-non-ascii-name ()
+  (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax)
+
+(ert-deftest lread-char-empty-name ()
+  (should-error (read "?\\N{}")) 'invalid-read-syntax)
+
+(ert-deftest lread-string-char-number ()
+  (should (equal "a\N{U+A817}b" "a\uA817b")))
+
+(ert-deftest lread-string-char-name ()
+  (should (equal "a\N{SYLOTI NAGRI  LETTER DHO}b" "a\uA817b")))
+
+;;; lread-tests.el ends here

[Prev in Thread]

Current Thread

[Next in Thread]

[Emacs-diffs] master updated (7621a52 -> bd1c7ca), Paul Eggert, 2016/04/21
- [Emacs-diffs] master e7cb38e 4/5: Use 'ucs-names' for character name escapes, Paul Eggert, 2016/04/21
- [Emacs-diffs] master 753c875 3/5: Minor cleanups for character name escapes, Paul Eggert, 2016/04/21
- [Emacs-diffs] master de7d5f3 1/5: Implement named character escapes, similar to Perl, Paul Eggert <=
- [Emacs-diffs] master a58d4e3 2/5: Add documentation for character name escapes, Paul Eggert, 2016/04/21
- [Emacs-diffs] master bd1c7ca 5/5: Improve character name escapes, Paul Eggert, 2016/04/21

Prev by Date: [Emacs-diffs] master 753c875 3/5: Minor cleanups for character name escapes
Next by Date: [Emacs-diffs] master a58d4e3 2/5: Add documentation for character name escapes
Previous by thread: [Emacs-diffs] master 753c875 3/5: Minor cleanups for character name escapes
Next by thread: [Emacs-diffs] master a58d4e3 2/5: Add documentation for character name escapes
Index(es):
- Date
- Thread