From aef09688a357c815fd3ccfbc04592717737e6c86 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 25 Apr 2016 10:41:29 -0700 Subject: [PATCH] =?UTF-8?q?New=20function=20=E2=80=98char-from-name?= =?UTF-8?q?=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This also fixes the mishandling of "\N{CJK COMPATIBILITY IDEOGRAPH-F900}", "\N{VARIATION SELECTOR-1}", etc. Problem reported by Eli Zaretskii in: http://lists.gnu.org/archive/html/emacs-devel/2016-04/msg00614.html * doc/lispref/nonascii.texi (Character Codes), etc/NEWS: Document this. * lisp/international/mule-cmds.el (char-from-name): New function. (read-char-by-name): Use it. Document that "BED" is treated as a name, not as a hexadecimal number. Reject out-of-range integers, floating-point numbers, and strings with trailing junk. * src/lread.c (character_name_to_code): Call char-from-name instead of inspecting ucs-names directly, so that we handle computed names like "VARIATION SELECTOR-1". Do not use an auto string, since char-from-name might GC. * test/src/lread-tests.el: Add tests for new behavior, and fix some old tests that were wrong. --- doc/lispref/nonascii.texi | 12 +++++++++++ etc/NEWS | 4 ++++ lisp/international/mule-cmds.el | 43 +++++++++++++++++++++++++++--------- src/lread.c | 31 +++++++++----------------- test/src/lread-tests.el | 48 +++++++++++++++++++++++++++++++++++++---- 5 files changed, 103 insertions(+), 35 deletions(-) diff --git a/doc/lispref/nonascii.texi b/doc/lispref/nonascii.texi index 0e4aa86..fd2ce32 100644 --- a/doc/lispref/nonascii.texi +++ b/doc/lispref/nonascii.texi @@ -420,6 +420,18 @@ Character Codes @end example @end defun address@hidden char-from-name string &optional ignore-case +This function returns the character whose Unicode name is @var{string}. +If @var{ignore-case} is address@hidden, case is ignored in @var{string}. +This function returns @code{nil} if @var{string} does not name a character. + address@hidden +;; U+03A3 +(= (char-from-name "GREEK CAPITAL LETTER SIGMA") #x03A3) + @result{} t address@hidden example address@hidden defun + @defun get-byte &optional pos string This function returns the byte at character position @var{pos} in the current buffer. If the current buffer is unibyte, this is literally diff --git a/etc/NEWS b/etc/NEWS index 6bdb648..e401d2d 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -391,6 +391,10 @@ compares their numerical values. According to this predicate, "foo2.png" is smaller than "foo12.png". +++ +** The new function 'char-from-name' converts a Unicode name string +to the corresponding character code. + ++++ ** New functions 'sxhash-eq' and 'sxhash-eql' return hash codes of a Lisp object suitable for use with 'eq' and 'eql' correspondingly. If two objects are 'eq' ('eql'), then the result of 'sxhash-eq' diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el index 8eb320a..2ce21a8 100644 --- a/lisp/international/mule-cmds.el +++ b/lisp/international/mule-cmds.el @@ -2978,6 +2978,27 @@ mule--ucs-names-annotation (let ((char (assoc name ucs-names))) (when char (format " (%c)" (cdr char))))) +(defun char-from-name (string &optional ignore-case) + "Return a character as a number from its Unicode name STRING. +If optional IGNORE-CASE is non-nil, ignore case in STRING. +Return nil if STRING does not name a character." + (or (cdr (assoc-string string (ucs-names) ignore-case)) + (let ((minus (string-match-p "-[0-9A-F]+\\'" string))) + (when minus + ;; Parse names like "VARIATION SELECTOR-17" and "CJK + ;; COMPATIBILITY IDEOGRAPH-F900" that are not in ucs-names. + (ignore-errors + (let* ((case-fold-search ignore-case) + (vs (string-match-p "\\`VARIATION SELECTOR-" string)) + (minus-num (string-to-number (substring string minus) + (if vs 10 16))) + (vs-offset (if vs (if (< minus-num -16) #xE00EF #xFDFF) 0)) + (code (- vs-offset minus-num)) + (name (get-char-code-property code 'name))) + (when (eq t (compare-strings string nil nil name nil nil + ignore-case)) + code))))))) + (defun read-char-by-name (prompt) "Read a character by its Unicode name or hex number string. Display PROMPT and read a string that represents a character by its @@ -2991,9 +3012,11 @@ read-char-by-name the characters whose names include that substring, not necessarily at the beginning of the name. -This function also accepts a hexadecimal number of Unicode code -point or a number in hash notation, e.g. #o21430 for octal, -#x2318 for hex, or #10r8984 for decimal." +Accept a name like \"CIRCULATION FUNCTION\", a hexadecimal +number like \"2A10\", or a number in hash notation (e.g., +\"#x2a10\" for hex, \"10r10768\" for decimal, or \"#o25020\" for +octal). Treat otherwise-ambiguous strings like \"BED\" (U+1F6CF) +as names, not numbers." (let* ((enable-recursive-minibuffers t) (completion-ignore-case t) (input @@ -3006,13 +3029,13 @@ read-char-by-name (category . unicode-name)) (complete-with-action action (ucs-names) string pred))))) (char - (cond - ((string-match-p "\\`[0-9a-fA-F]+\\'" input) - (string-to-number input 16)) - ((string-match-p "\\`#" input) - (read input)) - (t - (cdr (assoc-string input (ucs-names) t)))))) + (cond + ((char-from-name input t)) + ((string-match-p "\\`[0-9a-fA-F]+\\'" input) + (ignore-errors (string-to-number input 16))) + ((string-match-p "\\`#\\([bBoOxX]\\|[0-9]+[rR]\\)[0-9a-zA-Z]+\\'" + input) + (ignore-errors (read input)))))) (unless (characterp char) (error "Invalid character")) char)) diff --git a/src/lread.c b/src/lread.c index a42c1f6..6e97e07 100644 --- a/src/lread.c +++ b/src/lread.c @@ -2155,26 +2155,15 @@ grow_read_buffer (void) static int character_name_to_code (char const *name, ptrdiff_t name_len) { - Lisp_Object code; - - /* Code point as U+XXXX.... */ - if (name[0] == 'U' && name[1] == '+') - { - /* Pass the leading '+' to string_to_number, so that it - rejects monstrosities such as negative values. */ - code = string_to_number (name + 1, 16, false); - } - else - { - /* Look up the name in the table returned by 'ucs-names'. */ - AUTO_STRING_WITH_LEN (namestr, name, name_len); - Lisp_Object names = call0 (Qucs_names); - code = CDR (Fassoc (namestr, names)); - } - - if (! (INTEGERP (code) - && 0 <= XINT (code) && XINT (code) <= MAX_UNICODE_CHAR - && ! char_surrogate_p (XINT (code)))) + /* For "U+XXXX", pass the leading '+' to string_to_number to reject + monstrosities like "U+-0000". */ + Lisp_Object code + = (name[0] == 'U' && name[1] == '+' + ? string_to_number (name + 1, 16, false) + : call2 (Qchar_from_name, make_unibyte_string (name, name_len), Qt)); + + if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR) + || char_surrogate_p (XINT (code))) { AUTO_STRING (format, "\\N{%s}"); AUTO_STRING_WITH_LEN (namestr, name, name_len); @@ -4829,5 +4818,5 @@ that are loaded before your customizations are read! */); DEFSYM (Qrehash_size, "rehash-size"); DEFSYM (Qrehash_threshold, "rehash-threshold"); - DEFSYM (Qucs_names, "ucs-names"); + DEFSYM (Qchar_from_name, "char-from-name"); } diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el index 2ebaf49..1a82d13 100644 --- a/test/src/lread-tests.el +++ b/test/src/lread-tests.el @@ -28,15 +28,55 @@ (ert-deftest lread-char-number () (should (equal (read "?\\N{U+A817}") #xA817))) -(ert-deftest lread-char-name () +(ert-deftest lread-char-name-1 () (should (equal (read "?\\N{SYLOTI NAGRI LETTER \n DHO}") #xA817))) +(ert-deftest lread-char-name-2 () + (should (equal (read "?\\N{BED}") #x1F6CF))) +(ert-deftest lread-char-name-3 () + (should (equal (read "?\\N{U+BED}") #xBED))) +(ert-deftest lread-char-name-4 () + (should (equal (read "?\\N{VARIATION SELECTOR-1}") #xFE00))) +(ert-deftest lread-char-name-5 () + (should (equal (read "?\\N{VARIATION SELECTOR-16}") #xFE0F))) +(ert-deftest lread-char-name-6 () + (should (equal (read "?\\N{VARIATION SELECTOR-17}") #xE0100))) +(ert-deftest lread-char-name-7 () + (should (equal (read "?\\N{VARIATION SELECTOR-256}") #xE01EF))) +(ert-deftest lread-char-name-8 () + (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-F900}") #xF900))) +(ert-deftest lread-char-name-9 () + (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-FAD9}") #xFAD9))) +(ert-deftest lread-char-name-10 () + (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2F800}") #x2F800))) +(ert-deftest lread-char-name-11 () + (should (equal (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2FA1D}") #x2FA1D))) (ert-deftest lread-char-invalid-number () (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax)) -(ert-deftest lread-char-invalid-name () +(ert-deftest lread-char-invalid-name-1 () (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-2 () + (should-error (read "?\\N{VARIATION SELECTOR-0}")) :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-3 () + (should-error (read "?\\N{VARIATION SELECTOR-257}")) + :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-4 () + (should-error (read "?\\N{VARIATION SELECTOR--0}")) + :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-5 () + (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-F8FF}")) + :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-6 () + (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-FADA}")) + :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-7 () + (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2F7FF}")) + :type 'invalid-read-syntax) +(ert-deftest lread-char-invalid-name-8 () + (should-error (read "?\\N{CJK COMPATIBILITY IDEOGRAPH-2FA1E}")) + :type 'invalid-read-syntax) (ert-deftest lread-char-non-ascii-name () (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}") @@ -55,13 +95,13 @@ (should-error (read "?\\N{U+DFFF}") :type 'invalid-read-syntax)) (ert-deftest lread-string-char-number-1 () - (should (equal (read "a\\N{U+A817}b") "a\uA817bx"))) + (should (equal (read "\"a\\N{U+A817}b\"") "a\uA817b"))) (ert-deftest lread-string-char-number-2 () (should-error (read "?\\N{0.5}") :type 'invalid-read-syntax)) (ert-deftest lread-string-char-number-3 () (should-error (read "?\\N{U+-0}") :type 'invalid-read-syntax)) (ert-deftest lread-string-char-name () - (should (equal (read "a\\N{SYLOTI NAGRI LETTER DHO}b") "a\uA817b"))) + (should (equal (read "\"a\\N{SYLOTI NAGRI LETTER DHO}b\"") "a\uA817b"))) ;;; lread-tests.el ends here -- 2.5.5