[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] uniname/uniname: support character alias
From: |
Daiki Ueno |
Subject: |
[PATCH] uniname/uniname: support character alias |
Date: |
Thu, 05 Feb 2015 18:13:26 +0900 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/25.0.50 (gnu/linux) |
I need this for gettext to support Python 3.3's change to its "\N{...}"
notation in string literals, which supports aliased character names:
https://docs.python.org/3/reference/lexical_analysis.html#literals
As usual, the patch does not include generated/copied files.
To generate, do the following from the top directory of gnulib:
$ sed -e '/^$/d' -e '/^#/d' < $UCD/NameAliases.txt > \
tests/uniname/NameAliases.txt
$ (cd lib/uniname \
&& clisp gen-uninames.lisp $UCD/UnicodeData.txt uninames.h \
../../tests/uniname/NameAliases.txt)
Regards,
--
Daiki Ueno
>From bf9ae34b0c3803785b7d49f05edd6c6f6de2bbb3 Mon Sep 17 00:00:00 2001
From: Daiki Ueno <address@hidden>
Date: Thu, 5 Feb 2015 16:56:41 +0900
Subject: [PATCH] uniname/uniname: support character alias
* lib/uniname/gen-uninames.lisp (main): New argument ALIASFILE.
Register one-way mapping from aliases to codepoints in the
generated tables.
* lib/uniname/uninames.h: Regenerate.
* tests/uniname/NameAliases.txt: New file, taken from UCD 7.0.0.
* modules/uniname/uniname-tests (Files): Add
tests/uniname/NameAliases.txt.
* tests/uniname/test-uninames.c (unicode_names): Mark as static.
(ALIASLEN): Define.
(struct unicode_alias): New struct.
(unicode_aliases): New variable.
(fill_aliases): New function.
(test_alias_lookup): New test function.
(main): Run the 'test_alias_lookup' test if the second argument is
given.
* tests/uniname/test-uninames.sh: Supply NameAliases.txt as the
second argument.
---
lib/uniname/gen-uninames.lisp | 49 ++++++++++++++-----
modules/uniname/uniname-tests | 1 +
tests/uniname/test-uninames.c | 108 ++++++++++++++++++++++++++++++++++++++++-
tests/uniname/test-uninames.sh | 2 +-
4 files changed, 145 insertions(+), 15 deletions(-)
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index e7de0a1..060dda1 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -25,10 +25,13 @@
length ; number of words
)
-(defun main (inputfile outputfile)
- (declare (type string inputfile outputfile))
+(defun main (inputfile outputfile aliasfile)
+ (declare (type string inputfile outputfile aliasfile))
#+UNICODE (setq *default-file-encoding* charset:utf-8)
(let ((all-chars '())
+ (all-chars-hashed (make-hash-table :test #'equal))
+ (all-aliases '())
+ all-chars-and-aliases
(all-ranges '())
(name-index 0)
range)
@@ -53,6 +56,7 @@
(push (make-unicode-char :index name-index
:name name-string)
all-chars)
+ (setf (gethash code all-chars-hashed) (car all-chars))
;; Update the contiguous range, or start a new range.
(if (and range (= (1+ (range-end-code range)) code))
(setf (range-end-code range) code)
@@ -70,9 +74,28 @@
(if range
(push range all-ranges))
(setq all-ranges (nreverse all-ranges))
+ (when aliasfile
+ ;; Read all characters and names from the alias file.
+ (with-open-file (istream aliasfile :direction :input)
+ (loop
+ (let ((line (read-line istream nil nil)))
+ (unless line (return))
+ (let* ((i1 (position #\; line))
+ (i2 (position #\; line :start (1+ i1)))
+ (code-string (subseq line 0 i1))
+ (code (parse-integer code-string :radix 16))
+ (name-string (subseq line (1+ i1) i2))
+ (uc (gethash code all-chars-hashed)))
+ (when uc
+ (push (make-unicode-char :index (unicode-char-index uc)
+ :name name-string)
+ all-aliases)
+ ) ) ) ) ) )
+ (setq all-aliases (nreverse all-aliases)
+ all-chars-and-aliases (append all-chars all-aliases))
;; Split into words.
(let ((words-by-length (make-array 0 :adjustable t)))
- (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
+ (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars-and-aliases)))
(let ((i1 0))
(loop
(when (>= i1 (length name)) (return))
@@ -195,7 +218,7 @@
(gethash word (word-list-hashed (aref
words-by-length (length word))))
) )
;; Compute the word-indices for every unicode-char.
- (dolist (uc all-chars)
+ (dolist (uc all-chars-and-aliases)
(let ((name (unicode-char-name uc))
(indices '()))
(let ((i1 0))
@@ -215,8 +238,8 @@
)
) )
;; Sort the list of unicode-chars by word-indices.
- (setq all-chars
- (sort all-chars
+ (setq all-chars-and-aliases
+ (sort all-chars-and-aliases
(lambda (vec1 vec2)
(let ((len1 (length vec1))
(len2 (length vec2)))
@@ -235,10 +258,10 @@
) )
;; Output the word-indices.
(format ostream "static const uint16_t unicode_names[~D] = {~%"
- (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+ (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
)
(let ((i 0))
- (dolist (uc all-chars)
+ (dolist (uc all-chars-and-aliases)
(format ostream " ~{ ~D,~}"
(maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r)
1 0)))
(coerce (unicode-char-word-indices uc)
'list)
@@ -255,9 +278,9 @@
(format ostream "static const struct { uint16_t index; uint32_t
name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__
>= 7)~%__attribute__((__packed__))~%#endif~%")
(format ostream "unicode_name_to_index[~D] = {~%"
- (length all-chars)
+ (length all-chars-and-aliases)
)
- (dolist (uc all-chars)
+ (dolist (uc all-chars-and-aliases)
(format ostream " { 0x~4,'0X, ~D },"
(unicode-char-index uc)
(unicode-char-word-indices-index uc)
@@ -285,10 +308,10 @@
)
(format ostream "};~%")
(format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
- (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
)
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
- (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
)
(format ostream "static const struct { uint16_t index; uint32_t gap;
uint16_t length; } unicode_ranges[~D] = {~%"
(length all-ranges))
@@ -302,4 +325,4 @@
)
) ) )
-(main (first *args*) (second *args*))
+(main (first *args*) (second *args*) (third *args*))
diff --git a/modules/uniname/uniname-tests b/modules/uniname/uniname-tests
index 305e6a1..512b035 100644
--- a/modules/uniname/uniname-tests
+++ b/modules/uniname/uniname-tests
@@ -2,6 +2,7 @@ Files:
tests/uniname/test-uninames.sh
tests/uniname/test-uninames.c
tests/uniname/UnicodeDataNames.txt
+tests/uniname/NameAliases.txt
Depends-on:
xalloc
diff --git a/tests/uniname/test-uninames.c b/tests/uniname/test-uninames.c
index f8fb077..eccf2f4 100644
--- a/tests/uniname/test-uninames.c
+++ b/tests/uniname/test-uninames.c
@@ -27,7 +27,19 @@
/* The names according to the UnicodeData.txt file, modified to contain the
Hangul syllable names, as described in the Unicode 3.0 book. */
-const char * unicode_names [0x110000];
+static const char * unicode_names [0x110000];
+
+/* Maximum entries in unicode_aliases. */
+#define ALIASLEN 0x200
+
+/* The aliases according to the NameAliases.txt file. */
+struct unicode_alias
+{
+ const char *name; /* Alias name: xstrdup'ed copy of the second field. */
+ unsigned int uc; /* Code point (hex first field); UNINAME_INVALID if unused. */
+};
+
+static struct unicode_alias unicode_aliases [ALIASLEN];
/* Maximum length of a field in the UnicodeData.txt file. */
#define FIELDLEN 120
@@ -113,6 +125,62 @@ fill_names (const char *unicodedata_filename)
}
}
+/* Stores in unicode_aliases[] the (code point, alias name) pairs read from
+   the NameAliases.txt file NAMEALIASES_FILENAME, at most ALIASLEN of them.
+   Unused slots keep uc == UNINAME_INVALID.  Exits on I/O or format errors.  */
+static void
+fill_aliases (const char *namealiases_filename)
+{
+  int i;
+  FILE *stream;
+  char field0[FIELDLEN];
+  char field1[FIELDLEN];
+  int lineno = 0;
+
+  for (i = 0; i < ALIASLEN; i++)
+    unicode_aliases[i].uc = UNINAME_INVALID;
+
+  stream = fopen (namealiases_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", namealiases_filename);
+      exit (EXIT_FAILURE);
+    }
+
+  for (i = 0; i < ALIASLEN; i++)
+    {
+      int n, c;
+      unsigned long uc; /* Holds strtoul's result untruncated.  */
+
+      lineno++;
+      n = getfield (stream, field0, ';');
+      n += getfield (stream, field1, ';');
+      if (n == 0)
+        break;
+      if (n != 2)
+        {
+          fprintf (stderr, "short line in '%s':%d\n",
+                   namealiases_filename, lineno);
+          exit (EXIT_FAILURE);
+        }
+      for (; (c = getc (stream)), (c != EOF && c != '\n'); )
+        ; /* Discard the rest of this line.  */
+      uc = strtoul (field0, NULL, 16);
+      if (uc >= 0x110000)
+        {
+          fprintf (stderr, "index too large\n");
+          exit (EXIT_FAILURE);
+        }
+      unicode_aliases[i].name = xstrdup (field1);
+      unicode_aliases[i].uc = uc;
+    }
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", namealiases_filename);
+      exit (EXIT_FAILURE);
+    }
+}
+
/* Perform an exhaustive test of the unicode_character_name function. */
static int
test_name_lookup ()
@@ -246,6 +314,38 @@ test_inverse_lookup ()
return error;
}
+/* Perform a test of the unicode_name_character function for aliases.
+   Returns 0 if every tested alias maps to its code point, 1 otherwise.  */
+static int
+test_alias_lookup ()
+{
+  int error = 0;
+  unsigned int i;
+  char buf[UNINAME_MAX];
+
+  /* Verify all aliases are recognized.  */
+  for (i = 0; i < ALIASLEN; i++)
+    if (unicode_aliases[i].uc != UNINAME_INVALID
+        /* Skip characters without a canonical name (e.g. controls).  */
+        && unicode_character_name (unicode_aliases[i].uc, buf))
+      {
+        unsigned int result = unicode_name_character (unicode_aliases[i].name);
+        if (result != unicode_aliases[i].uc)
+          {
+            if (result == UNINAME_INVALID)
+              fprintf (stderr, "inverse name lookup of \"%s\" failed\n",
+                       unicode_aliases[i].name);
+            else
+              fprintf (stderr,
+                       "inverse name lookup of \"%s\" returned 0x%04X\n",
+                       unicode_aliases[i].name, result);
+            error = 1;
+          }
+      }
+
+  return error;
+}
+
int
main (int argc, char *argv[])
{
@@ -258,5 +358,11 @@ main (int argc, char *argv[])
error |= test_name_lookup ();
error |= test_inverse_lookup ();
+ if (argc > 2)
+ {
+ fill_aliases (argv[2]);
+ error |= test_alias_lookup ();
+ }
+
return error;
}
diff --git a/tests/uniname/test-uninames.sh b/tests/uniname/test-uninames.sh
index f26c275..0e6a018 100755
--- a/tests/uniname/test-uninames.sh
+++ b/tests/uniname/test-uninames.sh
@@ -1,2 +1,2 @@
#!/bin/sh
-exec ./test-uninames${EXEEXT} "$srcdir/uniname/UnicodeDataNames.txt"
+exec ./test-uninames${EXEEXT} "$srcdir/uniname/UnicodeDataNames.txt" "$srcdir/uniname/NameAliases.txt"
--
2.1.0
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [PATCH] uniname/uniname: support character alias,
Daiki Ueno <=