enhancements to utf-8.el

gnu-emacs-sources

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

enhancements to utf-8.el

From:	Dave Love
Subject:	enhancements to utf-8.el
Date:	28 Oct 2001 17:21:45 +0000
User-agent:	Gnus/5.09 (Gnus v5.9.0) Emacs/21.0.107

These changes allow Emacs 21.1's utf-8 coding system to encode extra
iso-8859 charsets using the previously-posted ucs-tables.el.  That
means that you can save a buffer as utf-8 that contains, for example,
non-ASCII text yanked from a Latin-2 buffer.

They also implement composing untranslatable utf-8 sequences into a
single displayed character; a tooltip says which Unicode character it
stands in for.  If a table of substitutions is available, it can be
used to display such sequences as the correct character.  I have one,
not yet posted.

Index: utf-8.el
===================================================================
RCS file: /cvs/emacs/lisp/international/utf-8.el,v
retrieving revision 1.9
diff -u -p -r1.9 utf-8.el
--- utf-8.el    2001/07/16 12:22:59     1.9
+++ utf-8.el    2001/10/28 16:33:04
@@ -64,6 +64,7 @@
   ;;         ascii          |       1        |       1
   ;; -----------------------+----------------+---------------
   ;;    eight-bit-control   |       2        |       2
+  ;;    eight-bit-graphic   |       2        |       1
   ;;     latin-iso8859-1    |       2        |       2
   ;; -----------------------+----------------+---------------
   ;; mule-unicode-0100-24ff |       2        |       4
@@ -228,7 +229,8 @@ characters.")
      (loop
       (if (r5 < 0)
          ((r1 = -1)
-          (read-multibyte-character r0 r1))
+          (read-multibyte-character r0 r1)
+          (translate-character ucs-mule-8859-to-mule-unicode r0 r1))
        (;; We have already done read-multibyte-character.
         (r0 = r5)
         (r1 = r6)
@@ -340,26 +342,96 @@ Only characters from the charsets ascii,
 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
 Others are encoded as U+FFFD.")
 
+;; Dummy definition so that the CCL can be checked correctly; the
+;; actual data are loaded on demand.  The table is referred to by
+;; name in the CCL `translate-character' call.
+(define-translation-table 'ucs-mule-8859-to-mule-unicode)
+
+(defsubst utf-8-untranslated-to-ucs ()
+  "Return the UCS code for the untranslated utf-8 sequence at point.
+Only 3- and 4-byte sequences are handled.  Return nil if fewer than
+three characters follow point, or if a 4-byte sequence is cut short."
+  (let ((b1 (char-after))
+        (b2 (char-after (1+ (point))))
+        (b3 (char-after (+ 2 (point))))
+        ;; The fourth byte of the sequence is at offset 3 from point.
+        (b4 (char-after (+ 3 (point)))))
+    (if (and b1 b2 b3)
+        (cond ((< b1 ?\xf0)
+               ;; 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
+               (setq b2 (lsh (logand b2 ?\x3f) 6))
+               (setq b3 (logand b3 ?\x3f))
+               (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
+              (b4
+               ;; 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+               (setq b2 (lsh (logand b2 ?\x3f) 12))
+               (setq b3 (lsh (logand b3 ?\x3f) 6))
+               (setq b4 (logand b4 ?\x3f))
+               (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
+                                                     18)))))))))
+
+(defun utf-8-help-echo (window object position)
+  "Return tooltip text for the untranslated utf-8 at POSITION in OBJECT.
+The code point is taken from the `untranslated-utf-8' property found
+there.  WINDOW is not used."
+  (let ((ucs (get-char-property position 'untranslated-utf-8 object)))
+    (format "Untranslated Unicode U+%04X" ucs)))
+
+(defvar utf-8-subst-table nil
+  "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
+
+;; We compose the untranslatable sequences into a single character.
+;; This is infelicitous for editing, because there's currently no
+;; mechanism for treating compositions as atomic, but is OK for
+;; display.  We try to compose an appropriate character from a hash
+;; table of CJK characters to display correctly.  Otherwise we use
+;; U+FFFD.  What we really should have is hash table lookup from CCL
+;; so that we could do this properly.  This function GCs too much.
+(defsubst utf-8-compose ()
+  "Put a suitable composition on an untranslatable sequence.
+Return the sequence's length."
+  (let* ((u (utf-8-untranslated-to-ucs))
+        (l (and u (if (>= u ?\x10000)
+                      4
+                    3)))
+        (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
+    (when u
+      (put-text-property (point) (min (point-max) (+ l (point)))
+                        'untranslated-utf-8 u)
+      (unless subst
+         (put-text-property (point) (min (point-max) (+ l (point)))
+                            'help-echo 'utf-8-help-echo)
+         (setq subst ?$,3u=(B))
+      (compose-region (point) (+ l (point)) subst)
+      l)))
+
+(defun utf-8-post-read-conversion (length)
+  "Compose untranslated utf-8 sequences into single characters.
+Only the LENGTH characters following point (the text just decoded)
+are examined.  Return LENGTH."
+  (save-excursion
+    (save-restriction
+      ;; Confine the scan to the newly-read text so that text already
+      ;; in the buffer after the insertion is left alone.
+      (narrow-to-region (point) (+ (point) length))
+      (while (and (skip-chars-forward
+                   (eval-and-compile      ; missing optimization
+                     (string-as-multibyte "^\341-\377")))
+                  (not (eobp)))
+        ;; `utf-8-compose' returns nil when it cannot decode a
+        ;; sequence; `forward-char' then advances by one character.
+        (forward-char (utf-8-compose)))))
+  length)
+
 (make-coding-system
  'mule-utf-8 4 ?u
  "UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are:
+The supported Emacs character sets are the following, determined by the
+translation table `ucs-mule-8859-to-mule-unicode':
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
+   latin-iso8859-2
+   latin-iso8859-3
+   latin-iso8859-4
+   cyrillic-iso8859-5
+   greek-iso8859-7
+   hebrew-iso8859-8
+   latin-iso8859-9
+   latin-iso8859-14
+   latin-iso8859-15
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff
 
 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
 are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences.  Emacs characters out of
-these ranges are encoded into U+FFFD.
-
-Note that, currently, characters in the mule-unicode charsets have no
-syntax and case information.  Thus, for instance, upper- and
-lower-casing commands won't work with them."
+characters to preserve their byte sequences and composed to behave as
+a single character when editing.  Emacs characters out of these ranges
+are encoded into U+FFFD."
 
  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
  '((safe-charsets
@@ -367,12 +439,22 @@ lower-casing commands won't work with th
     eight-bit-control
     eight-bit-graphic
     latin-iso8859-1
+    latin-iso8859-15
+    latin-iso8859-14
+    latin-iso8859-9
+    hebrew-iso8859-8
+    greek-iso8859-7
+    cyrillic-iso8859-5
+    latin-iso8859-4
+    latin-iso8859-3
+    latin-iso8859-2
     mule-unicode-0100-24ff
     mule-unicode-2500-33ff
     mule-unicode-e000-ffff)
    (mime-charset . utf-8)
    (coding-category . coding-category-utf-8)
-   (valid-codes (0 . 255))))
+   (valid-codes (0 . 255))
+   (post-read-conversion . utf-8-post-read-conversion)))
 
 (define-coding-system-alias 'utf-8 'mule-utf-8)

[Prev in Thread]

Current Thread

[Next in Thread]

enhancements to utf-8.el, Dave Love <=

Prev by Date: Georgian and extra Cyrillic language environments
Next by Date: ebackup.el
Previous by thread: Georgian and extra Cyrillic language environments
Next by thread: ebackup.el
Index(es):
- Date
- Thread