[Emacs-diffs] Changes to emacs/lisp/international/mule.el

emacs-diffs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Emacs-diffs] Changes to emacs/lisp/international/mule.el

From:	Kenichi Handa
Subject:	[Emacs-diffs] Changes to emacs/lisp/international/mule.el
Date:	Wed, 03 Dec 2003 03:24:43 -0500
Index: emacs/lisp/international/mule.el
diff -c emacs/lisp/international/mule.el:1.195 
emacs/lisp/international/mule.el:1.196
*** emacs/lisp/international/mule.el:1.195      Thu Nov 13 21:39:35 2003
--- emacs/lisp/international/mule.el    Wed Dec  3 03:24:42 2003
***************
*** 1330,1341 ****
  ;;; X selections
  
  (defvar ctext-non-standard-encodings-alist
!   '(("ISO8859-15" . iso-8859-15)
!     ("ISO8859-14" . iso-8859-14)
!     ("KOI8-R" . koi8-r)
!     ("BIG5-0" . big5))
!   "Alist of non-standard encoding names vs Emacs coding systems.
! This alist is used to decode an extened segment of a compound text.")
  
  (defvar ctext-non-standard-encodings-regexp
    (string-to-multibyte
--- 1330,1371 ----
  ;;; X selections
  
  (defvar ctext-non-standard-encodings-alist
!   '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2))
!     ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
!     ("ISO8859-15" iso-8859-15 1 latin-iso8859-15))
!   "Alist of non-standard encoding names vs the corresponding usages in CTEXT.
! 
! It controls how extended segments of a compound text are handled
! by the coding system `compound-text-with-extensions'.
! 
! Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET).
! 
! ENCODING-NAME is the encoding name of an \"extended segment\".
! 
! CODING-SYSTEM is the coding-system to encode (or decode) the
! characters into (or from) the extended segment.
! 
! N-OCTET is the number of octets (bytes) that encode a character
! in the segment.  It can be 0 (meaning the number of octets per
! character is variable), 1, 2, 3, or 4.
! 
! CHARSET is a character set containing characters that are encoded
! in the segment.  It can be a list of character sets.  It can also
! be a char-table, in which case characters that have non-nil value
! in the char-table are the target.
! 
! On decoding CTEXT, all encoding names listed here are recognized.
! 
! On encoding CTEXT, encoding names in the variable
! `ctext-non-standard-encodings' (which see) and in the information
! listed for the current language environment under the key
! `ctext-non-standard-encodings' are used.")
! 
! (defvar ctext-non-standard-encodings
!   '("big5-0")
!   "List of non-standard encoding names used in extended segments of CTEXT.
! Each element must be one of the names listed in the variable
! `ctext-non-standard-encodings-alist' (which see).")
  
  (defvar ctext-non-standard-encodings-regexp
    (string-to-multibyte
***************
*** 1347,1359 ****
      "\\(\e%G[^\e]*\e%@\\)")))
  
  ;; Functions to support "Non-Standard Character Set Encodings" defined
! ;; by the COMPOUND-TEXT spec.
! ;; We support that by decoding the whole data by `ctext' which just
! ;; pertains byte sequences belonging to ``extended segment'', then
! ;; decoding those byte sequences one by one in Lisp.
! ;; This function also supports "The UTF-8 encoding" described in the
! ;; section 7 of the documentation fo COMPOUND-TEXT distributed with
! ;; XFree86.
  
  (defun ctext-post-read-conversion (len)
    "Decode LEN characters encoded as Compound Text with Extended Segments."
--- 1377,1385 ----
      "\\(\e%G[^\e]*\e%@\\)")))
  
  ;; Functions to support "Non-Standard Character Set Encodings" defined
! ;; by the COMPOUND-TEXT spec.  They also support "The UTF-8 encoding"
! ;; described in the section 7 of the documentation of COMPOUND-TEXT
! ;; distributed with XFree86.
  
  (defun ctext-post-read-conversion (len)
    "Decode LEN characters encoded as Compound Text with Extended Segments."
***************
*** 1365,1371 ****
            pos bytes)
        (or in-workbuf
            (narrow-to-region (point) (+ (point) len)))
-       (decode-coding-region (point-min) (point-max) 'ctext)
        (if in-workbuf
            (set-buffer-multibyte t))
        (while (re-search-forward ctext-non-standard-encodings-regexp
--- 1391,1396 ----
***************
*** 1376,1386 ****
              (let* ((M (char-after (+ pos 4)))
                     (L (char-after (+ pos 5)))
                     (encoding (match-string 2))
!                    (coding (or (cdr (assoc-ignore-case 
!                                      encoding
!                                      ctext-non-standard-encodings-alist))
!                                (coding-system-p
!                                 (intern (downcase encoding))))))
                (setq bytes (- (+ (* (- M 128) 128) (- L 128))
                               (- (point) (+ pos 6))))
                (when coding
--- 1401,1414 ----
              (let* ((M (char-after (+ pos 4)))
                     (L (char-after (+ pos 5)))
                     (encoding (match-string 2))
!                    (encoding-info (assoc-ignore-case 
!                                    encoding
!                                    ctext-non-standard-encodings-alist))
!                    (coding (if encoding-info
!                                (nth 1 encoding-info)
!                              (setq encoding (intern (downcase encoding)))
!                              (and (coding-system-p encoding)
!                                   encoding))))
                (setq bytes (- (+ (* (- M 128) 128) (- L 128))
                               (- (point) (+ pos 6))))
                (when coding
***************
*** 1388,1453 ****
                  (forward-char bytes)
                  (decode-coding-region (- (point) bytes) (point) coding)))
            ;; ESC % G --UTF-8-BYTES-- ESC % @
!           (setq bytes (- (point) pos))
!           (decode-coding-region (- (point) bytes) (point) 'utf-8))))
        (goto-char (point-min))
        (- (point-max) (point)))))
  
! ;; From X registry 2001/06/01
! ;; 20. NON-STANDARD CHARACTER SET ENCODINGS
! 
! ;; See Section 6 of the Compound Text standard.
! 
! ;; Name                                               Reference
! ;; ----                                               ---------
! ;; "DEC.CNS11643.1986-2"                              [53]
! ;;    CNS11643 2-plane using the recommended
! ;;    internal representation scheme
! ;; "DEC.DTSCS.1990-2"                         [54]
! ;;    DEC Taiwan Supplemental Character Set
! ;; "fujitsu.u90x03"                           [87]
! ;; "ILA"                                              [62]
! ;;    registry prefix
! ;; "IPSYS"                                            [59]
! ;;    registry prefix
! ;; "omron_UDC"                                        [45]
! ;;         omron User Defined Charset
! ;; "omron_UDC_ja"                                     [45]
! ;;         omron User Defined Charset for Japanese
! ;; "omron_UDC_zh"                                     [45]
! ;;         omron User Defined Charset for Chinese(Main land)
! ;; "omron_UDC_tw"                                     [45]
! ;;         omron User Defined Charset for Chinese(Taiwan)
! 
! ;; If you add charsets here, be sure to modify the regexp used by
! ;; ctext-pre-write-conversion to look up non-standard charsets.
! (defvar ctext-non-standard-designations-alist
!   '(("$(0" . (big5 "big5-0" 2))
!     ("$(1" . (big5 "big5-0" 2))
!     ;; The following are actually standard; generating extended
!     ;; segments for them is wrong and screws e.g. Latin-9 users.
!     ;; 8859-{10,13,16} aren't Emacs charsets anyhow.  -- fx
! ;;     ("-V"  . (t "iso8859-10" 1))
! ;;     ("-Y"  . (t "iso8859-13" 1))
! ;;     ("-_"  . (t "iso8859-14" 1))
! ;;     ("-b"  . (t "iso8859-15" 1))
! ;;     ("-f"  . (t "iso8859-16" 1))
!     )
!   "Alist of ctext control sequences that introduce character sets which
! are not in the list of approved encodings, and the corresponding
! coding system, identifier string, and number of octets per encoded
! character.
! 
! Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
! is the control sequence (sans the leading ESC) that introduces the character
! set in the text encoded by compound-text.  ENCODING is a coding system
! symbol; if it is t, it means that the ctext coding system already encodes
! the text correctly, and only the leading control sequence needs to be altered.
! If ENCODING is a coding system, we need to re-encode the text with that
! coding system.  CHARSET is the name of the charset we need to put into
! the leading control sequence.  NOCTETS is the number of octets (bytes) that
! encode each character in this charset.  NOCTETS can be 0 (meaning the number
! of octets per character is variable), 1, 2, 3, or 4.")
  
  (defun ctext-pre-write-conversion (from to)
    "Encode characters between FROM and TO as Compound Text w/Extended Segments.
--- 1416,1454 ----
                  (forward-char bytes)
                  (decode-coding-region (- (point) bytes) (point) coding)))
            ;; ESC % G --UTF-8-BYTES-- ESC % @
!           (delete-char -3)
!           (delete-region pos (+ pos 3))
!           (decode-coding-region pos (point) 'utf-8))))
        (goto-char (point-min))
        (- (point-max) (point)))))
  
! ;; Return a char table of extended segment usage for each character.
! ;; Each value of the char table is nil, one of the elements of
! ;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'.
! 
! (defun ctext-non-standard-encodings-table ()
!   (let ((table (make-char-table 'translation-table)))
!     (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8)
!     (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8)
!     (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8)
!     (dolist (encoding (reverse
!                      (append
!                       (get-language-info current-language-environment
!                                          'ctext-non-standard-encodings)
!                       ctext-non-standard-encodings)))
!       (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
!            (charset (nth 3 slot)))
!       (if charset
!           (cond ((charsetp charset)
!                  (aset table (make-char charset) slot))
!                 ((listp charset)
!                  (dolist (elt charset)
!                    (aset table (make-char elt) slot)))
!                 ((char-table-p charset)
!                  (map-char-table #'(lambda (k v) 
!                                  (if (and v (> k 128)) (aset table k slot)))
!                                  charset))))))
!     table))
  
  (defun ctext-pre-write-conversion (from to)
    "Encode characters between FROM and TO as Compound Text w/Extended Segments.
***************
*** 1470,1516 ****
             (insert-buffer-substring buf from to))))
  
      ;; Now we can encode the whole buffer.
!     (let ((case-fold-search nil)
          last-coding-system-used
!         pos posend desig encode-info encoding chset noctets textlen)
!       (goto-char (point-min))
!       ;; At first encode the whole buffer.
!       (encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
!       ;; Then replace ISO-2022 charset designations with extended
!       ;; segments, for those charsets that are not part of the
!       ;; official X registry.  The regexp below finds the leading
!       ;; sequences for big5.
!       (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
!       (setq pos (match-beginning 0)
!             posend (point)
!             desig (match-string 1)
!             encode-info (cdr (assoc desig
!                                     ctext-non-standard-designations-alist))
!             encoding (car encode-info)
!             chset (cadr encode-info)
!             noctets (car (cddr encode-info)))
!       (skip-chars-forward "^\e")
!       (cond
!        ((eq encoding t)  ; only the leading sequence needs to be changed
!         (setq textlen (+ (- (point) posend) (length chset) 1))
!         ;; Generate the control sequence for an extended segment.
!         (replace-match (format "\e%%/%d%c%c%s"
!                                noctets
!                                (+ (/ textlen 128) 128)
!                                (+ (% textlen 128) 128)
!                                chset)
!                        t t))
!        ((coding-system-p encoding) ; need to recode the entire segment...
!         (decode-coding-region pos (point) 'ctext-no-compositions)
!         (encode-coding-region pos (point) encoding)
!         (setq textlen (+ (- (point) pos) (length chset) 1))
!         (save-excursion
!           (goto-char pos)
!           (insert (format "\e%%/%d%c%c%s"
!                           noctets
!                           (+ (/ textlen 128) 128)
!                           (+ (% textlen 128) 128)
!                           chset))))))
        (goto-char (point-min))))
    ;; Must return nil, as build_annotations_2 expects that.
    nil)
--- 1471,1526 ----
             (insert-buffer-substring buf from to))))
  
      ;; Now we can encode the whole buffer.
!     (let ((encoding-table (ctext-non-standard-encodings-table))
          last-coding-system-used
!         last-pos last-encoding-info
!         encoding-info end-pos)
!       (goto-char (setq last-pos (point-min)))
!       (setq end-pos (point-marker))
!       (while (re-search-forward "[^\000-\177]+" nil t)
!       ;; Found a sequence of non-ASCII characters.
!       (setq last-pos (match-beginning 0)
!             last-encoding-info (aref encoding-table (char-after last-pos)))
!       (set-marker end-pos (match-end 0))
!       (goto-char (1+ last-pos))
!       (catch 'tag
!         (while t
!           (setq encoding-info
!                 (if (< (point) end-pos)
!                     (aref encoding-table (following-char))))
!           (unless (eq last-encoding-info encoding-info)
!             (cond ((consp last-encoding-info)
!                    ;; Encode the previous range using an extended
!                    ;; segment.
!                    (let ((encoding-name (car last-encoding-info))
!                          (coding-system (nth 1 last-encoding-info))
!                          (noctets (nth 2 last-encoding-info))
!                          len)
!                      (encode-coding-region last-pos (point) coding-system)
!                      (setq len (+ (length encoding-name) 1
!                                   (- (point) last-pos)))
!                      (save-excursion
!                        (goto-char last-pos)
!                        (insert (string-to-multibyte 
!                                 (format "\e%%/%d%c%c%s"
!                                         noctets
!                                         (+ (/ len 128) 128)
!                                         (+ (% len 128) 128)
!                                         encoding-name))))))
!                   ((eq last-encoding-info 'utf-8)
!                    ;; Encode the previous range using UTF-8 encoding
!                    ;; extention.
!                    ;; extension.
!                    (save-excursion
!                      (goto-char last-pos)
!                      (insert "\e%G"))
!                    (insert "\e%@")))
!             (setq last-pos (point)
!                   last-encoding-info encoding-info))
!           (if (< (point) end-pos)
!               (forward-char 1)
!             (throw 'tag nil)))))
!       (set-marker end-pos nil)
        (goto-char (point-min))))
    ;; Must return nil, as build_annotations_2 expects that.
    nil)
[Prev in Thread]
Current Thread
[Next in Thread]
[Emacs-diffs] Changes to emacs/lisp/international/mule.el, Kenichi Handa <=
Prev by Date: [Emacs-diffs] Changes to emacs/lisp/international/mule-conf.el
Next by Date: [Emacs-diffs] Changes to emacs/lisp/language/cyrillic.el
Previous by thread: [Emacs-diffs] Changes to emacs/lisp/international/mule-conf.el
Next by thread: [Emacs-diffs] Changes to emacs/lisp/language/cyrillic.el
Index(es):
- Date
- Thread