--- Begin Message ---
Subject: |
Chinese GBK encoding support requested |
Date: |
28 Oct 2001 23:46:46 +0800 |
User-agent: |
Gnus/5.09 (Gnus v5.9.0) Emacs/21.1 |
Hi,
Is there any plan to support Chinese GBK encoding? This coding is
currently used widely on Simplified Chinese platforms and is the
de facto encoding scheme in use (at least until Unicode-based codings
finally gain popularity). GBK covers a larger charset than
GB2312, including the name of the Chinese Prime Minister, Zhu Rong-Ji,
which cannot be encoded in GB2312.
Unfortunately, Emacs currently only supports GB2312, BIG5, and some
other (rarely used?) CNS codings, so it is inconvenient for
Chinese processing.
I have made a 'dirty' patch to deal with GBK encoding, which I think
might be useful if the developers decide to add the support. It's
'dirty' because it disables a charset, chinese-cns11643-7, to define
my own ones. In fact, since GBK is a superset of GB2312, I can reuse
chinese-gb2312, but I need to define two more charsets and there is
only one unused charset left.
After that, I implement the encoder and decoder functions:
ccl-decode-gbk-char, ccl-encode-gbk-char, and ccl-encode-gbk-font.
So the key components are there.
The patch works fine on my system, though I'm not very sure about the
use of charset category, etc. Also, I think it'd be better if we can
make it without disabling any existing charset. A MULE guru may be
able to help and find out what's being left out or incorrect in my
patch.
Any comments are welcome.
Yong LU
Part I: Patch to disable chinese-cns11643-7
---------8<---------8<---------o--------->8--------->8--------->8------
diff -subr emacs-21.0.104/lisp/gnus/mm-util.el
emacs-21.0.104.gbk/lisp/gnus/mm-util.el
--- emacs-21.0.104/lisp/gnus/mm-util.el Tue Mar 6 18:32:07 2001
+++ emacs-21.0.104.gbk/lisp/gnus/mm-util.el Wed Aug 22 14:58:28 2001
@@ -74,8 +74,8 @@
korean-ksc5601 japanese-jisx0212
chinese-cns11643-1 chinese-cns11643-2
chinese-cns11643-3 chinese-cns11643-4
- chinese-cns11643-5 chinese-cns11643-6
- chinese-cns11643-7)
+ chinese-cns11643-5 chinese-cns11643-6)
+;; chinese-cns11643-7)
;; utf-8 comes either from Mule-UCS or Mule 5+.
,@(if (mm-coding-system-p 'utf-8)
(list (cons 'utf-8 (delete 'ascii
diff -subr emacs-21.0.104/lisp/international/characters.el
emacs-21.0.104.gbk/lisp/international/characters.el
--- emacs-21.0.104/lisp/international/characters.el Fri Mar 9 18:23:38 2001
+++ emacs-21.0.104.gbk/lisp/international/characters.el Wed Aug 22 15:00:16 2001
@@ -167,8 +167,8 @@
chinese-cns11643-3
chinese-cns11643-4
chinese-cns11643-5
- chinese-cns11643-6
- chinese-cns11643-7))
+ chinese-cns11643-6))
+;; chinese-cns11643-7))
generic-char)
(while cns-list
(setq generic-char (make-char (car cns-list)))
@@ -802,7 +802,7 @@
(chinese-cns11643-4 . iso-2022-cn)
(chinese-cns11643-5 . iso-2022-cn)
(chinese-cns11643-6 . iso-2022-cn)
- (chinese-cns11643-7 . iso-2022-cn)
+;; (chinese-cns11643-7 . iso-2022-cn)
(indian-2-column . devanagari)
(tibetan . tibetan)
(latin-iso8859-14 . iso-latin-8)
diff -subr emacs-21.0.104/lisp/international/fontset.el
emacs-21.0.104.gbk/lisp/international/fontset.el
--- emacs-21.0.104/lisp/international/fontset.el Mon Feb 26 16:59:42 2001
+++ emacs-21.0.104.gbk/lisp/international/fontset.el Wed Aug 22 15:00:10 2001
@@ -60,7 +60,7 @@
(chinese-cns11643-4 . ("*" . "CNS11643.1992-4"))
(chinese-cns11643-5 . ("*" . "CNS11643.1992-5"))
(chinese-cns11643-6 . ("*" . "CNS11643.1992-6"))
- (chinese-cns11643-7 . ("*" . "CNS11643.1992-7"))
+;; (chinese-cns11643-7 . ("*" . "CNS11643.1992-7"))
(chinese-big5-1 . ("*" . "Big5"))
(chinese-big5-2 . ("*" . "Big5"))
(chinese-sisheng . (nil . "sisheng_cwnn"))
@@ -514,8 +514,8 @@
chinese-cns11643-3:-*-medium-r-normal-*-16-*-cns11643*-3,
chinese-cns11643-4:-*-medium-r-normal-*-16-*-cns11643*-4,
chinese-cns11643-5:-*-medium-r-normal-*-16-*-cns11643*-5,
- chinese-cns11643-6:-*-medium-r-normal-*-16-*-cns11643*-6,
- chinese-cns11643-7:-*-medium-r-normal-*-16-*-cns11643*-7")
+ chinese-cns11643-6:-*-medium-r-normal-*-16-*-cns11643*-6,")
+;; chinese-cns11643-7:-*-medium-r-normal-*-16-*-cns11643*-7")
"String of fontset spec of the standard fontset.
You have the biggest chance to display international characters
with correct glyphs by using the standard fontset.
diff -subr emacs-21.0.104/lisp/international/mule-conf.el
emacs-21.0.104.gbk/lisp/international/mule-conf.el
--- emacs-21.0.104/lisp/international/mule-conf.el Fri Mar 9 18:23:38 2001
+++ emacs-21.0.104.gbk/lisp/international/mule-conf.el Wed Aug 22 15:01:12 2001
@@ -236,9 +236,9 @@
(define-charset 249 'chinese-cns11643-6
[2 94 2 0 ?L 0 "CNS11643-6" "CNS11643-6 (Chinese traditional): ISO-IR-186"
"CNS11643 Plane 6 Chinese Traditional: ISO-IR-186"])
-(define-charset 250 'chinese-cns11643-7
- [2 94 2 0 ?M 0 "CNS11643-7" "CNS11643-7 (Chinese traditional): ISO-IR-187"
- "CNS11643 Plane 7 Chinese Traditional: ISO-IR-187"])
+;; (define-charset 250 'chinese-cns11643-7
+;; [2 94 2 0 ?M 0 "CNS11643-7" "CNS11643-7 (Chinese traditional): ISO-IR-187"
+;; "CNS11643 Plane 7 Chinese Traditional: ISO-IR-187"])
;; Actual Glyph for 2-column width.
(define-charset 251 'indian-2-column
@@ -373,13 +373,13 @@
(nil korean-ksc5601 chinese-gb2312 chinese-cns11643-1 t)
(nil chinese-cns11643-2)
(nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
- chinese-cns11643-6 chinese-cns11643-7)
+ chinese-cns11643-6) ;; chinese-cns11643-7)
short ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil
init-bol)
'((safe-charsets ascii japanese-jisx0208 japanese-jisx0208-1978 latin-jisx0201
korean-ksc5601 chinese-gb2312 chinese-cns11643-1
chinese-cns11643-2 chinese-cns11643-3 chinese-cns11643-4
- chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7)
+ chinese-cns11643-5 chinese-cns11643-6) ;; chinese-cns11643-7)
(composition . t)))
(define-coding-system-alias 'iso-2022-cjk 'iso-2022-7bit-lock-ss2)
diff -subr emacs-21.0.104/lisp/language/chinese.el
emacs-21.0.104.gbk/lisp/language/chinese.el
--- emacs-21.0.104/lisp/language/chinese.el Wed Jan 24 22:50:08 2001
+++ emacs-21.0.104.gbk/lisp/language/chinese.el Wed Aug 22 14:57:29 2001
@@ -54,12 +54,12 @@
(nil chinese-gb2312 chinese-cns11643-1)
(nil chinese-cns11643-2)
(nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
- chinese-cns11643-6 chinese-cns11643-7)
+ chinese-cns11643-6) ;; chinese-cns11643-7)
nil ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil
init-bol)
'((safe-charsets ascii chinese-gb2312 chinese-cns11643-1 chinese-cns11643-2
chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
- chinese-cns11643-6 chinese-cns11643-7)
+ chinese-cns11643-6) ;; chinese-cns11643-7)
(mime-charset . iso-2022-cn-ext)))
@@ -171,8 +171,8 @@
(set-language-info-alist
"Chinese-CNS" '((charset chinese-cns11643-1 chinese-cns11643-2
chinese-cns11643-3 chinese-cns11643-4
- chinese-cns11643-5 chinese-cns11643-6
- chinese-cns11643-7)
+ chinese-cns11643-5 chinese-cns11643-6)
+;; chinese-cns11643-7)
(coding-system iso-2022-cn)
(coding-priority iso-2022-cn chinese-big5 chinese-iso-8bit)
(features china-util)
diff -subr emacs-21.0.104/lisp/ps-mule.el emacs-21.0.104.gbk/lisp/ps-mule.el
--- emacs-21.0.104/lisp/ps-mule.el Thu Apr 5 17:45:08 2001
+++ emacs-21.0.104.gbk/lisp/ps-mule.el Wed Aug 22 14:56:40 2001
@@ -411,8 +411,8 @@
(normal bdf ("cns5-40.bdf" "cns-5-40.bdf") ps-mule-encode-7bit 2))
(chinese-cns11643-6
(normal bdf ("cns6-40.bdf" "cns-6-40.bdf") ps-mule-encode-7bit 2))
- (chinese-cns11643-7
- (normal bdf ("cns7-40.bdf" "cns-7-40.bdf") ps-mule-encode-7bit 2))
+;; (chinese-cns11643-7
+;; (normal bdf ("cns7-40.bdf" "cns-7-40.bdf") ps-mule-encode-7bit 2))
(indian-2-column
(normal bdf ("ind24-mule.bdf" "mule-indian-24.bdf") ps-mule-encode-7bit
2))
(tibetan
---------8<---------8<---------o--------->8--------->8--------->8------
Part II: GBK encoders and decoders
---------8<---------8<---------o--------->8--------->8--------->8------
;;; gbk.el --- Support for Chinese GBK
;;
;; Author: Yong Lu <l y o n g u @ y a h o o . c o m>
;;
;; Date: Aug 22, 2001
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License along with
;; CCE; see the file COPYING. If not, write to the Free Software Foundation,
;; 675 Mass Ave, Cambridge, MA 02139, USA.
;;; Commentary:
;; GBK includes the character set GB2312
;; Please refer to mule-conf.el and other source files in
;; ./lisp/{language,international} and ./src for more information.
;;
;;
;; define gbk specific charsets.
;;
;; ;;
;; ;; chinese-gbk-0:
;; ;; Range Words Marks
;; ;; -------------------------------------------------------
;; ;; A1A1-A9FE 846 GB2312, GB12345 (GBK/1)
;; ;; AAA1-AFFE 564 User defined 1
;; ;; B0A1-F7FE 6768 GB2312 (GBK/2)
;; ;; F8A1-FEFE 658 User defined 2
;; ;;
;; (define-charset 249 'chinese-gbk-0
;; [2 94 2 0 ?0 0 "GBK (Level-1)" "GBK (Level-1) A1A1-F7FE"
;; "GB2312, GB12345, Big5 and Symbols Part (A1A1-F7FE) of GBK (Chinese Simplified)"])
;;
;; chinese-gbk-1:
;; Range Words Marks
;; ------------------------------------------------------
;; 8140-A0FE 6080 GB13000 (GBK/3)
;;
;; Register the charset that carries the GBK/3 region (lead bytes
;; 0x81-0xA0).  It takes charset id 250, which Part I of this patch
;; frees by disabling chinese-cns11643-7.
;; The long name used to read "GBK (Level-1) ...", contradicting the
;; "GBK (Level-2)" short name; both names now say Level-2.
(define-charset 250 'chinese-gbk-1
  [2 94 2 0 ?1 0 "GBK (Level-2)" "GBK (Level-2) 8140-A0FE"
     "GB13000 Part 1 (8140-A0FE) of GBK (Chinese Simplified)"])
;;
;; chinese-gbk-2:
;; Range Words Marks
;; -----------------------------------------------------
;;
;; A140-A7A0 672 User defined 3
;; A840-A940 192 Big5, Symbols (GBK/5)
;; AA40-FEA0 8160 GB13000 (GBK/4)
;;
;; Register the charset for GBK byte pairs whose lead byte is
;; 0xA1-0xFE and whose trail byte is 0x40-0xA0 (0x7F excluded).
;; The decoder and encoder in this file map those bytes onto
;; positions 32..127 in each dimension, hence the 96 in the vector.
;; NOTE(review): id 253 is presumably the single unused charset id
;; mentioned in the cover letter -- confirm against mule-conf.el.
(define-charset 253 'chinese-gbk-2
[2 96 2 0 ?2 0 "GBK (Level-3)" "GBK (Level-3) A140-FEA0"
"GB13000 Part 2 (A140-FEA0) of GBK (Chinese Simplified)"])
;;
;; Include user-define regions???
;;
;;
;; Modify charset categories. Improve it later.
;;
;; Give the generic character of each GBK-only charset word syntax
;; ("w") and the categories `c', `C' and `|'.
;; The chinese-gbk-0 entries of the earlier draft are left out because
;; that charset is not defined; chinese-gb2312 is reused in its place.
(dolist (gbk-charset '(chinese-gbk-1 chinese-gbk-2))
  (let ((generic-char (make-char gbk-charset)))
    (modify-syntax-entry generic-char "w")
    (dolist (category '(?c ?C ?\|))
      (modify-category-entry generic-char category))))
;;
;; Coding category. Any use?
;;
;;(setq coding-category-gbk 'chinese-gbk)
;; (cons (gbk
;; (ascii chinese-gbk-0 chinese-gbk-1 chinese-gbk-2)
;; ccl-decode-gbk-char
;; ((32 127)
;; ((?\x81 ?\xFE) . (?\x40 ?\x7E ?\x80 ?\xFE))))
;; non-iso-charset-alist)
;;=============================;;
;;
;; ccl coder/decoder for gbk
;;
;;=============================;;
;; CCL program that decodes a GBK byte stream.
;;
;; Registers:
;;   r0  current (lead) byte, later the character code to emit
;;   r1  trail byte
;;   r2  linear index inside chinese-gbk-1
;;   r3  charset id of eight-bit-control
;;   r4  charset id of eight-bit-graphic
;;   r5  charset id of the character being emitted
;;
;; Byte layout handled:
;;   00-7F                 written through unchanged
;;   80, FF                not GBK lead bytes; emitted as raw eight-bit
;;   81-A0 + 40-FE (!=7F)  chinese-gbk-1 (190 trail values per lead,
;;                         re-packed into 94x94 positions)
;;   A1-FE + 40-A0 (!=7F)  chinese-gbk-2 (positions 32..127)
;;   A1-FE + A1-FE         chinese-gb2312
;; A lead byte followed by an invalid trail byte (below 40, 7F or FF)
;; is emitted as a raw eight-bit character and the trail byte is then
;; re-examined as a possible lead byte.
;;
;; Fix: lead byte FF used to be treated as a valid lead byte, which
;; could produce row 127 in the 94-position chinese-gb2312 charset.
(define-ccl-program ccl-decode-gbk-char
  `(2
    ((r3 = ,(charset-id 'eight-bit-control))
     (r4 = ,(charset-id 'eight-bit-graphic))
     (loop
      (read r0)
      (loop
       ;; Single-byte cases: ASCII, and the two bytes (80, FF) that
       ;; can never start a GBK two-byte sequence.
       (if (r0 < #x80)
           ((write r0)
            (break))
         (if (r0 == #x80)
             ((write-multibyte-character r3 r0)
              (break))
           (if (r0 == #xff)
               ((write-multibyte-character r4 r0)
                (break))
             (r0 = r0))))
       ;; Possibly a two-byte sequence: fetch the trail byte.
       (read r1)
       (if (r0 < #xa1)
           ;; Lead byte 81-A0.
           (if (r1 < #x40)
               ;; Invalid trail byte: emit the lead byte raw (80-9F is
               ;; eight-bit-control, A0 is eight-bit-graphic) and
               ;; reprocess the trail byte as a new lead byte.
               ((if (r0 < #xa0)
                    (write-multibyte-character r3 r0)
                  (write-multibyte-character r4 r0))
                (r0 = r1)
                (repeat))
             (if (r1 == #x7f)
                 ;; Invalid trail byte, same recovery as above.
                 ((if (r0 < #xa0)
                      (write-multibyte-character r3 r0)
                    (write-multibyte-character r4 r0))
                  (r0 = r1)
                  (repeat))
               (if (r1 == #xff)
                   ;; Invalid trail byte, same recovery as above.
                   ((if (r0 < #xa0)
                        (write-multibyte-character r3 r0)
                      (write-multibyte-character r4 r0))
                    (r0 = r1)
                    (repeat))
                 ;; chinese-gbk-1: close the gap left by 7F so the
                 ;; trail byte becomes 0..189, build a linear index,
                 ;; then split it into two 94-based positions.
                 ((if (r1 < #x80)
                      (r1 -= #x40)
                    (r1 -= #x41))
                  (r2 = (((r0 - #x81) * 190) + r1))
                  (r0 = (((r2 / 94) + 33) << 7))
                  (r0 += ((r2 % 94) + 33))
                  (r5 = ,(charset-id 'chinese-gbk-1))
                  (write-multibyte-character r5 r0)
                  (break)))))
         ;; Lead byte A1-FE.
         ((if (r1 < #x40)
              ;; Invalid trail byte: emit the lead byte raw and
              ;; reprocess the trail byte.
              ((write-multibyte-character r4 r0)
               (r0 = r1)
               (repeat))
            (if (r1 == #x7f)
                ;; Invalid trail byte, same recovery as above.
                ((write-multibyte-character r4 r0)
                 (r0 = r1)
                 (repeat))
              (if (r1 == #xff)
                  ;; Invalid trail byte, same recovery as above.
                  ((write-multibyte-character r4 r0)
                   (r0 = r1)
                   (repeat))
                (if (r1 <= #xa0)
                    ;; chinese-gbk-2: trail 40-A0 with the 7F gap
                    ;; closed, mapped onto positions 32..127.
                    ((if (r1 >= #x80)
                         (r1 -= 1)
                       (r1 = r1))
                     (r0 = (((r0 - #xa0) + 32) << 7))
                     (r0 += ((r1 - #x40) + 32))
                     (r5 = ,(charset-id 'chinese-gbk-2))
                     (write-multibyte-character r5 r0)
                     (break))
                  ;; chinese-gb2312: both bytes are A1-FE.
                  ((r0 = (((r0 - #xa0) + 32) << 7))
                   (r0 += ((r1 - #xa0) + 32))
                   (r5 = ,(charset-id 'chinese-gb2312))
                   (write-multibyte-character r5 r0)
                   (break)))))))))
      (repeat))))
  "CCL GBK decoder.")
;; CCL program that encodes Emacs characters as GBK bytes.
;;
;; Registers:
;;   r0  charset id of the character read
;;   r1  character code (first position in bits 7 and up, second
;;       position in the low 7 bits), later the lead byte
;;   r2  trail byte
;;   r3  linear index inside chinese-gbk-1
;;
;; ASCII and raw eight-bit characters are written as one byte.
;; chinese-gb2312, chinese-gbk-1 and chinese-gbk-2 are written as two
;; bytes using the inverse of the mapping in `ccl-decode-gbk-char'.
;; A character of any other charset produces no output.
;;
;; Fix: the docstring said "CCC" instead of "CCL".
(define-ccl-program ccl-encode-gbk-char
  `(1
    (loop
     (read-multibyte-character r0 r1)
     ;; One-byte output.
     (if (r0 == ,(charset-id 'ascii))
         ((write r1)
          (repeat))
       (if (r0 == ,(charset-id 'eight-bit-control))
           ((write r1)
            (repeat))
         (if (r0 == ,(charset-id 'eight-bit-graphic))
             ((write r1)
              (repeat))
           (r1 = r1))))
     ;; chinese-gb2312: both positions move into A1-FE.
     (if (r0 == ,(charset-id 'chinese-gb2312))
         ((r2 = (((r1 & #x7f) - 32) + #xa0))
          (r1 = (((r1 >> 7) - 32) + #xa0))
          (write r1 r2))
       ;; chinese-gbk-1: rebuild the linear index from the two
       ;; 94-based positions, then split it into lead 81-A0 and a
       ;; trail byte 40-FE that skips 7F.
       (if (r0 == ,(charset-id 'chinese-gbk-1))
           ((r2 = ((r1 & #x7f) - 33))
            (r1 = ((r1 >> 7) - 33))
            (r3 = ((r1 * 94) + r2))
            (r1 = ((r3 / 190) + #x81))
            (r2 = (r3 % 190))
            (if (r2 >= #x3f)
                (r2 += #x41)
              (r2 += #x40))
            (write r1 r2))
         ;; chinese-gbk-2: lead A1-FE, trail 40-A0 skipping 7F.
         (if (r0 == ,(charset-id 'chinese-gbk-2))
             ((r2 = (((r1 & #x7f) - 32) + #x40))
              (r1 = (((r1 >> 7) - 32) + #xa0))
              (if (r2 >= #x7f)
                  (r2 += 1)
                (r2 += 0))
              (write r1 r2))
           ;; Any other charset: write nothing, read the next character.
           (repeat))))
     (repeat)))
  "CCL GBK encoder.")
;; CCL program that converts a character's position codes to the code
;; point of a GBK-encoded font.
;;   In:  r0 = charset id (chinese-gb2312, chinese-gbk-1, or anything
;;             else, which is handled as chinese-gbk-2)
;;        r1 = position code 1, r2 = position code 2
;;   Out: r1 = font code point 1, r2 = font code point 2
;; r3 is scratch (linear index inside chinese-gbk-1).
(define-ccl-program ccl-encode-gbk-font
  `(0
    ((if (r0 == ,(charset-id 'chinese-gb2312))
         ;; chinese-gb2312: shift both positions up by #x80.
         ((r1 += #x80)
          (r2 += #x80))
       (if (r0 == ,(charset-id 'chinese-gbk-1))
           ;; chinese-gbk-1: linearise the 94x94 position, then split
           ;; into lead #x81.. and a trail byte that skips #x7f.
           ((r1 -= 33)
            (r2 -= 33)
            (r3 = ((r1 * 94) + r2))
            (r1 = ((r3 / 190) + #x81))
            (r2 = (r3 % 190))
            (if (r2 >= #x3f)
                (r2 += #x41)
              (r2 += #x40)))
         ;; chinese-gbk-2: lead moves up by #x80, trail starts at #x40
         ;; and skips #x7f.
         ((r1 += #x80)
          (r2 += 32)
          (if (r2 >= #x7f)
              (r2 += 1)
            (r2 += 0)))))))
  "CCL program to encode a GBK code to code point of GBK font.")
;; Put the GBK font encoder at the front of `font-ccl-encoder-alist',
;; keyed by the pattern "gbk".
(setq font-ccl-encoder-alist
      `(("gbk" . ,ccl-encode-gbk-font) ,@font-ccl-encoder-alist))
;;
;; Make GBK coding system
;; mime_charset?
;;
;; Define the `chinese-gbk' coding system on top of the two CCL
;; programs above, plus the aliases `gbk' and `cn-gbk'.
;;
;; Fix: `safe-charsets' used to list chinese-big5-1 and chinese-big5-2,
;; but `ccl-encode-gbk-char' has no branch for those charsets and
;; writes nothing for them.  Advertising them as safe would let Big5
;; text be saved as GBK with the characters silently dropped, so only
;; the charsets the encoder really handles are listed.
(make-coding-system
 'chinese-gbk 4 ?Z "GBK 8-bit encoding for Chinese"
 '(ccl-decode-gbk-char . ccl-encode-gbk-char)
 '((safe-charsets ascii chinese-gb2312 chinese-gbk-1 chinese-gbk-2)
   (valid-codes (0 . 255))))
; (charset-origin-alist (chinese-gbk-0 "GBK" ccl-encode-gbk-char)
; (chinese-gbk-1 "GBK" ccl-encode-gbk-char)
; (chinese-gbk-2 "GBK" ccl-encode-gbk-char))))
(define-coding-system-alias 'gbk 'chinese-gbk)
(define-coding-system-alias 'cn-gbk 'chinese-gbk)
(update-coding-systems-internal)
;;
;; fontset (see lisp/international/fontset.el)
;;
;; In the default fontset, map the three charsets that make up GBK to
;; the font spec (nil . "GBK").
(dolist (gbk-charset '(chinese-gb2312 chinese-gbk-1 chinese-gbk-2))
  (set-fontset-font "fontset-default"
                    (make-char gbk-charset)
                    (cons nil "GBK")))
;; Two font-name regexps, both extended with "gbk":
;; - `x-pixel-size-width-font-regexp': setting for suppressing
;;   XLoadQueryFont on big fonts.
;; - `vertical-centering-font-regexp': fonts that require vertical
;;   centering.
(setq x-pixel-size-width-font-regexp
      "gbk\\|gb2312\\|jisx0208\\|ksc5601\\|cns11643\\|big5"
      vertical-centering-font-regexp
      "gbk\\|gb2312\\|jisx0208\\|jisx0212\\|ksc5601\\|cns11643\\|big5")
;;
;; kbd handler for gbk
;;
(defun encoded-kbd-self-insert-gbk ()
  "Handle a GBK two-byte key sequence typed on an encoded keyboard.
`last-command-char' is taken as the lead byte and the next input
event as the trail byte.  The pair is decoded with the
`chinese-gbk' coding system and the resulting character(s) are put
at the front of `unread-command-events'.

The previous version called `ccl-decode-gbk-char' as a function.
`define-ccl-program' only defines a constant holding the compiled
program, so that call would signal a void-function error."
  (interactive)
  (let* ((gbk-bytes (string-make-unibyte
                     (string last-command-char (read-char-exclusive))))
         (decoded (decode-coding-string gbk-bytes 'chinese-gbk)))
    ;; `append' turns the decoded string into a list of characters.
    ;; An invalid pair decodes to more than one character; all of them
    ;; are re-queued in order.
    (setq unread-command-events
          (append decoded unread-command-events))))
;;
;; GBK language environment
;;
;; Register the "Chinese-GBK" language environment.  The alist names
;; its charsets, the coding systems it offers, their priority order
;; (chinese-gbk first), the input method, the feature to load, a
;; sample text and a documentation string.
;; NOTE(review): the trailing '("Chinese") is presumably the parent
;; group under which the environment is listed -- confirm against the
;; `set-language-info-alist' documentation.
(set-language-info-alist
"Chinese-GBK" '((charset chinese-gb2312 chinese-gbk-1 chinese-gbk-2
chinese-sisheng chinese-big5-1 chinese-big5-2)
(coding-system chinese-gbk chinese-iso-8bit chinese-big5
iso-2022-cn chinese-hz)
(coding-priority chinese-gbk chinese-iso-8bit chinese-big5
iso-2022-cn)
(input-method . "chinese-py-punct")
(features china-util)
(sample-text . "Chinese GBK (中文,普通话,汉语) 你好")
(documentation . "Support for Chinese GBK character set."))
'("Chinese"))
;; Announce the `gbk' feature once everything above has been evaluated.
(provide 'gbk)
;;; gbk.el ends here
---------8<---------8<---------o--------->8--------->8--------->8------
_______________________________________________
Bug-gnu-emacs mailing list
Bug-gnu-emacs@gnu.org
http://mail.gnu.org/mailman/listinfo/bug-gnu-emacs
--- End Message ---