[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
mm-util changes needed
From: |
Dave Love |
Subject: |
mm-util changes needed |
Date: |
06 Feb 2002 13:58:53 +0000 |
User-agent: |
Gnus/5.09 (Gnus v5.9.0) Emacs/21.1.80 |
Changes seem never to have been made to the release branch to allow
Gnus (or other users of the MIME package) to DTRT with encoding,
particularly when choosing a MIME charset to encode a message.
For instance, currently the charset may not be determined correctly
for CCL-based coding systems or when, say, iso-8859 unification is
done on encoding. Things have been done on the Emacs trunk which
probably address this, but I have a hard time following them without
proper log entries, and some of it looks bogus.
Here, again, are the changes I have for mm-util. I think the
important ones are to mm-mule-charset-to-mime-charset and
mm-charset-to-coding-system.
The ...-mule4 stuff was said to be necessary because of some Emacs 21
bug. I've never seen a problem running with the check for Mule 5
disabled, but any such bug needs to be rediscovered and fixed, not
covered up. (Also unibyte is likely to be significantly more
efficient for big binary data.)
2002-01-11 Dave Love <fx@gnu.org>
* mm-util.el (mm-mime-mule-charset-alist): Make it correct by
construction.
(mm-charset-synonym-alist): Remove windows-1252, windows-1250,
which are valid MIME charsets and should have proper coding
systems. Guard the other clauses, so that they're actually null
in Emacs.
(mm-mule-charset-to-mime-charset): Use
find-coding-systems-for-charsets.
(mm-charset-to-coding-system): Somewhat re-written.
(mm-enable-multibyte-mule4, mm-disable-multibyte-mule4)
(mm-with-unibyte-current-buffer-mule4): Don't treat Mule 5
specially.
(mm-find-mime-charset-region): Re-written to DTRT.
(mm-with-unibyte-current-buffer): Unwind-protect current buffer.
Index: mm-util.el
===================================================================
RCS file: /cvs/emacs/lisp/gnus/mm-util.el,v
retrieving revision 1.18
diff -u -p -c -r1.18 mm-util.el
cvs server: conflicting specifications of output style
*** mm-util.el 18 Sep 2001 14:59:24 -0000 1.18
--- mm-util.el 11 Jan 2002 19:15:52 -0000
***************
*** 32,37 ****
--- 32,39 ----
(or (and (fboundp 'coding-system-p) (coding-system-p sym))
(memq sym (mm-get-coding-system-list))))
+ ;; Fixme: some of the cars here aren't valid MIME charsets. That
+ ;; should only matter with XEmacs, though.
(defvar mm-mime-mule-charset-alist
`((us-ascii ascii)
(iso-8859-1 latin-iso8859-1)
***************
*** 84,89 ****
--- 86,105 ----
'safe-charsets))))))
"Alist of MIME-charset/MULE-charsets.")
+ ;; Correct by construction, but should be unnecessary:
+ (when (fboundp 'coding-system-list)
+ (setq mm-mime-mule-charset-alist
+ (apply
+ 'nconc
+ (mapcar
+ (lambda (cs)
+ (when (and (coding-system-get cs 'mime-charset)
+ (not (eq t (coding-system-get cs 'safe-charsets))))
+ (list (cons (coding-system-get cs 'mime-charset)
+ (delq 'ascii
+ (coding-system-get cs 'safe-charsets))))))
+ (sort-coding-systems (coding-system-list 'base-only))))))
+
(eval-and-compile
(mapcar
(lambda (elem)
***************
*** 137,154 ****
(setq mm-coding-system-list (mm-coding-system-list))))
(defvar mm-charset-synonym-alist
! `((big5 . cn-big5)
! (gb2312 . cn-gb-2312)
! ;; Windows-1252 is actually a superset of Latin-1. See also
! ;; `gnus-article-dumbquotes-map'.
! ,(unless (mm-coding-system-p 'windows-1252) ; should be defined eventually
! '(windows-1252 . iso-8859-1))
! ;; Windows-1250 is a variant of Latin-2 heavily used by Microsoft
! ;; Outlook users in Czech republic. Use this to allow reading of their
! ;; e-mails. cp1250 should be defined by M-x codepage-setup.
! ,(unless (mm-coding-system-p 'windows-1250) ; should be defined eventually
! '(windows-1250 . cp1250))
! (x-ctext . ctext))
"A mapping from invalid charset names to the real charset names.")
(defvar mm-binary-coding-system
--- 153,168 ----
(setq mm-coding-system-list (mm-coding-system-list))))
(defvar mm-charset-synonym-alist
! `(
! ;; Perfectly fine? A valid MIME name, anyhow.
! ,@(unless (mm-coding-system-p 'big5)
! '((big5 . cn-big5)))
! ;; Not in XEmacs, but it's not a proper MIME charset anyhow.
! ,@(unless (mm-coding-system-p 'x-ctext)
! '((x-ctext . ctext)))
! ;; Apparently not defined in Emacs 20, but is a valid MIME name.
! ,@(unless (mm-coding-system-p 'gb2312)
! '((gb2312 . cn-gb-2312))))
"A mapping from invalid charset names to the real charset names.")
(defvar mm-binary-coding-system
***************
*** 183,198 ****
;;; Functions:
(defun mm-mule-charset-to-mime-charset (charset)
"Return the MIME charset corresponding to the given Mule CHARSET."
! (let ((alist mm-mime-mule-charset-alist)
! out)
! (while alist
! (when (memq charset (cdar alist))
! (setq out (caar alist)
! alist nil))
! (pop alist))
! out))
(defun mm-charset-to-coding-system (charset &optional lbt)
"Return coding-system corresponding to CHARSET.
--- 197,221 ----
;;; Functions:
+ ;; This is only used in `mm-mime-charset' (see comment there) and
+ ;; should be eliminated, at least for Emacs. -- fx
(defun mm-mule-charset-to-mime-charset (charset)
"Return the MIME charset corresponding to the given Mule CHARSET."
! (if (fboundp 'find-coding-systems-for-charsets)
! (let (mime)
! (dolist (cs (find-coding-systems-for-charsets (list charset)))
! (unless mime
! (when cs
! (setq mime (coding-system-get cs 'mime-charset)))))
! mime)
! (let ((alist mm-mime-mule-charset-alist)
! out)
! (while alist
! (when (memq charset (cdar alist))
! (setq out (caar alist)
! alist nil))
! (pop alist))
! out)))
(defun mm-charset-to-coding-system (charset &optional lbt)
"Return coding-system corresponding to CHARSET.
*************** If optional argument LBT (`unix', `dos'
*** 201,209 ****
used as the line break code type of the coding system."
(when (stringp charset)
(setq charset (intern (downcase charset))))
- (setq charset
- (or (cdr (assq charset mm-charset-synonym-alist))
- charset))
(when lbt
(setq charset (intern (format "%s-%s" charset lbt))))
(cond
--- 224,229 ----
*************** used as the line break code type of the
*** 215,226 ****
'ascii)
;; Check to see whether we can handle this charset. (This depends
;; on there being some coding system matching each `mime-charset'
! ;; coding sysytem property defined, as there should be.)
! ((memq charset (mm-get-coding-system-list))
charset)
! ;; Nope.
! (t
! nil)))
(if (fboundp 'subst-char-in-string)
(defsubst mm-replace-chars-in-string (string from to)
--- 235,261 ----
'ascii)
;; Check to see whether we can handle this charset. (This depends
;; on there being some coding system matching each `mime-charset'
! ;; property defined, as there should be.)
! ((and (coding-system-p charset)
! ;;; Doing this would potentially weed out incorrect charsets.
! ;;; charset
! ;;; (eq charset (coding-system-get charset 'mime-charset))
! )
! charset)
! ;; Translate invalid charsets.
! ((coding-system-p (setq charset
! (cdr (assq charset
! mm-charset-synonym-alist))))
charset)
! ;; Last resort: search the coding system list for entries which
! ;; have the right mime-charset in case the canonical name isn't
! ;; defined (though it should be).
! ((let (cs)
! (dolist (c coding-system-list)
! (if (and (null cs)
! (eq charset (coding-system-get c 'mime-charset)))
! (setq cs c)))
! cs))))
(if (fboundp 'subst-char-in-string)
(defsubst mm-replace-chars-in-string (string from to)
*************** This is a no-op in XEmacs."
*** 252,271 ****
(when (fboundp 'set-buffer-multibyte)
(set-buffer-multibyte nil)))
(defsubst mm-enable-multibyte-mule4 ()
"Enable multibyte in the current buffer.
Only used in Emacs Mule 4."
(when (and (fboundp 'set-buffer-multibyte)
(boundp 'enable-multibyte-characters)
(default-value 'enable-multibyte-characters)
! (not (charsetp 'eight-bit-control)))
(set-buffer-multibyte t)))
(defsubst mm-disable-multibyte-mule4 ()
"Disable multibyte in the current buffer.
Only used in Emacs Mule 4."
(when (and (fboundp 'set-buffer-multibyte)
! (not (charsetp 'eight-bit-control)))
(set-buffer-multibyte nil)))
(defun mm-preferred-coding-system (charset)
--- 287,312 ----
(when (fboundp 'set-buffer-multibyte)
(set-buffer-multibyte nil)))
+ ;; The clauses in the -mule4 functions are commented-out, since they
+ ;; should only make things less speed and space efficient in Emacs 21
+ ;; -- eight-bit-control characters have a leading byte. -- fx
+
(defsubst mm-enable-multibyte-mule4 ()
"Enable multibyte in the current buffer.
Only used in Emacs Mule 4."
(when (and (fboundp 'set-buffer-multibyte)
(boundp 'enable-multibyte-characters)
(default-value 'enable-multibyte-characters)
! ;; (not (charsetp 'eight-bit-control))
! )
(set-buffer-multibyte t)))
(defsubst mm-disable-multibyte-mule4 ()
"Disable multibyte in the current buffer.
Only used in Emacs Mule 4."
(when (and (fboundp 'set-buffer-multibyte)
! ;; (not (charsetp 'eight-bit-control))
! )
(set-buffer-multibyte nil)))
(defun mm-preferred-coding-system (charset)
*************** If the charset is `composition', return
*** 307,312 ****
--- 348,361 ----
'latin-iso8859-1)))
mail-parse-mule-charset)))))))
+ ;; This should be eliminated, at least for Emacs. It isn't
+ ;; appropriate -- and shouldn't be necessary -- to deal in terms of
+ ;; Mule charsets. MIME charsets should be determined on the basis of
+ ;; applicable coding systems; see the comment in
+ ;; `mm-find-mime-charset-region'. (The `preferred-coding-system'
+ ;; property of charsets shouldn't be depended on; it's only intended
+ ;; as information for a user, though users aren't supposed to worry
+ ;; about Mule charsets.) -- fx
(defun mm-mime-charset (charset)
"Return the MIME charset corresponding to the given Mule CHARSET."
(if (and (fboundp 'coding-system-get) (fboundp 'get-charset-property))
*************** If the charset is `composition', return
*** 330,350 ****
(setq result (cons head result)))
(nreverse result)))
! (defun mm-find-mime-charset-region (b e)
! "Return the MIME charsets needed to encode the region between B and E."
! (let ((charsets (mapcar 'mm-mime-charset
! (delq 'ascii
! (mm-find-charset-region b e)))))
! (when (memq 'iso-2022-jp-2 charsets)
! (setq charsets (delq 'iso-2022-jp charsets)))
! (setq charsets (mm-delete-duplicates charsets))
! (if (and (> (length charsets) 1)
! (fboundp 'find-coding-systems-region)
! (let ((cs (find-coding-systems-region b e)))
! (or (memq 'utf-8 cs) (memq 'mule-utf-8 cs))))
! '(utf-8)
! charsets)))
!
(defsubst mm-multibyte-p ()
"Say whether multibyte is enabled."
(if (and (not (featurep 'xemacs))
--- 379,386 ----
(setq result (cons head result)))
(nreverse result)))
! ;; It's not clear whether this is supposed to mean the global or local
! ;; setting. I think it's used inconsistently. -- fx
(defsubst mm-multibyte-p ()
"Say whether multibyte is enabled."
(if (and (not (featurep 'xemacs))
*************** If the charset is `composition', return
*** 352,357 ****
--- 388,429 ----
enable-multibyte-characters
(featurep 'mule)))
+ (defun mm-find-mime-charset-region (b e)
+ "Return the MIME charsets needed to encode the region between B and E.
+ Nil means ASCII, a single-element list represents an appropriate MIME
+ charset, and a longer list means no appropriate charset."
+ ;; The return possibilities of this function are a mess...
+ (or (and
+ (mm-multibyte-p)
+ ;; How are you supposed to do this in XEmacs?
+ (fboundp 'find-coding-systems-region)
+ ;; Find the mime-charset of the most preferred coding
+ ;; system that has one.
+ (let ((systems (find-coding-systems-region b e))
+ result)
+ ;; Fixme: The `mime-charset' (`x-ctext') of `compound-text'
+ ;; is not in the IANA list.
+ (setq systems (delq 'compound-text systems))
+ (unless (equal systems '(undecided))
+ (while systems
+ (let ((cs (coding-system-get (pop systems) 'mime-charset)))
+ (if cs
+ (setq systems nil
+ result (list cs))))))
+ result))
+ ;; Otherwise we're not multibyte or a single coding system won't
+ ;; cover it.
+ ;; This isn't really right. If it wants actually to find a list
+ ;; of MIME charsets, it ought to recurse into a binary chop,
+ ;; finding encodable regions. -- fx
+ (mm-delete-duplicates
+ (mapcar 'mm-mime-charset
+ ;; Why on earth delete iso-2022-jp?
+ ;;; (delq 'iso-2022-jp ; ??
+ ;;; (delq 'ascii
+ ;;; (mm-find-charset-region b e)))
+ (delq 'ascii (mm-find-charset-region b e))))))
+
(defmacro mm-with-unibyte-buffer (&rest forms)
"Create a temporary buffer, and evaluate FORMS there like `progn'.
Use unibyte mode for this."
*************** Use unibyte mode for this."
*** 361,376 ****
(put 'mm-with-unibyte-buffer 'edebug-form-spec '(body))
(defmacro mm-with-unibyte-current-buffer (&rest forms)
! "Evaluate FORMS with current current buffer temporarily made unibyte.
Also bind `default-enable-multibyte-characters' to nil.
Equivalent to `progn' in XEmacs"
! (let ((multibyte (make-symbol "multibyte")))
`(if (fboundp 'set-buffer-multibyte)
! (let ((,multibyte enable-multibyte-characters))
(unwind-protect
(let (default-enable-multibyte-characters)
(set-buffer-multibyte nil)
,@forms)
(set-buffer-multibyte ,multibyte)))
(progn
,@forms))))
--- 433,451 ----
(put 'mm-with-unibyte-buffer 'edebug-form-spec '(body))
(defmacro mm-with-unibyte-current-buffer (&rest forms)
! "Evaluate FORMS with current buffer temporarily made unibyte.
Also bind `default-enable-multibyte-characters' to nil.
Equivalent to `progn' in XEmacs"
! (let ((multibyte (make-symbol "multibyte"))
! (buffer (make-symbol "buffer")))
`(if (fboundp 'set-buffer-multibyte)
! (let ((,multibyte enable-multibyte-characters)
! (,buffer (current-buffer)))
(unwind-protect
(let (default-enable-multibyte-characters)
(set-buffer-multibyte nil)
,@forms)
+ (set-buffer ,buffer)
(set-buffer-multibyte ,multibyte)))
(progn
,@forms))))
*************** Equivalent to `progn' in XEmacs"
*** 381,389 ****
"Evaluate FORMS there like `progn' in current buffer.
Mule4 only."
(let ((multibyte (make-symbol "multibyte")))
! `(if (or (featurep 'xemacs)
! (not (fboundp 'set-buffer-multibyte))
! (charsetp 'eight-bit-control)) ;; For Emacs Mule 4 only.
(progn
,@forms)
(let ((,multibyte (default-value 'enable-multibyte-characters)))
--- 456,464 ----
"Evaluate FORMS there like `progn' in current buffer.
Mule4 only."
(let ((multibyte (make-symbol "multibyte")))
! `(if (or (not (fboundp 'set-buffer-multibyte))
! ;; (charsetp 'eight-bit-control)
! ) ;; For Emacs Mule 4 only.
(progn
,@forms)
(let ((,multibyte (default-value 'enable-multibyte-characters)))
- mm-util changes needed,
Dave Love <=