Line data Source code
1 : ;;; mule-util.el --- utility functions for multilingual environment (mule) -*- lexical-binding:t -*-
2 :
3 : ;; Copyright (C) 1997-1998, 2000-2017 Free Software Foundation, Inc.
4 : ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 : ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
6 : ;; National Institute of Advanced Industrial Science and Technology (AIST)
7 : ;; Registration Number H14PRO021
8 : ;; Copyright (C) 2003
9 : ;; National Institute of Advanced Industrial Science and Technology (AIST)
10 : ;; Registration Number H13PRO009
11 :
12 : ;; Keywords: mule, multilingual
13 :
14 : ;; This file is part of GNU Emacs.
15 :
16 : ;; GNU Emacs is free software: you can redistribute it and/or modify
17 : ;; it under the terms of the GNU General Public License as published by
18 : ;; the Free Software Foundation, either version 3 of the License, or
19 : ;; (at your option) any later version.
20 :
21 : ;; GNU Emacs is distributed in the hope that it will be useful,
22 : ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23 : ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 : ;; GNU General Public License for more details.
25 :
26 : ;; You should have received a copy of the GNU General Public License
27 : ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
28 :
29 : ;;; Commentary:
30 :
31 : ;;; Code:
32 :
33 : ;;; String manipulations while paying attention to multibyte characters.
34 :
35 : ;;;###autoload
36 : (defun store-substring (string idx obj)
37 : "Embed OBJ (string or character) at index IDX of STRING."
38 0 : (if (integerp obj)
39 0 : (aset string idx obj)
40 0 : (let ((len1 (length obj))
41 : (i 0))
42 0 : (while (< i len1)
43 0 : (aset string (+ idx i) (aref obj i))
44 0 : (setq i (1+ i)))))
45 0 : string)
46 :
47 : (defvar truncate-string-ellipsis "..." ;"…"
48 : "String to use to indicate truncation.
49 : Serves as default value of ELLIPSIS argument to `truncate-string-to-width'.")
50 :
51 : ;;;###autoload
52 : (defun truncate-string-to-width (str end-column
53 : &optional start-column padding ellipsis)
54 : "Truncate string STR to end at column END-COLUMN.
55 : The optional 3rd arg START-COLUMN, if non-nil, specifies the starting
56 : column; that means to return the characters occupying columns
57 : START-COLUMN ... END-COLUMN of STR. Both END-COLUMN and START-COLUMN
58 : are specified in terms of character display width in the current
59 : buffer; see also `char-width'.
60 :
61 : The optional 4th arg PADDING, if non-nil, specifies a padding
62 : character (which should have a display width of 1) to add at the end
63 : of the result if STR doesn't reach column END-COLUMN, or if END-COLUMN
64 : comes in the middle of a character in STR. PADDING is also added at
65 : the beginning of the result if column START-COLUMN appears in the
66 : middle of a character in STR.
67 :
68 : If PADDING is nil, no padding is added in these cases, so
69 : the resulting string may be narrower than END-COLUMN.
70 :
71 : If ELLIPSIS is non-nil, it should be a string which will replace the
72 : end of STR (including any padding) if it extends beyond END-COLUMN,
73 : unless the display width of STR is equal to or less than the display
74 : width of ELLIPSIS. If it is non-nil and not a string, then ELLIPSIS
75 : defaults to `truncate-string-ellipsis'."
76 0 : (or start-column
77 0 : (setq start-column 0))
78 0 : (when (and ellipsis (not (stringp ellipsis)))
79 0 : (setq ellipsis truncate-string-ellipsis))
80 0 : (let ((str-len (length str))
81 0 : (str-width (string-width str))
82 0 : (ellipsis-width (if ellipsis (string-width ellipsis) 0))
83 : (idx 0)
84 : (column 0)
85 : (head-padding "") (tail-padding "")
86 : ch last-column last-idx from-idx)
87 0 : (condition-case nil
88 0 : (while (< column start-column)
89 0 : (setq ch (aref str idx)
90 0 : column (+ column (char-width ch))
91 0 : idx (1+ idx)))
92 0 : (args-out-of-range (setq idx str-len)))
93 0 : (if (< column start-column)
94 0 : (if padding (make-string end-column padding) "")
95 0 : (when (and padding (> column start-column))
96 0 : (setq head-padding (make-string (- column start-column) padding)))
97 0 : (setq from-idx idx)
98 0 : (when (>= end-column column)
99 0 : (if (and (< end-column str-width)
100 0 : (> str-width ellipsis-width))
101 0 : (setq end-column (- end-column ellipsis-width))
102 0 : (setq ellipsis ""))
103 0 : (condition-case nil
104 0 : (while (< column end-column)
105 0 : (setq last-column column
106 0 : last-idx idx
107 0 : ch (aref str idx)
108 0 : column (+ column (char-width ch))
109 0 : idx (1+ idx)))
110 0 : (args-out-of-range (setq idx str-len)))
111 0 : (when (> column end-column)
112 0 : (setq column last-column
113 0 : idx last-idx))
114 0 : (when (and padding (< column end-column))
115 0 : (setq tail-padding (make-string (- end-column column) padding))))
116 0 : (concat head-padding (substring str from-idx idx)
117 0 : tail-padding ellipsis))))
118 :
119 :
120 : ;;; Nested alist handler.
121 : ;; Nested alist is alist whose elements are also nested alist.
122 :
123 : ;;;###autoload
124 : (defsubst nested-alist-p (obj)
125 : "Return t if OBJ is a nested alist.
126 :
127 : Nested alist is a list of the form (ENTRY . BRANCHES), where ENTRY is
128 : any Lisp object, and BRANCHES is a list of cons cells of the form
129 : \(KEY-ELEMENT . NESTED-ALIST).
130 :
131 : You can use a nested alist to store any Lisp object (ENTRY) for a key
132 : sequence KEYSEQ, where KEYSEQ is a sequence of KEY-ELEMENT. KEYSEQ
133 : can be a string, a vector, or a list."
134 : (and obj (listp obj) (listp (cdr obj))))
135 :
136 : ;;;###autoload
137 : (defun set-nested-alist (keyseq entry alist &optional len branches)
138 : "Set ENTRY for KEYSEQ in a nested alist ALIST.
139 : Optional 4th arg LEN non-nil means the first LEN elements in KEYSEQ
140 : are considered.
141 : Optional 5th argument BRANCHES if non-nil is branches for a keyseq
142 : longer than KEYSEQ.
143 : See the documentation of `nested-alist-p' for more detail."
144 0 : (or (nested-alist-p alist)
145 0 : (error "Invalid argument %s" alist))
146 0 : (let ((islist (listp keyseq))
147 0 : (len (or len (length keyseq)))
148 : (i 0)
149 : key-elt slot)
150 0 : (while (< i len)
151 0 : (if (null (nested-alist-p alist))
152 0 : (error "Keyseq %s is too long for this nested alist" keyseq))
153 0 : (setq key-elt (if islist (nth i keyseq) (aref keyseq i)))
154 0 : (setq slot (assoc key-elt (cdr alist)))
155 0 : (unless slot
156 0 : (setq slot (cons key-elt (list t)))
157 0 : (setcdr alist (cons slot (cdr alist))))
158 0 : (setq alist (cdr slot))
159 0 : (setq i (1+ i)))
160 0 : (setcar alist entry)
161 0 : (if branches
162 0 : (setcdr (last alist) branches))))
163 :
164 : ;;;###autoload
165 : (defun lookup-nested-alist (keyseq alist &optional len start nil-for-too-long)
166 : "Look up key sequence KEYSEQ in nested alist ALIST. Return the definition.
167 : Optional 3rd argument LEN specifies the length of KEYSEQ.
168 : Optional 4th argument START specifies index of the starting key.
169 : The returned value is normally a nested alist of which
170 : car part is the entry for KEYSEQ.
171 : If ALIST is not deep enough for KEYSEQ, return number which is
172 : how many key elements at the front of KEYSEQ it takes
173 : to reach a leaf in ALIST.
174 : Optional 5th argument NIL-FOR-TOO-LONG non-nil means return nil
175 : even if ALIST is not deep enough."
176 0 : (or (nested-alist-p alist)
177 0 : (error "Invalid argument %s" alist))
178 0 : (or len
179 0 : (setq len (length keyseq)))
180 0 : (let ((i (or start 0)))
181 0 : (if (catch 'lookup-nested-alist-tag
182 0 : (if (listp keyseq)
183 0 : (while (< i len)
184 0 : (if (setq alist (cdr (assoc (nth i keyseq) (cdr alist))))
185 0 : (setq i (1+ i))
186 0 : (throw 'lookup-nested-alist-tag t))))
187 0 : (while (< i len)
188 0 : (if (setq alist (cdr (assoc (aref keyseq i) (cdr alist))))
189 0 : (setq i (1+ i))
190 0 : (throw 'lookup-nested-alist-tag t))))
191 : ;; KEYSEQ is too long.
192 0 : (if nil-for-too-long nil i)
193 0 : alist)))
194 :
195 :
196 : ;; Coding system related functions.
197 :
198 : ;;;###autoload
199 : (defun coding-system-post-read-conversion (coding-system)
200 : "Return the value of CODING-SYSTEM's `post-read-conversion' property."
201 0 : (coding-system-get coding-system :post-read-conversion))
202 :
203 : ;;;###autoload
204 : (defun coding-system-pre-write-conversion (coding-system)
205 : "Return the value of CODING-SYSTEM's `pre-write-conversion' property."
206 0 : (coding-system-get coding-system :pre-write-conversion))
207 :
208 : ;;;###autoload
209 : (defun coding-system-translation-table-for-decode (coding-system)
210 : "Return the value of CODING-SYSTEM's `decode-translation-table' property."
211 0 : (coding-system-get coding-system :decode-translation-table))
212 :
213 : ;;;###autoload
214 : (defun coding-system-translation-table-for-encode (coding-system)
215 : "Return the value of CODING-SYSTEM's `encode-translation-table' property."
216 0 : (coding-system-get coding-system :encode-translation-table))
217 :
218 : ;;;###autoload
219 : (defmacro with-coding-priority (coding-systems &rest body)
220 : "Execute BODY like `progn' with CODING-SYSTEMS at the front of priority list.
221 : CODING-SYSTEMS is a list of coding systems. See `set-coding-system-priority'.
222 : This affects the implicit sorting of lists of coding systems returned by
223 : operations such as `find-coding-systems-region'."
224 1 : (let ((current (make-symbol "current")))
225 1 : `(let ((,current (coding-system-priority-list)))
226 1 : (apply #'set-coding-system-priority ,coding-systems)
227 : (unwind-protect
228 1 : (progn ,@body)
229 1 : (apply #'set-coding-system-priority ,current)))))
230 : ;;;###autoload(put 'with-coding-priority 'lisp-indent-function 1)
231 : (put 'with-coding-priority 'edebug-form-spec t)
232 :
233 : ;;;###autoload
234 : (defmacro detect-coding-with-priority (from to priority-list)
235 : "Detect a coding system of the text between FROM and TO with PRIORITY-LIST.
236 : PRIORITY-LIST is an alist of coding categories vs the corresponding
237 : coding systems ordered by priority."
238 : (declare (obsolete with-coding-priority "23.1"))
239 0 : `(with-coding-priority (mapcar #'cdr ,priority-list)
240 0 : (detect-coding-region ,from ,to)))
241 :
242 : ;;;###autoload
243 : (defun detect-coding-with-language-environment (from to lang-env)
244 : "Detect a coding system for the text between FROM and TO with LANG-ENV.
245 : The detection takes into account the coding system priorities for the
246 : language environment LANG-ENV."
247 0 : (let ((coding-priority (get-language-info lang-env 'coding-priority)))
248 0 : (if coding-priority
249 0 : (with-coding-priority coding-priority
250 0 : (detect-coding-region from to)))))
251 :
252 : (declare-function internal-char-font "font.c" (position &optional ch))
253 :
254 : ;;;###autoload
255 : (defun char-displayable-p (char)
256 : "Return non-nil if we should be able to display CHAR.
257 : On a multi-font display, the test is only whether there is an
258 : appropriate font from the selected frame's fontset to display
259 : CHAR's charset in general. Since fonts may be specified on a
260 : per-character basis, this may not be accurate."
261 0 : (cond ((< char 128)
262 : ;; ASCII characters are always displayable.
263 : t)
264 0 : ((not enable-multibyte-characters)
265 : ;; Maybe there's a font for it, but we can't put it in the buffer.
266 : nil)
267 : (t
268 0 : (let ((font-glyph (internal-char-font nil char)))
269 0 : (if font-glyph
270 0 : (if (consp font-glyph)
271 : ;; On a window system, a character is displayable
272 : ;; if a font for that character is in the default
273 : ;; face of the currently selected frame.
274 0 : (car font-glyph)
275 : ;; On a text terminal supporting glyph codes, CHAR is
276 : ;; displayable if its glyph code is nonnegative.
277 0 : (<= 0 font-glyph))
278 : ;; On a text terminal without glyph codes, CHAR is displayable
279 : ;; if the coding system for the terminal can encode it.
280 0 : (let ((coding (terminal-coding-system)))
281 0 : (when coding
282 0 : (let ((cs-list (coding-system-get coding :charset-list)))
283 0 : (cond
284 0 : ((listp cs-list)
285 0 : (catch 'tag
286 0 : (mapc #'(lambda (charset)
287 0 : (if (encode-char char charset)
288 0 : (throw 'tag charset)))
289 0 : cs-list)
290 0 : nil))
291 0 : ((eq cs-list 'iso-2022)
292 0 : (catch 'tag2
293 0 : (mapc #'(lambda (charset)
294 0 : (if (and (plist-get (charset-plist charset)
295 0 : :iso-final-char)
296 0 : (encode-char char charset))
297 0 : (throw 'tag2 charset)))
298 0 : charset-list)
299 0 : nil))
300 0 : ((eq cs-list 'emacs-mule)
301 0 : (catch 'tag3
302 0 : (mapc #'(lambda (charset)
303 0 : (if (and (plist-get (charset-plist charset)
304 0 : :emacs-mule-id)
305 0 : (encode-char char charset))
306 0 : (throw 'tag3 charset)))
307 0 : charset-list)
308 0 : nil)))))))))))
309 :
310 : (defun filepos-to-bufferpos--dos (byte f)
311 0 : (let ((eol-offset 0)
312 : ;; Make sure we terminate, even if BYTE falls right in the middle
313 : ;; of a CRLF or some other weird corner case.
314 0 : (omin 0) (omax most-positive-fixnum)
315 : pos lines)
316 0 : (while
317 0 : (progn
318 0 : (setq pos (funcall f (- byte eol-offset)))
319 : ;; Protect against accidental values of BYTE outside of the
320 : ;; valid region.
321 0 : (when (null pos)
322 0 : (if (<= byte eol-offset)
323 0 : (setq pos (point-min))
324 0 : (setq pos (point-max))))
325 : ;; Adjust POS for DOS EOL format.
326 0 : (setq lines (1- (line-number-at-pos pos)))
327 0 : (and (not (= lines eol-offset)) (> omax omin)))
328 0 : (if (> lines eol-offset)
329 0 : (setq omax (min (1- omax) lines)
330 0 : eol-offset omax)
331 0 : (setq omin (max (1+ omin) lines)
332 0 : eol-offset omin)))
333 0 : pos))
334 :
335 : ;;;###autoload
336 : (defun filepos-to-bufferpos (byte &optional quality coding-system)
337 : "Try to return the buffer position corresponding to a particular file position.
338 : The file position is given as a (0-based) BYTE count.
339 : The function presumes the file is encoded with CODING-SYSTEM, which defaults
340 : to `buffer-file-coding-system'.
341 : QUALITY can be:
342 : `approximate', in which case we may cut some corners to avoid
343 : excessive work.
344 : `exact', in which case we may end up re-(en/de)coding a large
345 : part of the file/buffer, this can be expensive and slow.
346 : nil, in which case we may return nil rather than an approximation."
347 0 : (unless coding-system (setq coding-system buffer-file-coding-system))
348 0 : (let ((eol (coding-system-eol-type coding-system))
349 0 : (type (coding-system-type coding-system))
350 0 : (base (coding-system-base coding-system))
351 0 : (pm (save-restriction (widen) (point-min))))
352 0 : (and (eq type 'utf-8)
353 : ;; Any post-read/pre-write conversions mean it's not really UTF-8.
354 0 : (not (null (coding-system-get coding-system :post-read-conversion)))
355 0 : (setq type 'not-utf-8))
356 0 : (and (memq type '(charset raw-text undecided))
357 : ;; The following are all of type 'charset', but they are
358 : ;; actually variable-width encodings.
359 0 : (not (memq base '(chinese-gbk chinese-gb18030 euc-tw euc-jis-2004
360 : korean-iso-8bit chinese-iso-8bit
361 : japanese-iso-8bit chinese-big5-hkscs
362 0 : japanese-cp932 korean-cp949)))
363 0 : (setq type 'single-byte))
364 0 : (pcase type
365 : (`utf-8
366 0 : (when (coding-system-get coding-system :bom)
367 0 : (setq byte (max 0 (- byte 3))))
368 0 : (if (= eol 1)
369 0 : (filepos-to-bufferpos--dos (+ pm byte) #'byte-to-position)
370 0 : (byte-to-position (+ pm byte))))
371 : (`single-byte
372 0 : (if (= eol 1)
373 0 : (filepos-to-bufferpos--dos (+ pm byte) #'identity)
374 0 : (+ pm byte)))
375 : ((and `utf-16
376 : ;; FIXME: For utf-16, we could use the same approach as used for
377 : ;; dos EOLs (counting the number of non-BMP chars instead of the
378 : ;; number of lines).
379 0 : (guard (not (eq quality 'exact))))
380 : ;; Account for BOM, which is always 2 bytes in UTF-16.
381 0 : (when (coding-system-get coding-system :bom)
382 0 : (setq byte (max 0 (- byte 2))))
383 : ;; In approximate mode, assume all characters are within the
384 : ;; BMP, i.e. take up 2 bytes.
385 0 : (setq byte (/ byte 2))
386 0 : (if (= eol 1)
387 0 : (filepos-to-bufferpos--dos (+ pm byte) #'identity)
388 0 : (+ pm byte)))
389 : (_
390 0 : (pcase quality
391 0 : (`approximate (byte-to-position (+ pm byte)))
392 : (`exact
393 : ;; Rather than assume that the file exists and still holds the right
394 : ;; data, we reconstruct it based on the buffer's content.
395 0 : (let ((buf (current-buffer)))
396 0 : (with-temp-buffer
397 0 : (set-buffer-multibyte nil)
398 0 : (let ((tmp-buf (current-buffer)))
399 0 : (with-current-buffer buf
400 0 : (save-restriction
401 0 : (widen)
402 : ;; Since encoding should always return more bytes than
403 : ;; there were chars, encoding all chars up to (+ byte pm)
404 : ;; guarantees the encoded result has at least `byte' bytes.
405 0 : (encode-coding-region pm (min (point-max) (+ pm byte))
406 0 : coding-system tmp-buf)))
407 0 : (+ pm (length
408 0 : (decode-coding-region (point-min)
409 0 : (min (point-max) (+ pm byte))
410 0 : coding-system t))))))))))))
411 : ;;;###autoload
412 : (defun bufferpos-to-filepos (position &optional quality coding-system)
413 : "Try to return the file byte corresponding to a particular buffer POSITION.
414 : Value is the file position given as a (0-based) byte count.
415 : The function presumes the file is encoded with CODING-SYSTEM, which defaults
416 : to `buffer-file-coding-system'.
417 : QUALITY can be:
418 : `approximate', in which case we may cut some corners to avoid
419 : excessive work.
420 : `exact', in which case we may end up re-(en/de)coding a large
421 : part of the file/buffer, this can be expensive and slow.
422 : nil, in which case we may return nil rather than an approximation."
423 0 : (unless coding-system (setq coding-system buffer-file-coding-system))
424 0 : (let* ((eol (coding-system-eol-type coding-system))
425 0 : (lineno (if (= eol 1) (1- (line-number-at-pos position)) 0))
426 0 : (type (coding-system-type coding-system))
427 0 : (base (coding-system-base coding-system))
428 : byte)
429 0 : (and (eq type 'utf-8)
430 : ;; Any post-read/pre-write conversions mean it's not really UTF-8.
431 0 : (not (null (coding-system-get coding-system :post-read-conversion)))
432 0 : (setq type 'not-utf-8))
433 0 : (and (memq type '(charset raw-text undecided))
434 : ;; The following are all of type 'charset', but they are
435 : ;; actually variable-width encodings.
436 0 : (not (memq base '(chinese-gbk chinese-gb18030 euc-tw euc-jis-2004
437 : korean-iso-8bit chinese-iso-8bit
438 : japanese-iso-8bit chinese-big5-hkscs
439 0 : japanese-cp932 korean-cp949)))
440 0 : (setq type 'single-byte))
441 0 : (pcase type
442 : (`utf-8
443 0 : (setq byte (position-bytes position))
444 0 : (when (null byte)
445 0 : (if (<= position 0)
446 0 : (setq byte 1)
447 0 : (setq byte (position-bytes (point-max)))))
448 0 : (setq byte (1- byte))
449 0 : (+ byte
450 : ;; Account for BOM, if any.
451 0 : (if (coding-system-get coding-system :bom) 3 0)
452 : ;; Account for CR in CRLF pairs.
453 0 : lineno))
454 : (`single-byte
455 0 : (+ position -1 lineno))
456 : ((and `utf-16
457 : ;; FIXME: For utf-16, we could use the same approach as used for
458 : ;; dos EOLs (counting the number of non-BMP chars instead of the
459 : ;; number of lines).
460 0 : (guard (not (eq quality 'exact))))
461 : ;; In approximate mode, assume all characters are within the
462 : ;; BMP, i.e. each one takes up 2 bytes.
463 0 : (+ (* (1- position) 2)
464 : ;; Account for BOM, if any.
465 0 : (if (coding-system-get coding-system :bom) 2 0)
466 : ;; Account for CR in CRLF pairs.
467 0 : lineno))
468 : (_
469 0 : (pcase quality
470 0 : (`approximate (+ (position-bytes position) -1 lineno))
471 : (`exact
472 : ;; Rather than assume that the file exists and still holds the right
473 : ;; data, we reconstruct its relevant portion.
474 0 : (let ((buf (current-buffer)))
475 0 : (with-temp-buffer
476 0 : (set-buffer-multibyte nil)
477 0 : (let ((tmp-buf (current-buffer)))
478 0 : (with-current-buffer buf
479 0 : (save-restriction
480 0 : (widen)
481 0 : (encode-coding-region (point-min) (min (point-max) position)
482 0 : coding-system tmp-buf)))
483 0 : (1- (point-max)))))))))))
484 :
485 : (provide 'mule-util)
486 :
487 : ;; Local Variables:
488 : ;; coding: utf-8
489 : ;; End:
490 :
491 : ;;; mule-util.el ends here
|