[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/pyim 8ae9ad9 10/36: New file pyim-cstring.el
From: ELPA Syncer
Subject: [elpa] externals/pyim 8ae9ad9 10/36: New file pyim-cstring.el
Date: Thu, 22 Apr 2021 22:57:16 -0400 (EDT)
branch: externals/pyim
commit 8ae9ad9c2ea6857b1d443f44d81d712562933689
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>
New file pyim-cstring.el
* pyim-cstring.el (pyim-cstring-at-point)
(pyim-cstring-split-to-list, pyim-cstring-split-to-string)
(pyim-cstring-split-to-string-1, pyim-cstring-split-buffer): Move from
pyim.el
---
pyim-cstring.el | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
pyim.el | 168 ---------------------------------------------
2 files changed, 206 insertions(+), 168 deletions(-)
diff --git a/pyim-cstring.el b/pyim-cstring.el
new file mode 100644
index 0000000..db8c421
--- /dev/null
+++ b/pyim-cstring.el
@@ -0,0 +1,206 @@
+;;; pyim-cstring.el --- Chinese string tools for pyim. -*- lexical-binding: t; -*-
+
+;; * Header
+;; Copyright (C) 2021 Free Software Foundation, Inc.
+
+;; Author: Feng Shu <tumashu@163.com>
+;; Maintainer: Feng Shu <tumashu@163.com>
+;; URL: https://github.com/tumashu/pyim
+;; Keywords: convenience, Chinese, pinyin, input-method
+;; Package-Requires: ((emacs "24.4") (async "1.6") (xr "1.13"))
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;;; Code:
+;; * 代码 :code:
+(require 'cl-lib)
+
+(defun pyim-cstring-at-point (&optional number)
+  "Return the NUMBER Chinese characters just before point, or nil.
+Nil is returned when NUMBER is omitted or is not a non-negative
+integer, when fewer than NUMBER characters precede point in the
+accessible part of the buffer, or when one of those characters
+does not belong to Emacs character category C."
+  (when (natnump number)
+    (let ((end (point))
+          ;; Clamp to `point-min', not 0, so a narrowed buffer is safe.
+          (begin (max (- (point) number) (point-min))))
+      (let ((string (buffer-substring-no-properties begin end)))
+        (when (and (= (length string) number)
+                   (not (pyim-string-match-p "\\CC" string)))
+          string)))))
+
+(defun pyim-cstring-split-to-list (chinese-string &optional max-word-length delete-dups prefer-short-word)
+ "Segment CHINESE-STRING into words found in the pyim dictionary.
+Return a list whose elements are lists (WORD BEGIN END): WORD is a
+word found in CHINESE-STRING, BEGIN is its 1-based start position
+and END is the position just after its last character.  Candidates
+longer than MAX-WORD-LENGTH characters (default 6) are skipped;
+note that larger values make segmentation slower.
+
+If DELETE-DUPS is non-nil, keep only one way of splitting the string.
+For example:
+
+ 我爱北京天安门 => 我爱 北京 天安门
+
+If PREFER-SHORT-WORD is non-nil, prefer shorter words when removing duplicates.
+
+Notes:
+1. Words are found by brute-force matching, so Chinese words that are
+ not in the pyim dictionary *cannot be detected*.
+2. This function is slow: it suits short Chinese sentences, not whole
+ articles.  By the author's estimate 20 characters take about 0.3s
+ and 40 take 1s, and the time grows rapidly with string length."
+ ;; (("天安" 5 7)
+ ;; 我爱北京天安门 -> ("天安门" 5 8)
+ ;; ("北京" 3 5)
+ ;; ("我爱" 1 3))
+ (cl-labels
+ ((get-possible-words-internal
+ ;; Internal helper; it works like this (positions omitted):
+ ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab")
+ (my-list number)
+ (cond
+ ((< (length my-list) 2) nil)
+ (t (append
+ (let* ((str (mapconcat #'identity my-list ""))
+ (length (length str)))
+ (when (<= length (or max-word-length 6))
+ (list (list str number (+ number length)))))
+ (get-possible-words-internal
+ (reverse (cdr (reverse my-list))) number)))))
+ (get-possible-words
+ ;; Internal helper; it works like this (positions omitted):
+ ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab" "bcd" "bc" "cd")
+ (my-list number)
+ (cond
+ ((null my-list) nil)
+ (t (append (get-possible-words-internal my-list number)
+ (get-possible-words (cdr my-list) (1+ number)))))))
+
+ ;; Load the pyim dictionary cache if it has not been loaded yet,
+ ;; so that `pyim-dcache-get' works.
+ (pyim-dcache-init-variables)
+
+ (let ((string-alist
+ (get-possible-words
+ (mapcar #'char-to-string
+ (string-to-vector chinese-string))
+ 1))
+ result)
+ (dolist (string-list string-alist)
+ (let ((pinyin-list (pyim-hanzi2pinyin (car string-list) nil "-" t)))
+ (dolist (pinyin pinyin-list)
+ (let ((words (pyim-dcache-get pinyin '(code2word)))) ; skipping the personal dictionary is faster
+ (dolist (word words)
+ (when (equal word (car string-list))
+ (push string-list result)))))))
+
+ (if delete-dups
+ (cl-delete-duplicates
+ ;; Two entries conflict when their position ranges
+ ;; overlap; when they do, keep only one of them
+ ;; and delete the others.
+ result
+ :test #'(lambda (x1 x2)
+ (let ((begin1 (nth 1 x1))
+ (begin2 (nth 1 x2))
+ (end1 (nth 2 x1))
+ (end2 (nth 2 x2)))
+ (not (or (<= end1 begin2)
+ (<= end2 begin1)))))
+ :from-end prefer-short-word)
+ result))))
+
+;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
+;; (benchmark 1 '(pyim-cstring-split-to-list str)))
+
+;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
+;; (pyim-cstring-split-to-list str))
+
+(defun pyim-cstring-split-to-string (string &optional prefer-short-word
+                                            separator max-word-length)
+  "Segment the Chinese text in STRING and return the result as a string.
+SEPARATOR (a space by default) is inserted at word boundaries inside
+each run of Chinese characters.  Longer words win unless
+PREFER-SHORT-WORD is non-nil.  MAX-WORD-LENGTH (default 6) caps the
+word length.  Each run of non-Chinese characters is copied verbatim
+with a space on both sides."
+  (let ((len (length string)) (start 0) pieces)
+    ;; Scan run by run; an in-band "@@@@" marker would eat a literal "@@@@".
+    (save-match-data
+      (while (< start len)
+        (let* ((pos (string-match "\\CC+" string start))
+               (end (cond ((null pos) len)
+                          ((= pos start) (match-end 0))
+                          (t pos)))
+               (run (substring string start end)))
+          (push (if (eq pos start)
+                    (concat " " run " ")
+                  (pyim-cstring-split-to-string-1
+                   run prefer-short-word separator max-word-length))
+                pieces)
+          (setq start end))))
+    (apply #'concat (nreverse pieces))))
+
+(defun pyim-cstring-split-to-string-1 (chinese-string &optional prefer-short-word
+                                                      separator max-word-length)
+  "Insert SEPARATOR at the word boundaries of CHINESE-STRING.
+Internal helper of `pyim-cstring-split-to-string'.  The words come
+from `pyim-cstring-split-to-list', called with MAX-WORD-LENGTH and
+PREFER-SHORT-WORD and with overlapping words removed.  SEPARATOR
+defaults to a single space and is placed before every character
+whose 1-based position equals the BEGIN or END of a word."
+  (let* ((sep (or separator " "))
+         (entries (pyim-cstring-split-to-list
+                   chinese-string max-word-length t prefer-short-word))
+         ;; Start and end positions of every word, ascending, no repeats.
+         (boundaries
+          (cl-delete-duplicates
+           (sort (cl-loop for entry in entries
+                          collect (nth 2 entry) collect (nth 1 entry))
+                 #'<)))
+         (index 0)
+         pieces)
+    (while (< index (length chinese-string))
+      (when (member (1+ index) boundaries) (push sep pieces))
+      (push (substring chinese-string index (1+ index)) pieces)
+      (setq index (1+ index)))
+    (apply #'concat (nreverse pieces))))
+
+(defun pyim-cstring-split-buffer ()
+  "Segment the Chinese text of the current buffer, line by line.
+Each line is replaced by the result of `pyim-cstring-split-to-string'
+plus a newline; point ends up at the beginning of the buffer."
+  (interactive)
+  (message "分词开始!")
+  (goto-char (point-min))
+  (while (not (eobp))
+    (let* ((bol (line-beginning-position))
+           (eol (line-end-position))
+           (line (buffer-substring-no-properties bol eol)))
+      (delete-region bol (min (1+ eol) (point-max)))
+      (insert (pyim-cstring-split-to-string line) "\n")))
+  (goto-char (point-min))
+  (message "分词完成!"))
+
+
+;; * Footer
+(provide 'pyim-cstring)
+
+;;; pyim-cstring.el ends here
diff --git a/pyim.el b/pyim.el
index e10f74d..67af23f 100644
--- a/pyim.el
+++ b/pyim.el
@@ -1389,21 +1389,6 @@ code-prefix)。当RETURN-LIST 设置为 t 时,返回一个 code list。"
(substring (pyim-hanzi2xingma:wubi (nth (1- len)
string)) 0 1))))))
code)))
-(defun pyim-cstring-at-point (&optional number)
- "获取光标一个中文字符串,字符数量为:NUMBER."
- (save-excursion
- (let* ((point (point))
- (begin (- point number))
- (begin (if (> begin 0)
- begin
- (point-min)))
- (string (buffer-substring-no-properties
- point begin)))
- (when (and (stringp string)
- (= (length string) number)
- (not (pyim-string-match-p "\\CC" string)))
- string))))
-
(defun pyim-create-word-at-point (&optional number silent)
"将光标前字符数为 NUMBER 的中文字符串添加到个人词库中
当 SILENT 设置为 t 是,不显示提醒信息。"
@@ -3456,159 +3441,6 @@ alist 列表。"
(- current-pos str-beginning-pos)
(- str-end-pos current-pos)))))))
-(defun pyim-cstring-split-to-list (chinese-string &optional max-word-length
delete-dups prefer-short-word)
- "一个基于 pyim 的中文分词函数。这个函数可以将中文字符
-串 CHINESE-STRING 分词,得到一个词条 alist,这个 alist 的元素
-都是列表,其中第一个元素为分词得到的词条,第二个元素为词条相对于
-字符串中的起始位置,第三个元素为结束位置。分词时,默认词条不超过
-6个字符,用户可以通过 MAX-WORD-LENGTH 来自定义,但值得注意的是:
-这个值设置越大,分词速度越慢。
-
-如果 DELETE-DUPS 设置为 non-nil, 一个中文字符串只保留一种分割方式。
-比如:
-
- 我爱北京天安门 => 我爱 北京 天安门
-
-如果 PREFER-SHORT-WORD 为 non-nil, 去重的时候则优先保留较短的词。
-
-注意事项:
-1. 这个工具使用暴力匹配模式来分词,*不能检测出* pyim 词库中不存在
- 的中文词条。
-2. 这个函数的分词速度比较慢,仅仅适用于中文短句的分词,不适用于
- 文章分词。根据评估,20个汉字组成的字符串需要大约0.3s, 40个
- 汉字消耗1s,随着字符串长度的增大消耗的时间呈几何倍数增加。"
- ;; (("天安" 5 7)
- ;; 我爱北京天安门 -> ("天安门" 5 8)
- ;; ("北京" 3 5)
- ;; ("我爱" 1 3))
- (cl-labels
- ((get-possible-words-internal
- ;; 内部函数,功能类似:
- ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab")
- (my-list number)
- (cond
- ((< (length my-list) 2) nil)
- (t (append
- (let* ((str (mapconcat #'identity my-list ""))
- (length (length str)))
- (when (<= length (or max-word-length 6))
- (list (list str number (+ number length)))))
- (get-possible-words-internal
- (reverse (cdr (reverse my-list))) number)))))
- (get-possible-words
- ;; 内部函数,功能类似:
- ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab" "bcd" "bc" "cd")
- (my-list number)
- (cond
- ((null my-list) nil)
- (t (append (get-possible-words-internal my-list number)
- (get-possible-words (cdr my-list) (1+ number)))))))
-
- ;; 如果 pyim 词库没有加载,加载 pyim 词库,
- ;; 确保 `pyim-dcache-get' 可以正常运行。
- (pyim-dcache-init-variables)
-
- (let ((string-alist
- (get-possible-words
- (mapcar #'char-to-string
- (string-to-vector chinese-string))
- 1))
- result)
- (dolist (string-list string-alist)
- (let ((pinyin-list (pyim-hanzi2pinyin (car string-list) nil "-" t)))
- (dolist (pinyin pinyin-list)
- (let ((words (pyim-dcache-get pinyin '(code2word)))) ; 忽略个人词库可以提高速度
- (dolist (word words)
- (when (equal word (car string-list))
- (push string-list result)))))))
-
- (if delete-dups
- (cl-delete-duplicates
- ;; 判断两个词条在字符串中的位置
- ;; 是否冲突,如果冲突,仅保留一个,
- ;; 删除其它。
- result
- :test #'(lambda (x1 x2)
- (let ((begin1 (nth 1 x1))
- (begin2 (nth 1 x2))
- (end1 (nth 2 x1))
- (end2 (nth 2 x2)))
- (not (or (<= end1 begin2)
- (<= end2 begin1)))))
- :from-end prefer-short-word)
- result))))
-
-;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
-;; (benchmark 1 '(pyim-cstring-split-to-list str)))
-
-;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
-;; (pyim-cstring-split-to-list str))
-
-(defun pyim-cstring-split-to-string (string &optional prefer-short-word
- separator max-word-length)
- "将中文字符串 STRING 分词.
-
-在分词的位置插入空格或者自定义分隔符 SEPERATERS,默认情况下较长的
-词条优先使用,如果 PREFER-SHORT-WORD 设置为 t,则优先使用较短的
-词条。默认最长词条不超过6个字符,用户可以通 MAX-WORD-LENGTH 来
-自定义词条的最大长度,但值得注意的是,这个值设置越大,分词速度越
-慢。"
- (let ((string-list
- (if (pyim-string-match-p "\\CC" string)
- (split-string
- (replace-regexp-in-string
- "\\(\\CC+\\)" "@@@@\\1@@@@" string)
- "@@@@")
- (list string))))
- (mapconcat
- #'(lambda (str)
- (when (> (length str) 0)
- (if (not (pyim-string-match-p "\\CC" str))
- (pyim-cstring-split-to-string-1
- str prefer-short-word separator max-word-length)
- (concat " " str " "))))
- string-list "")))
-
-(defun pyim-cstring-split-to-string-1 (chinese-string &optional
prefer-short-word
- separator
max-word-length)
- "`pyim-cstring-split-to-string' 内部函数。"
- (let ((str-length (length chinese-string))
- (word-list (pyim-cstring-split-to-list
- chinese-string max-word-length t prefer-short-word))
- position-list result)
-
- ;; 提取词条相对于字符串的位置信息。
- (dolist (word word-list)
- (push (nth 1 word) position-list)
- (push (nth 2 word) position-list))
-
- ;; 将位置信息由小到大排序。
- (setq position-list
- (cl-delete-duplicates (sort position-list #'<)))
-
- ;; 在分词的位置插入空格或者用户指定的分隔符。
- (dotimes (i str-length)
- (when (member (1+ i) position-list)
- (push (or separator " ") result))
- (push (substring chinese-string i (1+ i)) result))
- (setq result (nreverse result))
- (mapconcat #'identity result "")))
-
-(defun pyim-cstring-split-buffer ()
- "将一个 buffer 中的中文文章,进行分词操作。"
- (interactive)
- (message "分词开始!")
- (goto-char (point-min))
- (while (not (eobp))
- (let ((string (buffer-substring-no-properties
- (line-beginning-position)
- (line-end-position))))
- (delete-region (line-beginning-position)
- (min (+ (line-end-position) 1) (point-max)))
- (insert (pyim-cstring-split-to-string string))
- (insert "\n")))
- (goto-char (point-min))
- (message "分词完成!"))
;; ** 汉字到拼音的转换工具
;;;###autoload
- [elpa] externals/pyim updated (c16ca2b -> aa8dbd9), ELPA Syncer, 2021/04/22
- [elpa] externals/pyim f744469 01/36: Move code to pyim-pymap.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 900071a 02/36: Add pyim-pinyin.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim c13c2a6 06/36: Move code to pyim-common.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 09fd273 07/36: Sort pyim-common.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 6dfe546 08/36: New pyim-punctuation.el file., ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 6cc63e8 09/36: New file: pyim-dict.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 8ae9ad9 10/36: New file pyim-cstring.el,
ELPA Syncer <=
- [elpa] externals/pyim 678240a 16/36: Add pyim-candidates.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim fa9a82a 17/36: pyim-posframe-* -> pyim-page-posframe-*, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 8023823 15/36: New file: pyim-page.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim cfd96c6 20/36: update pyim-candidates.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim eafaef3 22/36: update pyim-cstring.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim c9cd6bf 27/36: Add pyim-autoselector.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim f56d0a6 30/36: Add README.org, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim f26fecc 03/36: New file: pyim-cregexp.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim 819f9c6 04/36: Move pyim-permutate-list* to pyim-common.el, ELPA Syncer, 2021/04/22
- [elpa] externals/pyim bf3626c 05/36: * pyim-common.el (pyim-flatten-list): Move from pyim.el, ELPA Syncer, 2021/04/22