emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/pyim 8ae9ad9 10/36: New file pyim-cstring.el


From: ELPA Syncer
Subject: [elpa] externals/pyim 8ae9ad9 10/36: New file pyim-cstring.el
Date: Thu, 22 Apr 2021 22:57:16 -0400 (EDT)

branch: externals/pyim
commit 8ae9ad9c2ea6857b1d443f44d81d712562933689
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>

    New file pyim-cstring.el
    
        * pyim-cstring.el (pyim-cstring-at-point)
        (pyim-cstring-split-to-list, pyim-cstring-split-to-string)
        (pyim-cstring-split-to-string-1, pyim-cstring-split-buffer): Move from pyim.el.
---
 pyim-cstring.el | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 pyim.el         | 168 ---------------------------------------------
 2 files changed, 206 insertions(+), 168 deletions(-)

diff --git a/pyim-cstring.el b/pyim-cstring.el
new file mode 100644
index 0000000..db8c421
--- /dev/null
+++ b/pyim-cstring.el
@@ -0,0 +1,206 @@
+;;; pyim-cstring.el --- Chinese string tools for pyim.        -*- lexical-binding: t; -*-
+
+;; * Header
+;; Copyright (C) 2021 Free Software Foundation, Inc.
+
+;; Author: Feng Shu <tumashu@163.com>
+;; Maintainer: Feng Shu <tumashu@163.com>
+;; URL: https://github.com/tumashu/pyim
+;; Keywords: convenience, Chinese, pinyin, input-method
+;; Package-Requires: ((emacs "24.4") (async "1.6") (xr "1.13"))
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;;; Code:
+;; * 代码                                                           :code:
+(require 'cl-lib)
+
+(defun pyim-cstring-at-point (&optional number)
+  "Return the NUMBER characters before point if they are all Chinese.
+
+NUMBER is the length of the string to fetch.  Return nil when
+NUMBER is not a non-negative integer, when fewer than NUMBER
+characters are available before point in the accessible portion
+of the buffer, or when any of those characters is not a Chinese
+character (regexp category \"C\")."
+  ;; NUMBER is optional in the signature, so guard the arithmetic
+  ;; below against nil (and other non-integers) instead of signaling.
+  (when (natnump number)
+    (let* ((end (point))
+           ;; Clamp to `point-min' rather than 0 so that a narrowed
+           ;; buffer does not signal `args-out-of-range'.
+           (begin (max (point-min) (- end number)))
+           (string (buffer-substring-no-properties begin end)))
+      (when (and (= (length string) number)
+                 (not (pyim-string-match-p "\\CC" string)))
+        string))))
+
+(defun pyim-cstring-split-to-list (chinese-string &optional max-word-length delete-dups prefer-short-word)
+  "Segment CHINESE-STRING into words using the pyim dictionary.
+
+Return an alist whose elements are lists of the form (WORD START
+END): WORD is a word found in CHINESE-STRING, START is its
+1-based start position in the string and END is START plus the
+length of WORD.  Words longer than MAX-WORD-LENGTH (6 characters
+by default) are not considered; note that the larger this value
+is, the slower the segmentation gets.
+
+If DELETE-DUPS is non-nil, keep only one way of splitting the
+string, for example:
+
+  我爱北京天安门 => 我爱 北京 天安门
+
+If PREFER-SHORT-WORD is non-nil, shorter words are preferred when
+the overlapping entries are removed.
+
+Notes:
+1. Segmentation is done by brute-force matching, so words that
+   are *not* in the pyim dictionary cannot be detected.
+2. This function is slow: it is only suitable for short
+   sentences, not for whole articles.  By the author's estimate a
+   string of 20 Chinese characters takes about 0.3s and one of 40
+   characters about 1s; the time grows geometrically as the
+   string gets longer."
+  ;; Example of the returned value:
+  ;;
+  ;;                   (("天安" 5 7)
+  ;; 我爱北京天安门 ->  ("天安门" 5 8)
+  ;;                    ("北京" 3 5)
+  ;;                    ("我爱" 1 3))
+  (cl-labels
+      ((get-possible-words-internal
+        ;; Internal function: all prefixes of MY-LIST that are at
+        ;; least two elements long, longest first, e.g.
+        ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab")
+        ;; Each prefix is returned as (STRING NUMBER NUMBER+LENGTH);
+        ;; prefixes longer than the maximum word length are dropped.
+        (my-list number)
+        (cond
+         ((< (length my-list) 2) nil)
+         (t (append
+             (let* ((str (mapconcat #'identity my-list ""))
+                    (length (length str)))
+               (when (<= length (or max-word-length 6))
+                 (list (list str number (+ number length)))))
+             (get-possible-words-internal
+              (reverse (cdr (reverse my-list))) number)))))
+       (get-possible-words
+        ;; Internal function: the prefixes of every suffix of
+        ;; MY-LIST, e.g.
+        ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab" "bcd" "bc" "cd")
+        ;; NUMBER is the position of the first element of MY-LIST.
+        (my-list number)
+        (cond
+         ((null my-list) nil)
+         (t (append (get-possible-words-internal my-list number)
+                    (get-possible-words (cdr my-list) (1+ number)))))))
+
+    ;; Load the pyim dictionary if it has not been loaded yet, so
+    ;; that `pyim-dcache-get' works.
+    (pyim-dcache-init-variables)
+
+    (let ((string-alist
+           (get-possible-words
+            (mapcar #'char-to-string
+                    (string-to-vector chinese-string))
+            1))
+          result)
+      ;; Keep a candidate only when the dictionary lists it under one
+      ;; of its pinyin codes.  Candidates are pushed, so RESULT ends
+      ;; up in the reverse of the order they were generated in.
+      (dolist (string-list string-alist)
+        (let ((pinyin-list (pyim-hanzi2pinyin (car string-list) nil "-" t)))
+          (dolist (pinyin pinyin-list)
+            (let ((words (pyim-dcache-get pinyin '(code2word)))) ; ignoring the personal dictionary makes this faster
+              (dolist (word words)
+                (when (equal word (car string-list))
+                  (push string-list result)))))))
+
+      (if delete-dups
+          (cl-delete-duplicates
+           ;; Check whether the positions of two entries in the
+           ;; string conflict (their ranges overlap); if they do,
+           ;; keep only one of them and delete the others.  The order
+           ;; of RESULT decides which entry survives.
+           result
+           :test #'(lambda (x1 x2)
+                     (let ((begin1 (nth 1 x1))
+                           (begin2 (nth 1 x2))
+                           (end1 (nth 2 x1))
+                           (end2 (nth 2 x2)))
+                       (not (or (<= end1 begin2)
+                                (<= end2 begin1)))))
+           :from-end prefer-short-word)
+        result))))
+
+;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
+;;   (benchmark 1 '(pyim-cstring-split-to-list str)))
+
+;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
+;;   (pyim-cstring-split-to-list str))
+
+(defun pyim-cstring-split-to-string (string &optional prefer-short-word
+                                            separator max-word-length)
+  "Segment the Chinese text in STRING into words.
+
+Each run of Chinese characters is segmented by
+`pyim-cstring-split-to-string-1', which inserts SEPARATOR (a
+space by default) at word boundaries.  Each run of non-Chinese
+characters is kept as is and surrounded by one space on each
+side.
+
+Longer words win by default; a non-nil PREFER-SHORT-WORD prefers
+shorter words.  MAX-WORD-LENGTH is the longest word considered
+(6 characters by default); larger values make segmentation
+slower."
+  (let ((len (length string))
+        (start 0)
+        pieces)
+    ;; Walk STRING run by run instead of tagging runs with a marker
+    ;; string and splitting on it: a marker can collide with text
+    ;; that is already in STRING and silently drop it.
+    (while (< start len)
+      (let ((pos (string-match "\\CC+" string start)))
+        (if (eq pos start)
+            ;; A run of non-Chinese characters starts here.
+            (let ((end (match-end 0)))
+              (push (concat " " (substring string start end) " ") pieces)
+              (setq start end))
+          ;; A run of Chinese characters extends up to POS, or to the
+          ;; end of STRING when no non-Chinese character follows.
+          (let ((end (or pos len)))
+            (push (pyim-cstring-split-to-string-1
+                   (substring string start end)
+                   prefer-short-word separator max-word-length)
+                  pieces)
+            (setq start end)))))
+    (mapconcat #'identity (nreverse pieces) "")))
+
+(defun pyim-cstring-split-to-string-1 (chinese-string &optional prefer-short-word
+                                                      separator max-word-length)
+  "Internal helper of `pyim-cstring-split-to-string'.
+
+Segment CHINESE-STRING with `pyim-cstring-split-to-list' (with
+overlapping words removed) and return it with SEPARATOR, or a
+space when SEPARATOR is nil, inserted in front of every character
+whose 1-based position is the start or end of a word.
+PREFER-SHORT-WORD and MAX-WORD-LENGTH are handed on to
+`pyim-cstring-split-to-list'."
+  (let* ((words (pyim-cstring-split-to-list
+                 chinese-string max-word-length t prefer-short-word))
+         ;; Start and end positions of every word, sorted ascending
+         ;; with repeated positions removed.
+         (boundaries
+          (cl-delete-duplicates
+           (sort (apply #'append
+                        (mapcar (lambda (word)
+                                  (list (nth 1 word) (nth 2 word)))
+                                words))
+                 #'<)))
+         (delimiter (or separator " ")))
+    ;; Rebuild the string one character at a time, prefixing the
+    ;; delimiter wherever the character sits on a word boundary.
+    (mapconcat
+     (lambda (index)
+       (let ((char (substring chinese-string index (1+ index))))
+         (if (member (1+ index) boundaries)
+             (concat delimiter char)
+           char)))
+     (number-sequence 0 (1- (length chinese-string)))
+     "")))
+
+(defun pyim-cstring-split-buffer ()
+  "Segment the Chinese text of the current buffer, line by line.
+
+Every line is replaced by the result of passing it to
+`pyim-cstring-split-to-string', followed by a newline.  Point is
+left at the beginning of the buffer."
+  (interactive)
+  (message "分词开始!")
+  (goto-char (point-min))
+  (while (not (eobp))
+    (let* ((bol (line-beginning-position))
+           (eol (line-end-position))
+           (text (buffer-substring-no-properties bol eol)))
+      ;; Remove the line together with its newline, if it has one;
+      ;; the last line of the buffer may lack it.
+      (delete-region bol (min (1+ eol) (point-max)))
+      ;; Inserting leaves point at the start of the next line.
+      (insert (pyim-cstring-split-to-string text) "\n")))
+  (goto-char (point-min))
+  (message "分词完成!"))
+
+
+;; * Footer
+(provide 'pyim-cstring)
+
+;;; pyim-cstring.el ends here
diff --git a/pyim.el b/pyim.el
index e10f74d..67af23f 100644
--- a/pyim.el
+++ b/pyim.el
@@ -1389,21 +1389,6 @@ code-prefix)。当RETURN-LIST 设置为 t 时,返回一个 code list。"
                            (substring (pyim-hanzi2xingma:wubi (nth (1- len) 
string)) 0 1))))))
       code)))
 
-(defun pyim-cstring-at-point (&optional number)
-  "获取光标一个中文字符串,字符数量为:NUMBER."
-  (save-excursion
-    (let* ((point (point))
-           (begin (- point number))
-           (begin (if (> begin 0)
-                      begin
-                    (point-min)))
-           (string (buffer-substring-no-properties
-                    point begin)))
-      (when (and (stringp string)
-                 (= (length string) number)
-                 (not (pyim-string-match-p "\\CC" string)))
-        string))))
-
 (defun pyim-create-word-at-point (&optional number silent)
   "将光标前字符数为 NUMBER 的中文字符串添加到个人词库中
 当 SILENT 设置为 t 是,不显示提醒信息。"
@@ -3456,159 +3441,6 @@ alist 列表。"
                       (- current-pos str-beginning-pos)
                       (- str-end-pos current-pos)))))))
 
-(defun pyim-cstring-split-to-list (chinese-string &optional max-word-length 
delete-dups prefer-short-word)
-  "一个基于 pyim 的中文分词函数。这个函数可以将中文字符
-串 CHINESE-STRING 分词,得到一个词条 alist,这个 alist 的元素
-都是列表,其中第一个元素为分词得到的词条,第二个元素为词条相对于
-字符串中的起始位置,第三个元素为结束位置。分词时,默认词条不超过
-6个字符,用户可以通过 MAX-WORD-LENGTH 来自定义,但值得注意的是:
-这个值设置越大,分词速度越慢。
-
-如果 DELETE-DUPS 设置为 non-nil, 一个中文字符串只保留一种分割方式。
-比如:
-
-  我爱北京天安门 => 我爱 北京 天安门
-
-如果 PREFER-SHORT-WORD 为 non-nil, 去重的时候则优先保留较短的词。
-
-注意事项:
-1. 这个工具使用暴力匹配模式来分词,*不能检测出* pyim 词库中不存在
-   的中文词条。
-2. 这个函数的分词速度比较慢,仅仅适用于中文短句的分词,不适用于
-   文章分词。根据评估,20个汉字组成的字符串需要大约0.3s, 40个
-   汉字消耗1s,随着字符串长度的增大消耗的时间呈几何倍数增加。"
-  ;;                   (("天安" 5 7)
-  ;; 我爱北京天安门 ->  ("天安门" 5 8)
-  ;;                    ("北京" 3 5)
-  ;;                    ("我爱" 1 3))
-  (cl-labels
-      ((get-possible-words-internal
-        ;; 内部函数,功能类似:
-        ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab")
-        (my-list number)
-        (cond
-         ((< (length my-list) 2) nil)
-         (t (append
-             (let* ((str (mapconcat #'identity my-list ""))
-                    (length (length str)))
-               (when (<= length (or max-word-length 6))
-                 (list (list str number (+ number length)))))
-             (get-possible-words-internal
-              (reverse (cdr (reverse my-list))) number)))))
-       (get-possible-words
-        ;; 内部函数,功能类似:
-        ;; ("a" "b" "c" "d") -> ("abcd" "abc" "ab" "bcd" "bc" "cd")
-        (my-list number)
-        (cond
-         ((null my-list) nil)
-         (t (append (get-possible-words-internal my-list number)
-                    (get-possible-words (cdr my-list) (1+ number)))))))
-
-    ;; 如果 pyim 词库没有加载,加载 pyim 词库,
-    ;; 确保 `pyim-dcache-get' 可以正常运行。
-    (pyim-dcache-init-variables)
-
-    (let ((string-alist
-           (get-possible-words
-            (mapcar #'char-to-string
-                    (string-to-vector chinese-string))
-            1))
-          result)
-      (dolist (string-list string-alist)
-        (let ((pinyin-list (pyim-hanzi2pinyin (car string-list) nil "-" t)))
-          (dolist (pinyin pinyin-list)
-            (let ((words (pyim-dcache-get pinyin '(code2word)))) ; 忽略个人词库可以提高速度
-              (dolist (word words)
-                (when (equal word (car string-list))
-                  (push string-list result)))))))
-
-      (if delete-dups
-          (cl-delete-duplicates
-           ;;  判断两个词条在字符串中的位置
-           ;;  是否冲突,如果冲突,仅保留一个,
-           ;;  删除其它。
-           result
-           :test #'(lambda (x1 x2)
-                     (let ((begin1 (nth 1 x1))
-                           (begin2 (nth 1 x2))
-                           (end1 (nth 2 x1))
-                           (end2 (nth 2 x2)))
-                       (not (or (<= end1 begin2)
-                                (<= end2 begin1)))))
-           :from-end prefer-short-word)
-        result))))
-
-;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
-;;   (benchmark 1 '(pyim-cstring-split-to-list str)))
-
-;; (let ((str "医生随时都有可能被患者及其家属反咬一口"))
-;;   (pyim-cstring-split-to-list str))
-
-(defun pyim-cstring-split-to-string (string &optional prefer-short-word
-                                            separator max-word-length)
-  "将中文字符串 STRING 分词.
-
-在分词的位置插入空格或者自定义分隔符 SEPERATERS,默认情况下较长的
-词条优先使用,如果 PREFER-SHORT-WORD 设置为 t,则优先使用较短的
-词条。默认最长词条不超过6个字符,用户可以通 MAX-WORD-LENGTH 来
-自定义词条的最大长度,但值得注意的是,这个值设置越大,分词速度越
-慢。"
-  (let ((string-list
-         (if (pyim-string-match-p "\\CC" string)
-             (split-string
-              (replace-regexp-in-string
-               "\\(\\CC+\\)" "@@@@\\1@@@@" string)
-              "@@@@")
-           (list string))))
-    (mapconcat
-     #'(lambda (str)
-         (when (> (length str) 0)
-           (if (not (pyim-string-match-p "\\CC" str))
-               (pyim-cstring-split-to-string-1
-                str prefer-short-word separator max-word-length)
-             (concat " " str " "))))
-     string-list "")))
-
-(defun pyim-cstring-split-to-string-1 (chinese-string &optional 
prefer-short-word
-                                                      separator 
max-word-length)
-  "`pyim-cstring-split-to-string' 内部函数。"
-  (let ((str-length (length chinese-string))
-        (word-list (pyim-cstring-split-to-list
-                    chinese-string max-word-length t prefer-short-word))
-        position-list result)
-
-    ;; 提取词条相对于字符串的位置信息。
-    (dolist (word word-list)
-      (push (nth 1 word) position-list)
-      (push (nth 2 word) position-list))
-
-    ;; 将位置信息由小到大排序。
-    (setq position-list
-          (cl-delete-duplicates (sort position-list #'<)))
-
-    ;; 在分词的位置插入空格或者用户指定的分隔符。
-    (dotimes (i str-length)
-      (when (member (1+ i) position-list)
-        (push (or separator " ") result))
-      (push (substring chinese-string i (1+ i))  result))
-    (setq result (nreverse result))
-    (mapconcat #'identity result "")))
-
-(defun pyim-cstring-split-buffer ()
-  "将一个 buffer 中的中文文章,进行分词操作。"
-  (interactive)
-  (message "分词开始!")
-  (goto-char (point-min))
-  (while (not (eobp))
-    (let ((string (buffer-substring-no-properties
-                   (line-beginning-position)
-                   (line-end-position))))
-      (delete-region (line-beginning-position)
-                     (min (+ (line-end-position) 1) (point-max)))
-      (insert (pyim-cstring-split-to-string string))
-      (insert "\n")))
-  (goto-char (point-min))
-  (message "分词完成!"))
 
 ;; ** 汉字到拼音的转换工具
 ;;;###autoload



reply via email to

[Prev in Thread] Current Thread [Next in Thread]