[Top][All Lists]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept.

From: Stefan Monnier
Subject: [elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept.
Date: Sat, 23 Feb 2019 10:34:31 -0500 (EST)

branch: externals/guess-language
commit 30e23aaaac776393e65a944131316fa87c9c90ad
Author: Titus von der Malsburg <address@hidden>
Commit: Titus von der Malsburg <address@hidden>

    Added initial proof of concept.
 guess-language.el |  92 +++++++++++++++++
 trigrams/de       | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 trigrams/en       | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 692 insertions(+)

diff --git a/guess-language.el b/guess-language.el
new file mode 100644
index 0000000..304a375
--- /dev/null
+++ b/guess-language.el
@@ -0,0 +1,92 @@
+;;; guess-language.el --- Automatically detect human language
+;; Author: Titus von der Malsburg <address@hidden>
+;; Maintainer: Titus von der Malsburg <address@hidden>
+;; Version: 2.0.0
+;; Package-Requires: ((cl-lib "0.5"))
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+;;; Commentary:
+;; Just a proof of concept at this time.  Only supports English and
+;; German but can easily be extended to handle other languages.
+;; The detection algorithm is based on counts of character
+;; trigrams.  The trigrams are copied from guess_language.py
+;; (https://github.com/kent37/guess-language).
+;; `cl-loop' is used below, so make sure cl-lib is loaded.
+(require 'cl-lib)
+(defvar guess-language-languages '(en de)
+  "List of languages that are considered when guessing.
+Each element is a symbol naming a language.  The symbol's name is
+also the name of the trigram file that
+`guess-language-load-trigrams' reads for that language.")
+(defun guess-language-load-trigrams ()
+  "Load the trigram lists for all languages in `guess-language-languages'.
+Return an alist whose elements have the form (LANG . TRIGRAMS),
+where LANG is a language symbol and TRIGRAMS is the list of
+non-empty lines of that language's trigram file.
+Trigram files are named after the language symbol.  They are
+looked up in the \"trigrams\" directory next to this library; if
+no readable file is found there, the bare file name is used and
+is therefore resolved against `default-directory'."
+  (let* ((library (symbol-file 'guess-language-load-trigrams 'defun))
+         ;; `symbol-file' returns nil when this function was evaluated
+         ;; interactively instead of being loaded from a file.
+         (basedir (if library
+                      (file-name-directory library)
+                    default-directory))
+         (trigram-dir (expand-file-name "trigrams" basedir)))
+    (cl-loop
+     for lang in guess-language-languages
+     for name = (symbol-name lang)
+     for packaged = (expand-file-name name trigram-dir)
+     ;; Fall back to the plain name so that a trigram file in the
+     ;; current directory keeps working.
+     for file = (if (file-readable-p packaged) packaged name)
+     for trigrams = (with-temp-buffer
+                      (insert-file-contents file)
+                      (split-string (buffer-string) "\n" t))
+     collect (cons lang trigrams))))
+(defvar guess-language-regexps nil
+  "Alist of (LANG . REGEXP) pairs used by `guess-language'.
+REGEXP matches any trigram of the language LANG.  The value is
+set by `guess-language-compile-regexps'.")
+(defun guess-language-compile-regexps ()
+  "Build one regular expression per language from its trigrams.
+Store the resulting alist of (LANG . REGEXP) pairs in
+`guess-language-regexps' and return it.  Every trigram becomes
+its own group, and the groups are joined into one alternation."
+  (setq guess-language-regexps
+        (cl-loop
+         for (lang . trigrams) in (guess-language-load-trigrams)
+         ;; Trigrams are literal text, so quote characters that are
+         ;; special in regular expressions before joining them.
+         for alternatives = (mapconcat #'regexp-quote trigrams "\\)\\|\\(")
+         collect (cons lang (concat "\\(" alternatives "\\)")))))
+(defun guess-language (beginning end)
+  "Guess the language of the text between BEGINNING and END.
+Count, for every entry of `guess-language-regexps', how often its
+regexp matches in the region, and return the language symbol
+with the highest count.  On a tie the language listed first
+wins.  Return nil if there are no regexps.
+The regexps are compiled on first use if
+`guess-language-compile-regexps' has not been called yet."
+  (unless (bound-and-true-p guess-language-regexps)
+    (guess-language-compile-regexps))
+  (let ((tally (cl-loop
+                for (lang . regexp) in guess-language-regexps
+                collect (cons lang (how-many regexp beginning end))))
+        best)
+    ;; Debug output: show the match count of every language.
+    (print tally)
+    ;; Keep the first entry with the strictly largest count; this
+    ;; avoids a dependency on dash.el for picking the maximum.
+    (dolist (entry tally)
+      (when (or (null best) (> (cdr entry) (cdr best)))
+        (setq best entry)))
+    (car best)))
+(defun guess-language-buffer ()
+  "Guess the language of the accessible portion of the buffer.
+Print the guessed language symbol and return it."
+  (interactive)
+  (let ((start (point-min))
+        (stop (point-max)))
+    (print (guess-language start stop))))
+(defun guess-language-paragraph ()
+  "Guess the language of the paragraph around point.
+The paragraph is delimited by where `backward-paragraph' and
+`forward-paragraph' move to from point.  Print the guessed
+language symbol and return it."
+  (interactive)
+  (let* ((from (save-excursion
+                 (backward-paragraph)
+                 (point)))
+         (to (save-excursion
+               (forward-paragraph)
+               (point)))
+         (language (guess-language from to)))
+    (print language)))
+(defun guess-language-region ()
+  "Guess the language of the text in the region.
+Print the guessed language symbol and return it."
+  (interactive)
+  (let ((language (guess-language (region-beginning) (region-end))))
+    (print language)))
+(defun guess-language-autoset ()
+  "Detect the language of the current paragraph and adapt spell checking.
+For English and German, switch the ispell dictionary and, if the
+optional typo package is available, its language as well.  Then
+re-check the current paragraph with flyspell."
+  (interactive)
+  ;; Each setting is (ISPELL-DICTIONARY . TYPO-LANGUAGE); it stays nil
+  ;; when the guessed language is not handled here.
+  (let ((settings (pcase (guess-language-paragraph)
+                    ('en '("en" . "English"))
+                    ('de '("de" . "German")))))
+    (when settings
+      (ispell-change-dictionary (car settings))
+      ;; typo.el is a third-party package that this file does not
+      ;; require, so only call into it when it is actually present.
+      (when (fboundp 'typo-change-language)
+        (typo-change-language (cdr settings)))))
+  (flyspell-region (save-excursion (backward-paragraph) (point))
+                   (save-excursion (forward-paragraph) (point))))
+(provide 'guess-language)
+;; Local Variables:
+;; byte-compile-warnings: (not cl-functions obsolete)
+;; coding: utf-8
+;; indent-tabs-mode: nil
+;; End:
+;;; guess-language.el ends here
diff --git a/trigrams/de b/trigrams/de
new file mode 100644
index 0000000..ebbeca8
--- /dev/null
+++ b/trigrams/de
@@ -0,0 +1,300 @@
+ de
+ di
+ ei
+n d
+ be
+ zu
+ un
+ au
+ in
+ da
+ ve
+ ge
+ mi
+r d
+ vo
+e d
+ st
+t d
+ er
+n s
+ se
+e s
+e a
+ re
+ we
+ fü
+ so
+e e
+r s
+ ha
+ an
+ pa
+ sa
+ sp
+ wi
+ sc
+n a
+ ab
+ si
+ wa
+n e
+ im
+ la
+n w
+ ni
+e p
+g d
+ al
+ pr
+r e
+n m
+s s
+ tü
+e b
+n i
+s d
+ me
+n b
+s a
+ es
+ fo
+ gr
+ ja
+e n
+n u
+r v
+ am
+n p
+n v
+ is
+ üb
+e f
+e m
+r a
+t a
+t s
+ ko
+n z
+r f
+r w
+t i
+ br
+ np
+d d
+e g
+e k
+n r
+r b
+t w
+ fr
+ ih
+ ke
+ ma
+d s
+h d
+i d
+n f
+n l
+ ar
+ en
+ ka
diff --git a/trigrams/en b/trigrams/en
new file mode 100644
index 0000000..9d9139d
--- /dev/null
+++ b/trigrams/en
@@ -0,0 +1,300 @@
+ th
+ to
+ in
+ an
+ of
+ co
+ a 
+d t
+ he
+e t
+ re
+ sa
+ st
+ ha
+ ''
+ wh
+e s
+n t
+s a
+t t
+ be
+e a
+ wa
+ ma
+e i
+ fo
+ hi
+ mo
+ se
+ pr
+s t
+d a
+ wi
+e c
+ on
+r t
+ ca
+ no
+s o
+e o
+f t
+e w
+n a
+t h
+ we
+ it
+ di
+d h
+d s
+e m
+ so
+g t
+ ch
+ de
+ al
+t a
+ ab
+ te
+ wo
+s s
+t w
+e b
+e h
+t s
+y t
+e p
+s i
+ li
+ do
+s w
+ as
+ fr
+ tr
+ el
+ ne
+ su
+ ye
+d o
+o t
+y o
+ ho
+ me
+e e
+ at
+ bu
+ la
+d b
+s h
+t i
+ ar
+e f
+ is
+ pa
+ sh
+n s
+r a
+y a
+ un
+n c
+ mi
+d i
+e d
+e n
+t o
+ by
+e r
+ gr
+r s
+s f
+ ba
+ vo
+ or
+ po
+e l
+r o
+ bo
+ le
+s m
+ fa

reply via email to

[Prev in Thread] Current Thread [Next in Thread]