[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept.
From: |
Stefan Monnier |
Subject: |
[elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept. |
Date: |
Sat, 23 Feb 2019 10:34:31 -0500 (EST) |
branch: externals/guess-language
commit 30e23aaaac776393e65a944131316fa87c9c90ad
Author: Titus von der Malsburg <address@hidden>
Commit: Titus von der Malsburg <address@hidden>
Added initial proof of concept.
---
guess-language.el | 92 +++++++++++++++++
trigrams/de | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
trigrams/en | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 692 insertions(+)
diff --git a/guess-language.el b/guess-language.el
new file mode 100644
index 0000000..304a375
--- /dev/null
+++ b/guess-language.el
@@ -0,0 +1,92 @@
+;;; guess-language.el --- Automatically detect human language
+
+;; Author: Titus von der Malsburg <address@hidden>
+;; Maintainer: Titus von der Malsburg <address@hidden>
+;; Version: 2.0.0
+;; Package-Requires: ((cl-lib "0.5"))
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; Just a proof of concept at this time. Only supports English and
+;; German but can easily be extended to handle other languages.
+
+;; The detection algorithm is based on counts of character
+;; trigrams. The trigrams are copied from guess_language.py
+;; (https://github.com/kent37/guess-language).
+
+;;; Code:
+
+;; `cl-loop' is used below, so cl-lib has to be loaded.
+(require 'cl-lib)
+
+(defvar guess-language-languages '(en de)
+  "List of languages that can be detected.
+Each element is a symbol.  `guess-language-load-trigrams' reads, for
+every symbol in this list, the trigram file named after the symbol.")
+
+(defun guess-language-load-trigrams ()
+  "Load the trigram list of every language in `guess-language-languages'.
+
+Return an alist whose elements have the form (LANG . TRIGRAMS), where
+LANG is a symbol from `guess-language-languages' and TRIGRAMS is the
+list of non-empty lines of that language's trigram file.
+
+The file for LANG is named after the symbol and is looked up in the
+\"trigrams\" directory next to the guess-language library (or next to
+`default-directory' when the library cannot be located).  If no
+readable file is found there, the bare file name is used instead, so
+that it is resolved relative to `default-directory'."
+  (let* ((library (locate-library "guess-language"))
+         (trigrams-dir (expand-file-name
+                        "trigrams"
+                        (if library
+                            (file-name-directory library)
+                          default-directory))))
+    (cl-loop
+     for lang in guess-language-languages
+     for name = (symbol-name lang)
+     for packaged = (expand-file-name name trigrams-dir)
+     ;; Prefer the file shipped in trigrams/; otherwise fall back to a
+     ;; lookup by bare name relative to `default-directory'.
+     for file = (if (file-readable-p packaged) packaged name)
+     for trigrams = (with-temp-buffer
+                      (insert-file-contents file)
+                      (split-string (buffer-string) "\n" t))
+     collect (cons lang trigrams))))
+
+(defvar guess-language-regexps nil
+  "Alist of (LANG . REGEXP) pairs set by `guess-language-compile-regexps'.
+REGEXP matches any one of the trigrams loaded for the language LANG.")
+
+(defun guess-language-compile-regexps ()
+  "Build one regexp per language and store them in `guess-language-regexps'.
+
+The trigrams come from `guess-language-load-trigrams'.  For every
+language they are joined, in file order, into a single alternation.
+Each trigram is passed through `regexp-quote' so that it is matched
+literally, and the alternation is wrapped in one shy group because no
+match data is needed.
+
+Return the new value of `guess-language-regexps'."
+  (setq guess-language-regexps
+        (cl-loop
+         for (lang . trigrams) in (guess-language-load-trigrams)
+         collect (cons lang
+                       (concat "\\(?:"
+                               (mapconcat #'regexp-quote trigrams "\\|")
+                               "\\)")))))
+
+(defun guess-language (beginning end)
+  "Guess the language of the text between BEGINNING and END.
+
+For every (LANG . REGEXP) entry of `guess-language-regexps', count the
+matches of REGEXP between BEGINNING and END with `how-many'.  Return
+the LANG with the highest count; on a tie the entry that comes first
+in `guess-language-regexps' wins.  Return nil when there are no
+entries.
+
+If `guess-language-regexps' is unbound or nil,
+`guess-language-compile-regexps' is called first."
+  (unless (bound-and-true-p guess-language-regexps)
+    (guess-language-compile-regexps))
+  (let (best-lang best-count)
+    (dolist (entry guess-language-regexps)
+      (let ((count (how-many (cdr entry) beginning end)))
+        ;; Strict comparison: a later language must beat the current
+        ;; best, so the earliest entry wins ties.
+        (when (or (null best-count) (> count best-count))
+          (setq best-lang (car entry)
+                best-count count))))
+    best-lang))
+
+(defun guess-language-buffer ()
+  "Guess the language of the accessible portion of the current buffer.
+The span from `point-min' to `point-max' is passed to
+`guess-language'; the result is printed with `print' and returned."
+  (interactive)
+  (let* ((start (point-min))
+         (stop (point-max))
+         (language (guess-language start stop)))
+    (print language)))
+
+(defun guess-language-paragraph ()
+  "Guess the language of the paragraph around point.
+The paragraph boundaries are the positions reached from point by
+`backward-paragraph' and `forward-paragraph'; point itself is not
+moved.  The result of `guess-language' for that span is printed with
+`print' and returned."
+  (interactive)
+  (let* ((start (save-excursion
+                  (backward-paragraph)
+                  (point)))
+         (stop (save-excursion
+                 (forward-paragraph)
+                 (point)))
+         (language (guess-language start stop)))
+    (print language)))
+
+(defun guess-language-region ()
+  "Guess the language of the text in the region.
+The span from `region-beginning' to `region-end' is passed to
+`guess-language'; the result is printed with `print' and returned."
+  (interactive)
+  (let ((start (region-beginning))
+        (stop (region-end)))
+    (print (guess-language start stop))))
+
+(defun guess-language-autoset ()
+  "Detect the language of the current paragraph and adapt spell checking.
+
+The language is determined with `guess-language-paragraph'.  For the
+results `en' and `de' the dictionary is switched with
+`ispell-change-dictionary' and, if the function `typo-change-language'
+is defined, that function is called with the matching language name.
+Any other result changes neither.  Finally the paragraph around point
+is checked with `flyspell-region'."
+  (interactive)
+  ;; SETTINGS is (DICTIONARY . TYPO-LANGUAGE), or nil for any language
+  ;; that has no entry here.
+  (let ((settings (pcase (guess-language-paragraph)
+                    ('en '("en" . "English"))
+                    ('de '("de" . "German")))))
+    (when settings
+      (ispell-change-dictionary (car settings))
+      ;; `typo-change-language' is not defined in this file and nothing
+      ;; here loads it, so only call it when it is available.
+      (when (fboundp 'typo-change-language)
+        (typo-change-language (cdr settings)))))
+  (flyspell-region (save-excursion (backward-paragraph) (point))
+                   (save-excursion (forward-paragraph) (point))))
+
+;; Announce the `guess-language' feature to `require'.
+(provide 'guess-language)
+
+;; Local Variables:
+;; byte-compile-warnings: (not cl-functions obsolete)
+;; coding: utf-8
+;; indent-tabs-mode: nil
+;; End:
+
+;;; guess-language.el ends here
diff --git a/trigrams/de b/trigrams/de
new file mode 100644
index 0000000..ebbeca8
--- /dev/null
+++ b/trigrams/de
@@ -0,0 +1,300 @@
+en
+er
+ de
+der
+ie
+ di
+die
+sch
+ein
+che
+ich
+den
+in
+te
+ch
+ ei
+ung
+n d
+nd
+ be
+ver
+es
+ zu
+eit
+gen
+und
+ un
+ au
+ in
+cht
+it
+ten
+ da
+ent
+ ve
+and
+ ge
+ine
+ mi
+r d
+hen
+ng
+nde
+ vo
+e d
+ber
+men
+ei
+mit
+ st
+ter
+ren
+t d
+ er
+ere
+n s
+ste
+ se
+e s
+ht
+des
+ist
+ne
+auf
+e a
+isc
+on
+rte
+ re
+ we
+ges
+uch
+ fü
+ so
+bei
+e e
+nen
+r s
+ach
+für
+ier
+par
+ür
+ ha
+as
+ert
+ an
+ pa
+ sa
+ sp
+ wi
+for
+tag
+zu
+das
+rei
+he
+hre
+nte
+sen
+vor
+ sc
+ech
+etz
+hei
+lan
+n a
+pd
+st
+sta
+ese
+lic
+ ab
+ si
+gte
+ wa
+iti
+kei
+n e
+nge
+sei
+tra
+zen
+ im
+ la
+art
+im
+lle
+n w
+rde
+rec
+set
+str
+tei
+tte
+ ni
+e p
+ehe
+ers
+g d
+nic
+von
+ al
+ pr
+an
+aus
+erf
+r e
+tze
+tür
+uf
+ag
+als
+ar
+chs
+end
+ge
+ige
+ion
+ls
+n m
+ngs
+nis
+nt
+ord
+s s
+sse
+ tü
+ahl
+e b
+ede
+em
+len
+n i
+orm
+pro
+rke
+run
+s d
+wah
+wer
+ürk
+ me
+age
+att
+ell
+est
+hat
+n b
+oll
+raf
+s a
+tsc
+ es
+ fo
+ gr
+ ja
+abe
+auc
+ben
+e n
+ege
+lie
+n u
+r v
+re
+rit
+sag
+ am
+agt
+ahr
+bra
+de
+erd
+her
+ite
+le
+n p
+n v
+or
+rbe
+rt
+sic
+wie
+übe
+ is
+ üb
+cha
+chi
+e f
+e m
+eri
+ied
+mme
+ner
+r a
+sti
+t a
+t s
+tis
+ ko
+arb
+ds
+gan
+n z
+r f
+r w
+ran
+se
+t i
+wei
+wir
+ br
+ np
+am
+bes
+d d
+deu
+e g
+e k
+efo
+et
+eut
+fen
+hse
+lte
+n r
+npd
+r b
+rhe
+t w
+tz
+ fr
+ ih
+ ke
+ ma
+ame
+ang
+d s
+eil
+el
+era
+erh
+h d
+i d
+kan
+n f
+n l
+nts
+och
+rag
+rd
+spd
+spr
+tio
+ ar
+ en
+ ka
+ark
+ass
diff --git a/trigrams/en b/trigrams/en
new file mode 100644
index 0000000..9d9139d
--- /dev/null
+++ b/trigrams/en
@@ -0,0 +1,300 @@
+ th
+the
+he
+ed
+ to
+ in
+er
+ing
+ng
+ an
+nd
+ of
+and
+to
+of
+ co
+at
+on
+in
+ a
+d t
+ he
+e t
+ion
+es
+ re
+re
+hat
+ sa
+ st
+ ha
+her
+tha
+tio
+or
+ ''
+en
+ wh
+e s
+ent
+n t
+s a
+as
+for
+is
+t t
+ be
+ld
+e a
+rs
+ wa
+ut
+ve
+ll
+al
+ ma
+e i
+ fo
+'s
+an
+est
+ hi
+ mo
+ se
+ pr
+s t
+ate
+st
+ter
+ere
+ted
+nt
+ver
+d a
+ wi
+se
+e c
+ect
+ns
+ on
+ly
+tol
+ey
+r t
+ ca
+ati
+ts
+all
+ no
+his
+s o
+ers
+con
+e o
+ear
+f t
+e w
+was
+ons
+sta
+''
+sti
+n a
+sto
+t h
+ we
+id
+th
+ it
+ce
+ di
+ave
+d h
+cou
+pro
+ad
+oll
+ry
+d s
+e m
+ so
+ill
+cti
+te
+tor
+eve
+g t
+it
+ ch
+ de
+hav
+oul
+ty
+uld
+use
+ al
+are
+ch
+me
+out
+ove
+wit
+ys
+chi
+t a
+ith
+oth
+ ab
+ te
+ wo
+s s
+res
+t w
+tin
+e b
+e h
+nce
+t s
+y t
+e p
+ele
+hin
+s i
+nte
+ li
+le
+ do
+aid
+hey
+ne
+s w
+ as
+ fr
+ tr
+end
+sai
+ el
+ ne
+ su
+'t
+ay
+hou
+ive
+lec
+n't
+ ye
+but
+d o
+o t
+y o
+ ho
+ me
+be
+cal
+e e
+had
+ple
+ at
+ bu
+ la
+d b
+s h
+say
+t i
+ ar
+e f
+ght
+hil
+igh
+int
+not
+ren
+ is
+ pa
+ sh
+ays
+com
+n s
+r a
+rin
+y a
+ un
+n c
+om
+thi
+ mi
+by
+d i
+e d
+e n
+t o
+ by
+e r
+eri
+old
+ome
+whe
+yea
+ gr
+ar
+ity
+mpl
+oun
+one
+ow
+r s
+s f
+tat
+ ba
+ vo
+bou
+sam
+tim
+vot
+abo
+ant
+ds
+ial
+ine
+man
+men
+ or
+ po
+amp
+can
+der
+e l
+les
+ny
+ot
+rec
+tes
+tho
+ica
+ild
+ir
+nde
+ose
+ous
+pre
+ste
+era
+per
+r o
+red
+rie
+ bo
+ le
+ali
+ars
+ore
+ric
+s m
+str
+ fa
+ess
+ie
+ist
+lat
+uri
- [elpa] branch externals/guess-language created (now 1107b93), Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 43eb329 004/101: Fixed version number., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 2792ac4 003/101: Guesses the language only if flyspell is unhappy with what we, Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language e4c103c 009/101: Compile regexps after load., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language e1db84e 006/101: Make this into a minor mode., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 5261d2f 010/101: Make defcustoms., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language e6ffe8d 011/101: Added some documentation., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 6ee7f77 005/101: Only guess when paragraph has some minimal length., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept.,
Stefan Monnier <=
- [elpa] externals/guess-language e13f5a9 013/101: (Re)compile regexps when necessary., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 24213cb 032/101: Added some data for testing., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 9b1048f 026/101: More idiomatic way to do local hooks., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 38d7ab1 017/101: README: minor fixes, Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 9117deb 027/101: Made it easier to add new languages., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 47ff6ad 022/101: Fix package, Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language c543465 023/101: Merge pull request #1 from syohex/fix-package, Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language a2c048c 016/101: Updated README., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language b0b1438 028/101: Made some functions non-interactive., Stefan Monnier, 2019/02/23
- [elpa] externals/guess-language 8daec47 019/101: Depend on emacs 24., Stefan Monnier, 2019/02/23