tiny-lang-detect

Generate tiny models for language detection. Demo: https://p.ce9e.org/tiny-lang-detect/demo/
git clone https://git.ce9e.org/tiny-lang-detect.git

commit
6d8c914ce3795f6e5d2fc26800b3514000c8e4dd
parent
c70e23e44137b076ce3db100ac66b940358e4bd8
Author
Tobias Bengfort <tobias.bengfort@posteo.de>
Date
2025-05-19 11:36
further simplify

Diffstat

M README.md 8 +++-----
M demo/demo.js 23 ++++-------------------
M test.py 9 +++++----

3 files changed, 12 insertions, 28 deletions


diff --git a/README.md b/README.md

@@ -32,15 +32,13 @@ A model might look like this:
   32    32 You can use the model like this:
   33    33 
   34    34 ```py
   35    -1 def dist(p, q):
   36    -1     # 0 does not mean impossible, just very unlikely
   37    -1     qq = [qi + 0.0000001 for qi in q]
   38    -1     return 1 / math.prod(qi ** (pi / sum(p)) for pi, qi in zip(p, qq))
   -1    35 def probability(p, q):
   -1    36     return math.prod(qi ** pi for pi, qi in zip(p, q))
   39    37 
   40    38 def classify(model, text):
   41    39     n = len(text) + 1
   42    40     freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]
   43    -1     return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang]))
   -1    41     return max(model['freq'], key=lambda lang: probability(freq, model['freq'][lang]))
   44    42 ```
   45    43 
   46    44 ## An even simpler classifier

diff --git a/demo/demo.js b/demo/demo.js

@@ -10,32 +10,17 @@ var count = (text, ngram) => {
   10    10     return (text.match(new RegExp(ngram, 'g')) || []).length;
   11    11 };
   12    12 
   13    -1 var sum = a => a.reduce((s, v) => s + v, 0);
   14    13 var prod = a => a.reduce((s, v) => s * v, 1);
   -1    14 var max = (a, key) => a.reduce((m, v) => !m || key(v) > key(m) ? v : m, null);
   15    15 
   16    -1 var dist = (p, q) => {
   17    -1     if (p.length === 1) {
   18    -1         return Math.abs(p[0] - q[0]);
   19    -1     }
   20    -1 
   21    -1     // 0 does not mean impossible, just very unlikely
   22    -1     var qq = q.map(qi => qi + 0.0000001);
   23    -1     return 1 / prod(p.map((pi, i) => Math.pow(qq[i], pi / sum(p))));
   -1    16 var probability = (p, q) => {
   -1    17     return prod(p.map((pi, i) => Math.pow(q[i], pi)));
   24    18 };
   25    19 
   26    20 var classify = text => {
   27    21     var n = text.length + 1;
   28    22     var freq = model.ngrams.map(g => count(text, g) / (n - g.length));
   29    -1     var best = null;
   30    -1     var bestDist = Infinity;
   31    -1     for (const lang of Object.keys(model.freq)) {
   32    -1         var d = dist(freq, model.freq[lang]);
   33    -1         if (d < bestDist) {
   34    -1             bestDist = d;
   35    -1             best = lang;
   36    -1         }
   37    -1     }
   38    -1     return best;
   -1    23     return max(Object.keys(model.freq), lang => probability(freq, model.freq[lang]));
   39    24 };
   40    25 
   41    26 var textarea = document.querySelector('textarea');

diff --git a/test.py b/test.py

@@ -61,19 +61,20 @@ LANG_MAP = {
   61    61 }
   62    62 
   63    63 
   64    -1 def dist(p, q):
   -1    64 def probability(p, q):
   65    65     if len(p) == 1:
   66    -1         return abs(p[0] - q[0])
   -1    66         p = [p[0], 1 - p[0]]
   -1    67         q = [q[0], 1 - q[0]]
   67    68 
   68    69     # 0 does not mean impossible, just very unlikely
   69    70     qq = [qi + 0.0000001 for qi in q]
   70    -1     return 1 / math.prod(qi ** (pi / sum(p)) for pi, qi in zip(p, qq))
   -1    71     return math.prod(qi ** pi for pi, qi in zip(p, qq))
   71    72 
   72    73 
   73    74 def classify(model, text):
   74    75     n = len(text) + 1
   75    76     freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]
   76    -1     return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang]))
   -1    77     return max(model['freq'], key=lambda lang: probability(freq, model['freq'][lang]))
   77    78 
   78    79 
   79    80 def test(model):