tiny-lang-detect: Generate tiny models for language detection - switch from euclidean distance to Kullback

commit: 8184ad526ec1080060f790a1fa99b0533fdb1b8a
parent: 2f92e8aef5503af2ab121000b74fe1c1b3bf160a
Author: Tobias Bengfort <tobias.bengfort@posteo.de>
Date: 2025-05-14 05:10

switch from euclidean distance to Kullback–Leibler divergence

This has better results and is also more in line with the Bayesian
approach used by langdetect.

Diffstat

M	README.md	15	++++++++-------
M	demo/demo.js	20	+++++++++++++++++---
M	test.py	16	+++++++++++++---

3 files changed, 38 insertions, 13 deletions

diff --git a/README.md b/README.md

@@ -14,7 +14,7 @@ Example usage:
   14    14 $ ./download_data.sh
   15    15 $ python gen_model.py en de -n 10 > en_de.json
   16    16 $ python test.py en_de.json
   17    -1 963 out of 1000 samples were detected correctly (96.3%)
   -1    17 984 out of 1000 samples were detected correctly (98.4%)
   18    18 ```
   19    19 
   20    20 A model might look like this:
@@ -32,14 +32,16 @@ A model might look like this:
   32    32 You can use the model like this:
   33    33 
   34    34 ```py
   35    -1 def dist(a, b):
   36    -1     return sum((av - bv) ** 2 for av, bv in zip(a, b))
   37    -1 
   -1    35 def dist(p, q):
   -1    36     # https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
   -1    37     pp = [pi + 0.0000001 for pi in p]
   -1    38     qq = [qi + 0.0000001 for qi in q]
   -1    39     return sum(pi * math.log(pi / qi) for pi, qi in zip(pp, qq)) / sum(pp)
   38    40 
   39    41 def classify(model, text):
   40    42     n = len(text) + 1
   41    43     freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]
   42    -1     return min(model['freq'], key=lambda lang: dist(model['freq'][lang], freq))
   -1    44     return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang]))
   43    45 ```
   44    46 
   45    47 ## An even simpler classifier
@@ -65,8 +67,7 @@ punctuation, URLs, or Latin characters in non-Latin texts. Then it uses
   65    67 Bayesian methods to find the most likely language for those frequencies.
   66    68 
   67    69 The examples in this repo are much simpler though. They do not do any
   68    -1 pre-processing, and they use the euclidean distance to find the best match.
   69    -1 This is ultimately a trade-off between accuracy and simplicity.
   -1    70 pre-processing. This is ultimately a trade-off between accuracy and simplicity.
   70    71 
   71    72 To simplify the model, `gen_model.py` filters out all but the most significant
   72    73 n-grams. N-grams are considered more significant if their frequencies have a

diff --git a/demo/demo.js b/demo/demo.js

@@ -10,8 +10,22 @@ var count = (text, ngram) => {
   10    10     return (text.match(new RegExp(ngram, 'g')) || []).length;
   11    11 };
   12    12 
   13    -1 var dist = (a, b) => {
   14    -1     return a.reduce((sum, v, i) => sum + Math.pow(v - b[i], 2), 0);
   -1    13 var sum = a => {
   -1    14     return a.reduce((s, v) => s + v, 0);
   -1    15 };
   -1    16 
   -1    17 var dist = (p, q) => {
   -1    18     // KL divergence breaks down for a single value
   -1    19     if (p.length === 1) {
   -1    20         return Math.abs(p[0] - q[0]);
   -1    21     }
   -1    22 
   -1    23     // 0 does not mean impossible, just very unlikely
   -1    24     var pp = p.map(pi => pi + 0.0000001);
   -1    25     var qq = q.map(qi => qi + 0.0000001);
   -1    26 
   -1    27     // https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
   -1    28     return sum(pp.map((pi, i) => pi * Math.log(pi / qq[i]))) / sum(pp);
   15    29 };
   16    30 
   17    31 var classify = text => {
@@ -20,7 +34,7 @@ var classify = text => {
   20    34     var best = null;
   21    35     var bestDist = Infinity;
   22    36     for (const lang of Object.keys(model.freq)) {
   23    -1         var d = dist(model.freq[lang], freq);
   -1    37         var d = dist(freq, model.freq[lang]);
   24    38         if (d < bestDist) {
   25    39             bestDist = d;
   26    40             best = lang;

diff --git a/test.py b/test.py

@@ -1,5 +1,6 @@
    1     1 import argparse
    2     2 import json
   -1     3 import math
    3     4 
    4     5 LANG_MAP = {
    5     6     'afr': 'af',
@@ -60,14 +61,23 @@ LANG_MAP = {
   60    61 }
   61    62 
   62    63 
   63    -1 def dist(a, b):
   64    -1     return sum((av - bv) ** 2 for av, bv in zip(a, b))
   -1    64 def dist(p, q):
   -1    65     # KL divergence breaks down for a single value
   -1    66     if len(p) == 1:
   -1    67         return abs(p[0] - q[0])
   -1    68 
   -1    69     # 0 does not mean impossible, just very unlikely
   -1    70     pp = [pi + 0.0000001 for pi in p]
   -1    71     qq = [qi + 0.0000001 for qi in q]
   -1    72 
   -1    73     # https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
   -1    74     return sum(pi * math.log(pi / qi) for pi, qi in zip(pp, qq)) / sum(pp)
   65    75 
   66    76 
   67    77 def classify(model, text):
   68    78     n = len(text) + 1
   69    79     freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]
   70    -1     return min(model['freq'], key=lambda lang: dist(model['freq'][lang], freq))
   -1    80     return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang]))
   71    81 
   72    82 
   73    83 def test(model):