- commit
- 8184ad526ec1080060f790a1fa99b0533fdb1b8a
- parent
- 2f92e8aef5503af2ab121000b74fe1c1b3bf160a
- Author
- Tobias Bengfort <tobias.bengfort@posteo.de>
- Date
- 2025-05-14 05:10
switch from euclidean distance to Kullback–Leibler divergence This has better results and is also more in line with the Bayesian approach used by langdetect.
Diffstat
| M | README.md | 15 | ++++++++------- |
| M | demo/demo.js | 20 | +++++++++++++++++--- |
| M | test.py | 16 | +++++++++++++--- |
3 files changed, 38 insertions, 13 deletions
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ Example usage: 14 14 $ ./download_data.sh 15 15 $ python gen_model.py en de -n 10 > en_de.json 16 16 $ python test.py en_de.json17 -1 963 out of 1000 samples were detected correctly (96.3%)-1 17 984 out of 1000 samples were detected correctly (98.4%) 18 18 ``` 19 19 20 20 A model might look like this: @@ -32,14 +32,16 @@ A model might look like this: 32 32 You can use the model like this: 33 33 34 34 ```py35 -1 def dist(a, b):36 -1 return sum((av - bv) ** 2 for av, bv in zip(a, b))37 -1-1 35 def dist(p, q): -1 36 # https://en.wikipedia.org/wiki/Kullback-Leibler_divergence -1 37 pp = [pi + 0.0000001 for pi in p] -1 38 qq = [qi + 0.0000001 for qi in q] -1 39 return sum(pi * math.log(pi / qi) for pi, qi in zip(pp, qq)) / sum(pp) 38 40 39 41 def classify(model, text): 40 42 n = len(text) + 1 41 43 freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]42 -1 return min(model['freq'], key=lambda lang: dist(model['freq'][lang], freq))-1 44 return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang])) 43 45 ``` 44 46 45 47 ## An even simpler classifier @@ -65,8 +67,7 @@ punctuation, URLs, or Latin characters in non-Latin texts. Then it uses 65 67 Bayesian methods to find the most likely language for those frequencies. 66 68 67 69 The examples in this repo are much simpler though. They do not do any68 -1 pre-processing, and they use the euclidean distance to find the best match.69 -1 This is ultimately a trade-off between accuracy and simplicity.-1 70 pre-processing. This is ultimately a trade-off between accuracy and simplicity. 70 71 71 72 To simplify the model, `gen_model.py` filters out all but the most significant 72 73 n-grams. N-grams are considered more significant if their frequencies have a
diff --git a/demo/demo.js b/demo/demo.js
@@ -10,8 +10,22 @@ var count = (text, ngram) => {
10 10 return (text.match(new RegExp(ngram, 'g')) || []).length;
11 11 };
12 12
13 -1 var dist = (a, b) => {
14 -1 return a.reduce((sum, v, i) => sum + Math.pow(v - b[i], 2), 0);
-1 13 var sum = a => {
-1 14 return a.reduce((s, v) => s + v, 0);
-1 15 };
-1 16
-1 17 var dist = (p, q) => {
-1 18 // KL divergence breaks down for a single value
-1 19 if (p.length === 1) {
-1 20 return Math.abs(p[0] - q[0]);
-1 21 }
-1 22
-1 23 // 0 does not mean impossible, just very unlikely
-1 24 var pp = p.map(pi => pi + 0.0000001);
-1 25 var qq = q.map(qi => qi + 0.0000001);
-1 26
-1 27 // https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
-1 28 return sum(pp.map((pi, i) => pi * Math.log(pi / qq[i]))) / sum(pp);
15 29 };
16 30
17 31 var classify = text => {
@@ -20,7 +34,7 @@ var classify = text => {
20 34 var best = null;
21 35 var bestDist = Infinity;
22 36 for (const lang of Object.keys(model.freq)) {
23 -1 var d = dist(model.freq[lang], freq);
-1 37 var d = dist(freq, model.freq[lang]);
24 38 if (d < bestDist) {
25 39 bestDist = d;
26 40 best = lang;
diff --git a/test.py b/test.py
@@ -1,5 +1,6 @@ 1 1 import argparse 2 2 import json -1 3 import math 3 4 4 5 LANG_MAP = { 5 6 'afr': 'af', @@ -60,14 +61,23 @@ LANG_MAP = { 60 61 } 61 62 62 6363 -1 def dist(a, b):64 -1 return sum((av - bv) ** 2 for av, bv in zip(a, b))-1 64 def dist(p, q): -1 65 # KL divergence breaks down for a single value -1 66 if len(p) == 1: -1 67 return abs(p[0] - q[0]) -1 68 -1 69 # 0 does not mean impossible, just very unlikely -1 70 pp = [pi + 0.0000001 for pi in p] -1 71 qq = [qi + 0.0000001 for qi in q] -1 72 -1 73 # https://en.wikipedia.org/wiki/Kullback-Leibler_divergence -1 74 return sum(pi * math.log(pi / qi) for pi, qi in zip(pp, qq)) / sum(pp) 65 75 66 76 67 77 def classify(model, text): 68 78 n = len(text) + 1 69 79 freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]70 -1 return min(model['freq'], key=lambda lang: dist(model['freq'][lang], freq))-1 80 return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang])) 71 81 72 82 73 83 def test(model):