- commit
- 6d8c914ce3795f6e5d2fc26800b3514000c8e4dd
- parent
- c70e23e44137b076ce3db100ac66b940358e4bd8
- Author
- Tobias Bengfort <tobias.bengfort@posteo.de>
- Date
- 2025-05-19 11:36
further simplify
Diffstat
| M | README.md | 8 | +++----- |
| M | demo/demo.js | 23 | ++++------------------- |
| M | test.py | 9 | +++++---- |
3 files changed, 12 insertions, 28 deletions
diff --git a/README.md b/README.md
@@ -32,15 +32,13 @@ A model might look like this:
32 32 You can use the model like this:
33 33
34 34 ```py
35 -1 def dist(p, q):
36 -1 # 0 does not mean impossible, just very unlikely
37 -1 qq = [qi + 0.0000001 for qi in q]
38 -1 return 1 / math.prod(qi ** (pi / sum(p)) for pi, qi in zip(p, qq))
-1 35 def probability(p, q):
-1 36 return math.prod(qi ** pi for pi, qi in zip(p, q))
39 37
40 38 def classify(model, text):
41 39 n = len(text) + 1
42 40 freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]
43 -1 return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang]))
-1 41 return max(model['freq'], key=lambda lang: probability(freq, model['freq'][lang]))
44 42 ```
45 43
46 44 ## An even simpler classifier
diff --git a/demo/demo.js b/demo/demo.js
@@ -10,32 +10,17 @@ var count = (text, ngram) => {
10 10 return (text.match(new RegExp(ngram, 'g')) || []).length;
11 11 };
12 12
13 -1 var sum = a => a.reduce((s, v) => s + v, 0);
14 13 var prod = a => a.reduce((s, v) => s * v, 1);
-1 14 var max = (a, key) => a.reduce((m, v) => !m || key(v) > key(m) ? v : m, null);
15 15
16 -1 var dist = (p, q) => {
17 -1 if (p.length === 1) {
18 -1 return Math.abs(p[0] - q[0]);
19 -1 }
20 -1
21 -1 // 0 does not mean impossible, just very unlikely
22 -1 var qq = q.map(qi => qi + 0.0000001);
23 -1 return 1 / prod(p.map((pi, i) => Math.pow(qq[i], pi / sum(p))));
-1 16 var probability = (p, q) => {
-1 17 return prod(p.map((pi, i) => Math.pow(q[i], pi)));
24 18 };
25 19
26 20 var classify = text => {
27 21 var n = text.length + 1;
28 22 var freq = model.ngrams.map(g => count(text, g) / (n - g.length));
29 -1 var best = null;
30 -1 var bestDist = Infinity;
31 -1 for (const lang of Object.keys(model.freq)) {
32 -1 var d = dist(freq, model.freq[lang]);
33 -1 if (d < bestDist) {
34 -1 bestDist = d;
35 -1 best = lang;
36 -1 }
37 -1 }
38 -1 return best;
-1 23 return max(Object.keys(model.freq), lang => probability(freq, model.freq[lang]));
39 24 };
40 25
41 26 var textarea = document.querySelector('textarea');
diff --git a/test.py b/test.py
@@ -61,19 +61,20 @@ LANG_MAP = {
61 61 }
62 62
63 63
64 -1 def dist(p, q):
-1 64 def probability(p, q):
65 65 if len(p) == 1:
66 -1 return abs(p[0] - q[0])
-1 66 p = [p[0], 1 - p[0]]
-1 67 q = [q[0], 1 - q[0]]
67 68
68 69 # 0 does not mean impossible, just very unlikely
69 70 qq = [qi + 0.0000001 for qi in q]
70 -1 return 1 / math.prod(qi ** (pi / sum(p)) for pi, qi in zip(p, qq))
-1 71 return math.prod(qi ** pi for pi, qi in zip(p, qq))
71 72
72 73
73 74 def classify(model, text):
74 75 n = len(text) + 1
75 76 freq = [text.count(g) / (n - len(g)) for g in model['ngrams']]
76 -1 return min(model['freq'], key=lambda lang: dist(freq, model['freq'][lang]))
-1 77 return max(model['freq'], key=lambda lang: probability(freq, model['freq'][lang]))
77 78
78 79
79 80 def test(model):