tiny-lang-detect: Generate tiny models for language detection

commit: c70e23e44137b076ce3db100ac66b940358e4bd8
parent: 8184ad526ec1080060f790a1fa99b0533fdb1b8a
Author: Tobias Bengfort <tobias.bengfort@posteo.de>
Date: 2025-05-19 09:17

simplify distance

`prod (pi/qi)**pi` is how much more likely it is that observation p was
created by model p than by model q. Applying log() turns this into the
KL-divergence.

`prod qi**pi` is the probability that observation p was created by model
q.

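`prod (pi/qi)**pi * prod qi**pi = prod pi**pi` is the same for each q,
so ranking the models by the likelihood ratio gives the same order as
ranking them by `1 / prod qi**pi`. The distance can therefore drop the
`pi` terms from the ratio.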

Diffstat

M	README.md	5	++---
M	demo/demo.js	11	+++--------
M	test.py	6	+-----

3 files changed, 6 insertions, 16 deletions

diff --git a/README.md b/README.md

@@ -33,10 +33,9 @@ You can use the model like this:
   33    33 
   34    34 ```py
   35    35 def dist(p, q):
   36    -1     # https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
   37    -1     pp = [pi + 0.0000001 for pi in p]
   -1    36     # 0 does not mean impossible, just very unlikely
   38    37     qq = [qi + 0.0000001 for qi in q]
   39    -1     return sum(pi * math.log(pi / qi) for pi, qi in zip(pp, qq)) / sum(pp)
   -1    38     return 1 / math.prod(qi ** (pi / sum(p)) for pi, qi in zip(p, qq))
   40    39 
   41    40 def classify(model, text):
   42    41     n = len(text) + 1

diff --git a/demo/demo.js b/demo/demo.js

@@ -10,22 +10,17 @@ var count = (text, ngram) => {
   10    10     return (text.match(new RegExp(ngram, 'g')) || []).length;
   11    11 };
   12    12 
   13    -1 var sum = a => {
   14    -1     return a.reduce((s, v) => s + v, 0);
   15    -1 };
   -1    13 var sum = a => a.reduce((s, v) => s + v, 0);
   -1    14 var prod = a => a.reduce((s, v) => s * v, 1);
   16    15 
   17    16 var dist = (p, q) => {
   18    -1     // KL divergence breaks down for a single value
   19    17     if (p.length === 1) {
   20    18         return Math.abs(p[0] - q[0]);
   21    19     }
   22    20 
   23    21     // 0 does not mean impossible, just very unlikely
   24    -1     var pp = p.map(pi => pi + 0.0000001);
   25    22     var qq = q.map(qi => qi + 0.0000001);
   26    -1 
   27    -1     // https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
   28    -1     return sum(pp.map((pi, i) => pi * Math.log(pi / qq[i]))) / sum(pp);
   -1    23     return 1 / prod(p.map((pi, i) => Math.pow(qq[i], pi / sum(p))));
   29    24 };
   30    25 
   31    26 var classify = text => {

diff --git a/test.py b/test.py

@@ -62,16 +62,12 @@ LANG_MAP = {
   62    62 
   63    63 
   64    64 def dist(p, q):
   65    -1     # KL divergence breaks down for a single value
   66    65     if len(p) == 1:
   67    66         return abs(p[0] - q[0])
   68    67 
   69    68     # 0 does not mean impossible, just very unlikely
   70    -1     pp = [pi + 0.0000001 for pi in p]
   71    69     qq = [qi + 0.0000001 for qi in q]
   72    -1 
   73    -1     # https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
   74    -1     return sum(pi * math.log(pi / qi) for pi, qi in zip(pp, qq)) / sum(pp)
   -1    70     return 1 / math.prod(qi ** (pi / sum(p)) for pi, qi in zip(p, qq))
   75    71 
   76    72 
   77    73 def classify(model, text):