tiny-lang-detect

Generate tiny models for language detection. Demo: https://p.ce9e.org/tiny-lang-detect/demo/
git clone https://git.ce9e.org/tiny-lang-detect.git

commit
315458d0a09b6e790689e7e56ccb0428eca4a4c2
parent
281071942a7647e36686aae2deb2511b2079b637
Author
Tobias Bengfort <tobias.bengfort@posteo.de>
Date
2025-05-10 18:29
gen_model: allow passing more than two languages

Diffstat

M gen_model.py 22 ++++++++++++++--------

1 file changed, 14 insertions, 8 deletions


diff --git a/gen_model.py b/gen_model.py

@@ -8,28 +8,34 @@ def get_data(lang):
    8     8     return {k: v / raw['n_words'][len(k) - 1] for k, v in raw['freq'].items()}
    9     9 
   10    10 
   11    -1 def get_model(lang1, lang2, n=8, ndigits=None):
   12    -1     data1 = get_data(lang1)
   13    -1     data2 = get_data(lang2)
   -1    11 def abs_diff(arr):
   -1    12     return max(arr) - min(arr)
   14    13 
   15    -1     ngrams = list(set(data1.keys()) | set(data2.keys()))
   -1    14 
   -1    15 def get_model(*langs, n=8, ndigits=None):
   -1    16     data = {lang: get_data(lang) for lang in langs}
   -1    17 
   -1    18     ngrams = set()
   -1    19     for d in data.values():
   -1    20         ngrams.update(d.keys())
   -1    21     ngrams = list(ngrams)
   16    22 
   17    23     # prioritize by biggest absolute difference
   18    -1     ngrams.sort(key=lambda k: abs(data1.get(k, 0) - data2.get(k, 0)))
   -1    24     ngrams.sort(key=lambda k: abs_diff([d.get(k, 0) for d in data.values()]))
   19    25     ngrams = ngrams[-n:]
   20    26 
   21    27     return {
   22    28         'ngrams': ngrams,
   23    29         'freq': {
   24    -1             lang1: [round(data1.get(g, 0), ndigits) for g in ngrams],
   25    -1             lang2: [round(data2.get(g, 0), ndigits) for g in ngrams],
   -1    30             lang: [round(d.get(g, 0), ndigits) for g in ngrams]
   -1    31             for lang, d in data.items()
   26    32         },
   27    33     }
   28    34 
   29    35 
   30    36 if __name__ == '__main__':
   31    37     parser = argparse.ArgumentParser()
   32    -1     parser.add_argument('lang', nargs=2)
   -1    38     parser.add_argument('lang', nargs='+')
   33    39     parser.add_argument('-n', type=int, default=8)
   34    40     parser.add_argument('-p', type=int, default=4)
   35    41     args = parser.parse_args()