- commit
- 315458d0a09b6e790689e7e56ccb0428eca4a4c2
- parent
- 281071942a7647e36686aae2deb2511b2079b637
- Author
- Tobias Bengfort <tobias.bengfort@posteo.de>
- Date
- 2025-05-10 18:29
gen_model: allow to pass more than two languages
Diffstat
| M | gen_model.py | 22 | ++++++++++++++-------- |
1 files changed, 14 insertions, 8 deletions
diff --git a/gen_model.py b/gen_model.py
@@ -8,28 +8,34 @@ def get_data(lang):
     return {k: v / raw['n_words'][len(k) - 1] for k, v in raw['freq'].items()}
 
 
-def get_model(lang1, lang2, n=8, ndigits=None):
-    data1 = get_data(lang1)
-    data2 = get_data(lang2)
+def abs_diff(arr):
+    return max(arr) - min(arr)
 
-    ngrams = list(set(data1.keys()) | set(data2.keys()))
+
+def get_model(*langs, n=8, ndigits=None):
+    data = {lang: get_data(lang) for lang in langs}
+
+    ngrams = set()
+    for d in data.values():
+        ngrams.update(d.keys())
+    ngrams = list(ngrams)
 
     # prioritize by biggest absolute difference
-    ngrams.sort(key=lambda k: abs(data1.get(k, 0) - data2.get(k, 0)))
+    ngrams.sort(key=lambda k: abs_diff([d.get(k, 0) for d in data.values()]))
     ngrams = ngrams[-n:]
 
     return {
         'ngrams': ngrams,
         'freq': {
-            lang1: [round(data1.get(g, 0), ndigits) for g in ngrams],
-            lang2: [round(data2.get(g, 0), ndigits) for g in ngrams],
+            lang: [round(d.get(g, 0), ndigits) for g in ngrams]
+            for lang, d in data.items()
         },
     }
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('lang', nargs=2)
+    parser.add_argument('lang', nargs='+')
     parser.add_argument('-n', type=int, default=8)
     parser.add_argument('-p', type=int, default=4)
     args = parser.parse_args()