- commit
- 39930b5dc497f10d174802bfc943c5c18cdb4c6f
- parent
- 3db712ebeea5d1322b6ef9d9470955a566710262
- Author
- Tobias Bengfort <tobias.bengfort@posteo.de>
- Date
- 2025-05-06 05:20
add gen_model.py
Diffstat
| A | gen_model.py | 37 | +++++++++++++++++++++++++++++++++++++ |
1 files changed, 37 insertions, 0 deletions
diff --git a/gen_model.py b/gen_model.py
@@ -0,0 +1,37 @@
-1 1 import argparse
-1 2 import json
-1 3
-1 4
def get_data(lang):
    """Load the n-gram profile for *lang* as normalized frequencies.

    Reads the JSON document at ``data/profiles/<lang>`` (resolved against the
    current working directory). Every count in its ``freq`` mapping is divided
    by the ``n_words`` entry at index ``len(ngram) - 1``.
    NOTE(review): ``n_words`` presumably holds one total per n-gram length --
    confirm against the profile generator.

    Returns a dict mapping each n-gram string to that quotient.

    Raises whatever ``open`` / ``json.load`` raise for a missing or malformed
    profile, ``KeyError`` if ``freq`` or ``n_words`` is absent, and
    ``IndexError`` if an n-gram is longer than ``n_words`` has entries.
    """
    # JSON is UTF-8 by specification. Decode it explicitly instead of relying
    # on the locale's preferred encoding, which breaks non-ASCII n-grams on
    # platforms whose default is not UTF-8.
    with open(f'data/profiles/{lang}', encoding='utf-8') as fh:
        raw = json.load(fh)

    totals = raw['n_words']
    return {k: v / totals[len(k) - 1] for k, v in raw['freq'].items()}
-1 9
-1 10
def get_model(lang1, lang2, n=8):
    """Build a model from the n-grams that best separate two languages.

    Loads both profiles with ``get_data``, scores every n-gram that occurs in
    either profile by the absolute difference of its two frequencies (an
    n-gram missing from a profile counts as 0), and keeps the ``n``
    highest-scoring ones.

    Args:
        lang1: Name of the first profile.
        lang2: Name of the second profile.
        n: Number of n-grams to keep. A value of 0 or less keeps none.

    Returns:
        A dict with two keys:

        * ``'ngrams'`` -- the selected n-grams, ordered by ascending score
          (the most discriminating n-gram comes last);
        * ``'freq'`` -- a dict mapping ``lang1`` and ``lang2`` to lists of
          their frequencies, aligned index-by-index with ``'ngrams'``.
    """
    data1 = get_data(lang1)
    data2 = get_data(lang2)

    def difference(ngram):
        return abs(data1.get(ngram, 0) - data2.get(ngram, 0))

    # Start from a lexicographically sorted list: iteration order of a set of
    # strings changes between interpreter runs (hash randomization), which
    # would make the outcome of ties in the score non-reproducible.
    ngrams = sorted(set(data1) | set(data2))

    # prioritize by biggest absolute difference (stable sort, so ties keep
    # their lexicographic order)
    ngrams.sort(key=difference)

    # ``ngrams[-0:]`` is the whole list, not an empty one, so a non-positive
    # ``n`` has to be handled explicitly.
    ngrams = ngrams[-n:] if n > 0 else []

    return {
        'ngrams': ngrams,
        'freq': {
            lang1: [data1.get(g, 0) for g in ngrams],
            lang2: [data2.get(g, 0) for g in ngrams],
        },
    }
-1 28
-1 29
if __name__ == '__main__':
    # Command line: exactly two profile names, plus an optional model size.
    cli = argparse.ArgumentParser()
    cli.add_argument('lang', nargs=2)
    cli.add_argument('-n', type=int, default=8)
    options = cli.parse_args()

    # Emit the resulting model as a single JSON document on stdout.
    first, second = options.lang
    print(json.dumps(get_model(first, second, n=options.n)))