-
Notifications
You must be signed in to change notification settings - Fork 0
/
sort.py
31 lines (23 loc) · 850 Bytes
/
sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import csv
import html
import sys
import wordfreq
def summarize_frequencies(freqs):
    """Return ``(lowest, mean)`` of the word frequencies in *freqs*.

    An empty sequence yields ``(0.0, 0.0)`` rather than raising: a sentence
    that tokenizes to nothing (for example, punctuation only) would otherwise
    abort the whole run in ``min()``. Because the output is sorted in
    descending order, a zero score places such an entry at the low end.
    """
    if not freqs:
        return (0.0, 0.0)
    return (min(freqs), sum(freqs) / float(len(freqs)))


def load_pairs(pairs_path, target_lang):
    """Read tab-separated sentence pairs and score each by word frequency.

    Column 0 of every row is HTML-unescaped, tokenized with ``wordfreq`` for
    *target_lang*, and each token is looked up as a Zipf frequency in the
    'combined' wordlist. Column 1 is carried through untouched.

    Returns a dict mapping the original (still escaped) column-0 text to
    ``(min_freq, avg_freq, column_1)``. A later row with the same column-0
    text replaces the earlier one.
    """
    scored = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (including ones inside quoted fields) itself.
    with open(pairs_path, 'r', encoding='utf-8', newline='') as pairs_file:
        for row in csv.reader(pairs_file, delimiter='\t'):
            # csv.reader yields [] for blank lines, and a line without a tab
            # has a single column; neither forms a usable pair.
            if len(row) < 2:
                continue
            sentence, translation = row[0], row[1]
            tokens = wordfreq.tokenize(html.unescape(sentence), target_lang)
            freqs = [
                wordfreq.zipf_frequency(token, target_lang,
                                        wordlist='combined')
                for token in tokens
            ]
            min_freq, avg_freq = summarize_frequencies(freqs)
            scored[sentence] = (min_freq, avg_freq, translation)
    return scored


def main():
    """Command-line entry point: sort pairs so the most common words come first.

    Expects exactly two arguments, the target language code and the path to
    the tab-separated pairs file. Writes the sorted pairs to stdout as UTF-8,
    one ``source<TAB>translation`` line per pair.
    """
    if len(sys.argv) != 3:
        # stdout carries the sorted data, so the usage text goes to stderr.
        print('Usage: python3 sort.py target-lang pairs.csv', file=sys.stderr)
        sys.exit(1)

    target_lang = sys.argv[1]
    pairs_path = sys.argv[2]

    scored = load_pairs(pairs_path, target_lang)

    # Descending by (min frequency, mean frequency, translation text): the
    # sentences whose rarest word is most common are emitted first.
    ranked = sorted(scored.items(), key=lambda item: item[1], reverse=True)

    # Write bytes directly so the output is UTF-8 regardless of the
    # console/locale encoding.
    out = sys.stdout.buffer
    for sentence, (_min_freq, _avg_freq, translation) in ranked:
        out.write((sentence + '\t' + translation + '\n').encode('utf-8'))


if __name__ == '__main__':
    main()