-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrc_ud2.py
82 lines (78 loc) · 2.04 KB
/
src_ud2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Treebank code -> treebank name.  path() uses the code as the file-name
# prefix and the name as the suffix of the "UD_<name>" directory.
treebanks = {
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'ca': 'Catalan',
    'cs': 'Czech',
    'cs_cac': 'Czech-CAC',
    'cs_cltt': 'Czech-CLTT',
    'cu': 'Old_Church_Slavonic',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'en_lines': 'English-LinES',
    'en_partut': 'English-ParTUT',
    'es': 'Spanish',
    'es_ancora': 'Spanish-AnCora',
    'et': 'Estonian',
    'eu': 'Basque',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fi_ftb': 'Finnish-FTB',
    'fr': 'French',
    'fr_partut': 'French-ParTUT',
    'fr_sequoia': 'French-Sequoia',
    'ga': 'Irish',
    'gl': 'Galician',
    'gl_treegal': 'Galician-TreeGal',
    'got': 'Gothic',
    'grc': 'Ancient_Greek',
    'grc_proiel': 'Ancient_Greek-PROIEL',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'it': 'Italian',
    'it_partut': 'Italian-ParTUT',
    'ja': 'Japanese',
    'kk': 'Kazakh',
    'ko': 'Korean',
    'la': 'Latin',
    'la_ittb': 'Latin-ITTB',
    'la_proiel': 'Latin-PROIEL',
    'lv': 'Latvian',
    'nl': 'Dutch',
    'nl_lassysmall': 'Dutch-LassySmall',
    'no_bokmaal': 'Norwegian-Bokmaal',
    'no_nynorsk': 'Norwegian-Nynorsk',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'pt_br': 'Portuguese-BR',
    'ro': 'Romanian',
    'ru': 'Russian',
    'ru_syntagrus': 'Russian-SynTagRus',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'sl_sst': 'Slovenian-SST',
    'sv': 'Swedish',
    'sv_lines': 'Swedish-LinES',
    'tr': 'Turkish',
    'ug': 'Uyghur',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh': 'Chinese',
}
# Language code -> language name for a second, separate group of languages.
# NOTE(review): presumably the shared task's "surprise" languages -- confirm.
# These codes are not keys of `treebanks`, so path() raises KeyError for them.
surprise = {
    'bxr': 'Buryat',
    'kmr': 'Kurmanji',
    'sme': 'North_Sami',
    'hsb': 'Upper_Sorbian',
}
# Subset of the `treebanks` codes.
# NOTE(review): presumably the treebanks shipped without a dev split -- confirm.
no_dev = {
    'fr_partut',
    'ga',
    'gl_treegal',
    'kk',
    'la',
    'sl_sst',
    'ug',
    'uk',
}
# Subset of the `treebanks` codes.
# NOTE(review): presumably the treebanks without lemma annotation -- confirm.
no_lemma = {
    'en_lines',
    'id',
    'ko',
    'pt_br',
    'sv_lines',
    'ug',
}
def path(lang, ds='train', folder="/data/ud-treebanks-conll2017/"):
    """Return the path of the CoNLL-U file for one treebank.

    The result has the form ``<folder>UD_<name>/<lang>-ud-<ds>.conllu``,
    where ``<name>`` is ``treebanks[lang]``.  The path is only assembled as
    a string; nothing is checked on disk.

    Args:
        lang: treebank code; must be a key of ``treebanks``.
        ds: dataset label inserted into the file name (default ``'train'``).
        folder: directory that contains the ``UD_*`` directories.  A
            trailing ``/`` is appended when missing; an empty string is
            left alone and yields a relative path.

    Returns:
        str: the assembled path.

    Raises:
        KeyError: if ``lang`` is not a key of ``treebanks``.
    """
    # Formatting already stringified `folder`; do it up front so the
    # separator check below also works for non-str path-like arguments.
    folder = str(folder)
    # Without this, folder="/data/ud" would silently produce
    # "/data/udUD_Arabic/..." instead of "/data/ud/UD_Arabic/...".
    if folder and not folder.endswith("/"):
        folder += "/"
    return "{}UD_{}/{}-ud-{}.conllu".format(folder, treebanks[lang], lang, ds)