|
7 | 7 | import re
|
8 | 8 | import sys
|
9 | 9 | from collections import namedtuple
|
10 |
| -from acainn import Lemmatizer |
11 |
| -from acainn import Features |
| 10 | +from gdtools.acainn import Lemmatizer |
| 11 | +from gdtools.acainn import Features |
12 | 12 | from pyconll.unit import Conll
|
13 | 13 |
|
14 | 14 | Split = namedtuple("split", "form1 upos1 xpos1 form2 upos2 xpos2")
|
@@ -166,19 +166,18 @@ def parse_brown_token(brown_token):
|
166 | 166 | return subtokens[0], "__MW" # special cases for multiword expressions like "ann an"
|
167 | 167 | return subtokens[0], subtokens[1].strip("*")
|
168 | 168 |
|
169 |
| -def process_file(brown_file, filename): |
| 169 | +def process_file(brown_file, file_id): |
170 | 170 | """Does the initial conversion to CoNLL-U format."""
|
171 | 171 | lemmatizer = Lemmatizer()
|
172 | 172 | result = []
|
173 |
| - file_id = filename.replace(".txt", "") |
174 | 173 | subcorpus = re.findall("^[a-z]*", file_id)[0]
|
175 | 174 | if subcorpus in ["c", "p", "s"] or file_id in ["n06", "n07", "n08", "n09", "n10"]:
|
176 | 175 | genre = "oral"
|
177 | 176 | else:
|
178 | 177 | genre = "written"
|
179 | 178 | sent_id = 0
|
180 | 179 | for sentence in split_sentences(brown_file, genre):
|
181 |
| - conllu_tokens = [s for s in process_sentence(sentence, lemmatizer)] |
| 180 | + conllu_tokens = process_sentence(sentence, lemmatizer) |
182 | 181 | if len(conllu_tokens) > 0:
|
183 | 182 | result.append(f"# sent_id = {file_id}_{sent_id}")
|
184 | 183 | result.extend(conllu_tokens)
|
@@ -293,7 +292,7 @@ def add_text(corpus):
|
293 | 292 | for filename in files:
|
294 | 293 | if filename.startswith(sys.argv[2]):
|
295 | 294 | with open(os.path.join(sys.argv[1], filename)) as file:
|
296 |
| - lines = process_file(file, filename) |
| 295 | + lines = process_file(file, filename.replace(".txt", "")) |
297 | 296 |
|
298 | 297 | c = Conll(lines)
|
299 | 298 | with_text = add_text(c)
|
|
0 commit comments