Commit 798221f

hived off libraries into new gdtools project
1 parent 26e1887, commit 798221f

27 files changed: +595 -3177 lines changed

brown_gd_to_conll.py (+5 -6)
@@ -7,8 +7,8 @@
 import re
 import sys
 from collections import namedtuple
-from acainn import Lemmatizer
-from acainn import Features
+from gdtools.acainn import Lemmatizer
+from gdtools.acainn import Features
 from pyconll.unit import Conll
 
 Split = namedtuple("split", "form1 upos1 xpos1 form2 upos2 xpos2")
@@ -166,19 +166,18 @@ def parse_brown_token(brown_token):
         return subtokens[0], "__MW"  # special cases for multiword expressions like "ann an"
     return subtokens[0], subtokens[1].strip("*")
 
-def process_file(brown_file, filename):
+def process_file(brown_file, file_id):
     """Does the initial conversion to CoNLL-U format."""
     lemmatizer = Lemmatizer()
     result = []
-    file_id = filename.replace(".txt", "")
     subcorpus = re.findall("^[a-z]*", file_id)[0]
     if subcorpus in ["c", "p", "s"] or file_id in ["n06", "n07", "n08", "n09", "n10"]:
         genre = "oral"
     else:
         genre = "written"
     sent_id = 0
     for sentence in split_sentences(brown_file, genre):
-        conllu_tokens = [s for s in process_sentence(sentence, lemmatizer)]
+        conllu_tokens = process_sentence(sentence, lemmatizer)
         if len(conllu_tokens) > 0:
             result.append(f"# sent_id = {file_id}_{sent_id}")
             result.extend(conllu_tokens)
@@ -293,7 +292,7 @@ def add_text(corpus):
     for filename in files:
         if filename.startswith(sys.argv[2]):
             with open(os.path.join(sys.argv[1], filename)) as file:
-                lines = process_file(file, filename)
+                lines = process_file(file, filename.replace(".txt", ""))
 
                 c = Conll(lines)
                 with_text = add_text(c)

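For orientation, a minimal usage sketch of the refactored entry point after this commit, assuming brown_gd_to_conll is importable as a module (i.e. its file-processing loop sits behind a __main__ guard) and that a Brown corpus file such as corpus/n06.txt exists; the path is illustrative:

    # Minimal sketch, not the project's actual driver: the caller now strips
    # the ".txt" suffix and passes the bare file id to process_file, which
    # builds its Lemmatizer from gdtools.acainn internally.
    from brown_gd_to_conll import process_file
    from pyconll.unit import Conll

    with open("corpus/n06.txt") as brown_file:          # illustrative path
        conllu_lines = process_file(brown_file, "n06")  # file id without ".txt"

    corpus = Conll(conllu_lines)  # parsed as in the script's own main block
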
brown_gd_to_dot_ccg.py (+1 -1)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import pickle
 import sys
-from acainn import Lemmatizer, Retagger, Subcat, Typer
+from gdtools.acainn import Lemmatizer, Retagger, Subcat, Typer
 
 def tidy_word(string):
     """outputs string suitable for XMLification further down the pipeline"""
