-
Notifications
You must be signed in to change notification settings - Fork 0
/
dump_speaker_data.py
87 lines (74 loc) · 2.79 KB
/
dump_speaker_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- py-shell-name: "python2" -*-
from __future__ import print_function
import itertools
import logging
import pickle
import sys
from os.path import dirname, abspath, basename, splitext
from os.path import join as pjoin
from util.talkbank_parser import MorParser
from util.tagset_rewrite_rules import cha_to_larc as rewrite_rules
# Directory containing this script; the __main__ section writes its pickle
# output beneath it.
here = dirname(abspath(__file__))
# Send log records to 'translations.log' (a relative path, so it is resolved
# against the current working directory), one "[LEVEL] message" per line.
logging.basicConfig(filename='translations.log', format='[%(levelname)s] %(message)s')
def rule_size(rule):
    """Return how many individual constraints a rewrite rule carries.

    ``rule`` is a ``(constraints, rewrite)`` pair where ``constraints`` maps
    a feature name to either a single string or an iterable of items. A
    string counts as one constraint; an iterable counts once per item.
    """
    conditions, _rewrite = rule
    return sum(
        1 if isinstance(spec, str) else sum(1 for _item in spec)
        for spec in conditions.values()
    )
class BoomException(Exception):
    """Script-specific exception type.

    It adds nothing to ``Exception`` and is not raised anywhere in the
    visible part of this file.
    """
def _pattern_matches(token, pattern):
    """Return True when every constraint in ``pattern`` holds for ``token``."""
    for attrib, value in pattern.items():
        if isinstance(value, str):
            # A string constraint must equal the token's attribute exactly.
            if token[attrib] != value:
                return False
        elif any(part not in token[attrib] for part in value):
            # An iterable constraint requires each part to be contained in
            # the token's attribute.
            return False
    return True


def rewriter(mortoken):
    """Translate a token's part of speech using the tagset rewrite rules.

    Rules are tried from most to least specific (largest ``rule_size``
    first); the first rule whose constraints all hold supplies the tag. A
    rule's pattern may constrain only a subset of the token's attributes.

    mortoken: object providing ``_asdict()``, ``word`` and ``pos``
        (presumably a MorToken namedtuple -- confirm against
        util.talkbank_parser).

    Returns a ``(word, tag)`` tuple. ``tag`` is the rewrite of the first
    matching rule, or ``mortoken.pos`` unchanged when no rule matches, in
    which case a warning is logged.
    """
    token = mortoken._asdict()
    # sorted() is stable, so rules of equal size keep their original order.
    rules = sorted(rewrite_rules, key=rule_size, reverse=True)
    for pattern, rewrite in rules:
        if _pattern_matches(token, pattern):
            return mortoken.word, rewrite
    # Let logging do the formatting so it only happens if the record is
    # actually emitted.
    logging.warning("failed to translate %r", mortoken)
    return mortoken.word, mortoken.pos
def extract_feature_utterances(filenames, feature, speaker=None, cutoff=0):
    """Collect one feature per token for each selected utterance.

    filenames: iterable of paths handed one by one to ``MorParser.parse``.
    feature: ``"pos"`` to run each token through ``rewriter`` or ``"word"``
        to take the token's ``word`` attribute.
    speaker: keep only utterances whose first element equals this value;
        ``None`` keeps every speaker.
    cutoff: minimum number of tokens an utterance needs to be kept.

    Returns a list with one inner list of extracted values per kept
    utterance.

    Raises ValueError if ``feature`` is neither ``"pos"`` nor ``"word"``.
    """
    # Validate up front so a bad feature name fails with a clear message
    # before any file is parsed.
    if feature == "pos":
        extract = rewriter
    elif feature == "word":
        def extract(token):
            return token.word
    else:
        raise ValueError(
            "unknown feature {0!r}; expected 'pos' or 'word'".format(feature))

    parser = MorParser("{http://www.talkbank.org/ns/talkbank}")
    # from_iterable calls parse() for a file only once the previous file has
    # been consumed, instead of calling it for every file up front.
    corpus = itertools.chain.from_iterable(
        parser.parse(fn) for fn in filenames)
    # Each corpus item is indexed as u[0] (speaker) and u[1] (tokens).
    return [[extract(w) for w in u[1]]
            for u in corpus
            if (speaker is None or u[0] == speaker) and len(u[1]) >= cutoff]
if __name__ == "__main__":
    # Usage: <script> SPEAKER FEATURE CUTOFF FILE [FILE ...]
    cli_args = sys.argv[1:]
    # Three positional values plus at least one input file are required.
    if len(cli_args) < 4:
        raise Exception("Four+ args required: speaker, feature (pos | word), "
                        "cutoff and xml files")
    speaker, feature, cutoff = cli_args[:3]
    filenames = cli_args[3:]

    utterances = extract_feature_utterances(filenames, feature,
                                            speaker, int(cutoff))
    for line in utterances:
        print(line)

    # The output goes to <script dir>/utterances/<speaker>-<feature>.pk;
    # open() does not create the 'utterances' directory.
    outfn = pjoin(here, 'utterances', "%s-%s.pk" % (speaker, feature))
    print("pickling utterances")
    # Pickle output is binary data: 'wb' is required on Python 3 and avoids
    # newline translation on Windows under Python 2.
    with open(outfn, 'wb') as fh:
        pickle.dump(utterances, fh)