-
Notifications
You must be signed in to change notification settings - Fork 0
/
untemplate.py
184 lines (153 loc) · 5.05 KB
/
untemplate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Copyright 2010 The Untemplate Authors. All Rights Reserved.
"""Reverse engineers templates."""
def getData(*docs):
""" Takes docs and returns relevant data for each docs """
# split docs into list of words
split_docs = []
split_docs = map(lambda(doc): doc.split(), docs)
# make template
template = makeTemplate(split_docs)
#return template # uncomment to just get template for debugging
# return list of lists of data
data = map(lambda(doc): getDataFromDoc(doc, template), split_docs)
return data
def compareLists(a, b):
"""Computes the edit distance and list from the two lists.
Uses the Levenshtein edit distance algorithm, with code modified from
http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Python
Returns:
The edit distance, the edit list.
"""
source_len, target_len = len(a), len(b)
edit_lists = [[]]
distance = [[]]
for i in range(target_len+1):
edit_lists[0].append(['I'] * i)
distance[0].append(i)
for j in range(1, source_len+1):
edit_lists.append([['D'] * j])
distance.append([j])
for i in range(source_len):
for j in range(target_len):
cost = 1
if a[i] == b[j]:
cost = 0
deletion = distance[i][j+1] + 1
insertion = distance[i+1][j] + 1
substitution = distance[i][j] + cost
if deletion <= insertion and deletion <= substitution:
# Deletion is best.
best = deletion
edit_list = list(edit_lists[i][j+1])
edit_list.append('D')
elif insertion <= substitution:
# Insertion is best.
best = insertion
edit_list = list(edit_lists[i+1][j])
edit_list.append('I')
edit_lists[i+1].append(edit_list)
else:
# Substitution is best.
best = substitution
edit_list = list(edit_lists[i][j])
if cost:
edit_list.append('S')
else:
edit_list.append('=')
edit_lists[i+1].append(edit_list)
distance[i+1].append(best)
return distance[source_len][target_len], edit_lists[source_len][target_len]
def makeTemplateFromPair(doc1, doc2):
"""
Returns a list template given a pair of documents
Notice that a template of the form returned by this
function can also be in input for generalization
"""
# use the longest one as the source for consistency
if len(doc1) >= len(doc2):
longer = doc1
shorter = doc2
else:
longer = doc2
shorter = doc1
score, edit_list = compareLists(longer, shorter)
# convert edit_list to template representation
template = []
for i in range(len(longer)):
if edit_list[i] == '=':
template.append(longer[i])
else:
template.append('*')
return template
def makeTemplate(docs):
"""
Makes the most general template given a collection of documents
@ param docs - a list of documents
"""
# handle trivial cases
if len(docs) == 0:
return None
if len(docs) == 1:
return docs[0]
# apply pairwise templating
template = makeTemplateFromPair(docs[0], docs[1])
for i in range(1, len(docs)):
template = makeTemplateFromPair(template, docs[i])
return formatTemplate(template)
def formatTemplate(raw_template):
"""
Get the template into the format we want for parsing.
Interfaces with getDataFromDoc().
Could replace with regex type thing eventually.
"""
clean_template = []
# Compress any runs of multiple *s into single *
last = raw_template[0]
clean_template.append(last)
for i in range(1, len(raw_template)):
if last == '*' and raw_template[i] == '*':
continue
else:
last = raw_template[i]
clean_template.append(last)
return clean_template
def getDataFromDoc(doc, template):
"""
Applies template to doc to pull out content.
Must interface with formatTemplate().
Could eventually replace this with some regex thing.
Use compareLists and state machine type implementation.
"""
states = {'no_content' : 0, 'writing_content' : 1}
state = states['no_content']
matches = []
# get edit_list which should only have '=' and runs of 'S'('I')* or 'D's
score, edit_list = compareLists(template, doc)
#print 'template: ', template
#print 'edit_list: ', edit_list
#print 'doc: ', doc
assert(len(edit_list) >= len(doc))
current_matches = []
# loop over edit_list to to find matches
for i in range(len(edit_list)):
if state == states['no_content']:
# check whether to enter writing state
if edit_list[i] == '=':
continue
elif edit_list[i] == 'D':
matches.append([]) # signify empty frame
elif edit_list[i] == 'S':
state = states['writing_content']
current_matches = [doc[i]]
else: #state == states['writing_content']
# check whether to stop writing content
if edit_list[i] == '=':
state = states['no_content']
matches.append(current_matches)
current_matches = []
else:
current_matches.append(doc[i])
# add anything that was in writing state when loop ended
if len(current_matches) > 0:
matches.append(current_matches)
return matches