-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathslashnulls
executable file
·353 lines (297 loc) · 16.3 KB
/
slashnulls
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!/usr/local/bin/perl
# Author: Jason Eisner, University of Pennsylvania
# Usage: slashnulls [-u] [files ...]
#
# Munges parses produced by headall, and for which tags are NOT
# canonicalized. All empty categories become simply (-NONE- 0) if
# unrealized, or (EXP It) or (EXP it) if realized as an
# expletive. A slash category is given to the parent, e.g., (NP
# (-NONE- *T*)) becomes (NP/NP (-NONE- 0)) and (NP (-NONE- *T*-5))
# becomes (NP/NP-5 (-NONE 0)). If this slash category is indexed, as
# in /NP-5, then it is propagated up through its ancestors until it
# cancels with a real NP-5 (or perhaps WHNP-5 or S-5 or something) on
# its sibling. If it is not indexed, than it is not propagated up --
# meaning that its parent is considered to be complete even though
# this piece is missing -- with one exception: (S (NP-SBJ/NP-SBJ 0)
# ...) passes the /NP-SBJ up one level, to get (S/NP-SBJ-i
# (NP-SBJ/NP-SBJ-i 0) ...), for a new index i, since in fact
# argument-dropping in this case is licensed not by the subject's
# parent (the verb) but by its grandparent (the verb's governor).
# Notice that -i isn't matched in this case -- really its governor
# should be indexed with -i, in parallel to WH words, which likewise
# license gaps and are so indexed.
#
# We also have a concept of "knobs," which are the opposite of gaps,
# and represent constituents that have merely been string-moved in
# some way (e.g., extraposed or perhaps scrambled) without taking on
# new theta roles. See the file SLASH-AND-PLUS for details. An
# indexed category such as NP-5 that does not cancel against an /NP-5
# in its sibling is marked as +NP-5, and propagated up through
# ancestors until it cancels with a /NP-5 at one of those siblings.
# Note that + categories of this sort should cancel only against /
# categories that represent *ICH* or *EXP* nulls; this is not currently
# checked. !!! Where knobs are introduced, e.g., +NP-5, they do not
# take theta roles from their parents. The moreknobs script can
# be used to mark additional such constituents as knobs, using +.
#
# I make some effort to handle slash direction appropriately. If the
# empty category falls to the left of its parent's head kid, I use \.
# if it's to the right, I use /. There is one other possibility: that
# the empty category is its parent's head kid. (See killnulls -h for
# a discussion of these cases, or rather the subcases for which the empty
# category does have overt siblings.) In this case, we default to /, but if
# such an empty category is coindexed, we discard the sentence, on the
# grounds that it's almost certainly a case of ellipsis -- note that
# we generally discard gapped constructions, as noted below.
#
# Note that movement chains may have more than two positions.
# For example (leaving out some levels for readability):
#
# (SBAR (WHNP-1 which)
# (S\NP-1 (NP-2\NP-1 0) (VP\NP-2 struggled (S\NP-2 (NP\NP-2 0) (VP to live)))))
#
# It is not reasonable to run killnulls on such output, because
# then nothing would connect \NP-2 to WHNP-1. This is one reason
# one might use the -u flag, which arranges for indices in the same
# chain to be unified, giving
#
# (SBAR (WHNP-3 which)
# (S\NP-3 (NP-3\NP-3 0) (VP\NP-3 struggled (S\NP-3 (NP\NP-3 0) (VP to live)))))
#
# It is then possible to run killnulls on the result, yielding
#
# (SBAR (WHNP-3 which)
# (S\NP-3 (VP\NP-3 struggled (S\NP-3 (VP to live)))))
#
# Of course, if killnulls is run, this result doesn't show that object 3
# fills three roles -- without a stopping place in the subject
# position of the S, it misleadingly looks like S is just passing the
# gap along.
#
# Note in cases of conjunction or parasitic gaps, or PRO control by an
# already-moved element, two empty categories may have the same index
# and the same binder. We are careful not to pass the same slashed
# category twice in such cases, i.e., any constituent looking for two
# copies of NP-5 would really be satisfied by just one copy.
#
# The null elements we deal with do include *RNR* (right node raising)
# and *ICH* (site of extraposition leading to discontinuous
# constituents), as well as *EXP* (sister to expletives, which we merge
# with the expletive).
#
# But we simply delete the following null elements (which in practice
# should never be heads and should always have siblings -- which we check):
# *PPA* nulls, which give alternative possible attachment sites
# *U*, which represents absent units like "dollars"
#
# In addition, we discard sentences that include
# tags containing = (intrasentential gapping, e.g. in conjunction; even if we
# used discardconj, some gapping constructions may have
# survived, since implicit conjunction with commas is not
# always (not ever?) annotated with CONJP)
# FRAG tags (intersentential gapping)
# ?*? nulls (intrasentential gapping placeholder in certain environments)
# *NOT* ("anti-placeholder" for not-quite-parallel gapping)
# coindexed nulls in head position
# (usually denotes ellipsis: see discussion above)
#
# Special handling:
# (SBAR (-NONE- 0) ...) is preprocessed into (SBAR (IN (-NONE- 0)) ...),
# which is what it really means. Then
# (SBAR (IN (-NONE- 0)) (S (-NONE- *T*-n))) -- as in "It's raining, she said" --
# is preprocessed into
# (SBAR (-NONE- *T*-n))
# which is really what's going on, since you can't get "It's raining, she said that":
# we'd like to pick up that SBAR/S-n always rewrites as 0, i.e., we can't extract
# just the S-n part. The real point of doing this is to avoid failing on these
# cases just because a head is a coindexed null as described above.
require("stamp.inc"); &stamp; # modify $0 and @INC, and print timestamp
$uniq = 1, shift(@ARGV) if $ARGV[0] eq "-u";
die "$0: -u not implemented yet, aborting\n" if $uniq;
die "$0: bad command line flags" if @ARGV && $ARGV[0] =~ /^-./;
$token = "[^ \t\n()]+"; # anything but parens or whitespace can be a token
$restoftoken = "[^ \t\n()]*";
$ind = "-[0-9]+\\b"; # index on null or overt element
$tokennoind = "(?:(?!$ind)[^ \t\n()])+"; # part of token that does not include an index (basically, stops at first -[0-9])
$expletiveword = "(?:[Ii]t)"; # regexp matching expletive headwords
while (<>) { # for each sentence
chop;
s/^(\S+:[0-9]+:\t)?//, $location = $&;
unless (/^\#/) { # unless a comment
$sent++;
# throw out sentences involving gapping etc. -- currently have no good convention for these
if (/\(FRAG\b/) {
$discards++, $_ = "# DISCARDED by $0: contains FRAG nonterminal for intrasentential gapping";
} elsif (/\($token=[0-9]/o) {
$discards++, $_ = "# DISCARDED by $0: contains gapping $&)";
} elsif (/ \?\*\?/) {
$discards++, $_ = "# DISCARDED by $0: contains ?*? trace for intrasentential gapping";
} elsif (/ \*NOT\*/) {
$discards++, $_ = "# DISCARDED by $0: contains *NOT* placeholder for gapping";
} else {
# fix up various things about particular nulls
$nullcomps += s/(\(SBAR\b$restoftoken \@?)(\(-NONE- \@0\))/$1(IN \@$2)/og; # wrap (IN ...) around null complementizers
s/(\(SBAR\b$restoftoken) \@?\(IN \@\(-NONE- \@0\)\) \@?\(S\b$restoftoken( \@\(-NONE- $token\)\))\)/$1$2/og; # fix SBAR traces that are written as having separate null complementizer and S trace
die "$0:$location *PPA* serves as head, aborting\n" if s/ \@\($token \@\(-NONE- \@\*PPA\*$restoftoken\)\)//og;
$killnulls += s/ \($token \@\(-NONE- \@\*PPA\*$restoftoken\)\)//og; # remove *PPA*
die "$0:$location *U* serves as head, aborting\n" if s/ \@\(-NONE- \@\*U\*$restoftoken\)//og;
$killnulls += s/ \(-NONE- \@\*U\*$restoftoken\)//og; # remove *U*; this is different from *PPA* because it isn't promoted by any unary rule
s/\((NP\b$restoftoken) \@?\(NP \@\(PRP (\@$expletiveword)\)\) \@?\($token \@\(-NONE- \@\*EXP\*($ind)\)\)\)/($1 \@(-NONE- $2$3))/og; # merge expletive with the *EXP* trace (a sister); the expletive becomes -NONE- (later to be changed to EXP), rather than an NP over PRP, and inherits the index from *EXP*. Note that we have dropped a level of structure, to prevent the trace from looking like it was buried inside the subject and therefore being a head. Also, we drop the nonterminal that used to dominate *EXP* (this usually shows up at the landing site), together with any index on that (which only appears occasionally, and seemingly by accident, i.e., it doesn't corefer).
die "$0:$location *EXP* appeared in unexpected context, aborting\n" if /\(-NONE- \*EXP\*$restoftoken\)/;
die "$0:$location empty constituent, probably from deleting *PPA* or *U*; aborting\n" if /\($token\)/;
# another opportunity to discard, once we've done the fixup above
if (/\@\($token \@\(-NONE- \@$token$ind\)\)/o) {
$discards++, $_ = "# DISCARDED by $0: coindexed null in head position (usually denotes ellipsis): $&";
} else {
# ok, now go change the nulls into slashes etc.
($_, @export) = &constit(0);
# another opportunity to discard -- quit if there are unmatched slashes.
for ($i=1; $i < @export; $i+=2) {
$_ = "# DISCARDED by $0: unmatched slash $export[$i-1]", last if $export[$i] < 0;
}
unless ($i < @export) { # unless we discarded and quit the loop early
# if there were unmatched knobs left over, get rid of them throughout the parse.
# contrasts with behavior of unmatched slashes, above.
# the annotation standard says it's okay (though accidental) to index constituents
# that no trace refers to.
for ($i=1; $i < @export; $i+=2) {
warn "$0:$location removing unwanted index from $export[$i-1]; also removing corresponding knobs\n";
local($index) = -$export[$i];
$export[$i-1] =~ /^\+($tokennoind)$index$/ || die "$0:$location internal error";
local($simpletag) = $1;
s/\(\+?$simpletag$index /\($simpletag /g; # strip the index (and any added +) from the base position
s/\+$simpletag$index\b//g; # strip the knob anywhere it shows up
}
# as post-processing, propagate any unindexed slash for NP-SBJ
# as described above. For the purposes of this comment, I'll
# just write NP for brevity.
#
# Careful about cases like (S~
# (NP~-4\NP~ @(-NONE ..., which should turn into
# (S~\NP-8 (NP~-4\NP~-8 @(-NONE ..., preserving the
# existing knob index. Notice that the containing S might actually
# be marked with +, and passed upward as a knob:
# It (VP+S-1 is refreshing (+S-1 NP/NP to drink lemonade)).
# We turn this into
# (NP\NP-1 It) (VP+S-1 is refreshing (+S-1/NP-8 NP/NP-8 to drink lemonade)).
# One might think we'd turn it into
# (NP\NP-1 It) (VP+[S-1/NP-8] is refreshing (+S-1/NP-8 NP/NP-8 to drink lemonade)).
# but this script never produces complex gaps or complex knobs.
# Similar troublesome examples can exist before the postprocessing step:
local($minind) = 0; # interpret index as negative number
while (/\($token($ind) /og) { $minind = $1 if $1 < $minind; }
s/(\(\+?S\b$restoftoken)( \(\+?NP~?-SBJ$restoftoken)(\\$tokennoind)( \@?\(-NONE- )/$minind--, $1.$3.$minind.$2.$3.$minind.$4/geo;
}
# Relabel expletives as EXP, not -NONE-. We left them as -NONE- earlier
# so they'd be treated as slashes.
s/\(-NONE- (\@$expletiveword)\)/\(EXP $1\)/go;
}
}
}
print "$location$_\n";
}
print STDERR "$0: $boundnulls bound nulls, $unboundnulls unbound nulls including $nullcomps complementizers, $killnulls removed nulls; discarded $discards of $sent sentences\n";
# -------------------------
# Reads in the next constit, and following whitespace, from the front of $_.
# Any preceding @ is assumed to have been read by the caller.
#
# input: -1, 0, or 1 according to whether this child is to the left, inside, or to the
# right of the caller's head
# output: a tuple:
# - an appropriately slashed version of the text of the constituent.
# (Includes tag and index.)
# - a list of gaps and knobs exported by this constituent, in some
# aesthetically semireasonable order. For convenience, each knob
# is followed by a positive integer denoting its index, and each
# gap is similarly followed by a negative integer.
# Example: ("/NP-5", -5, "+S-3", 3, ...)
# This includes a knob for the constituent itself.
# Discipline: each regexp that eats text is required to eat
# any following whitespace, too.
sub constit {
local($slashdir) = @_;
local($basictag, $index, $fulltag, $kidtext, @export);
s/^\(\s*// || die "$0:$location open paren expected to start $_"; # eat open paren
s/^($tokennoind)($ind)?\s*//o || die "$0:$location no tag"; # eat tag
$basictag = $1;
$index = $2;
$fulltag = $basictag.$index; # we may add slashed categories to this. $kidtext will be the text of the kids.
if (s/^\@\(-NONE- \@($tokennoind)($ind)?\)\s*//o) { # this constituent holds just an empty category
local($null) = $1;
local($traceindex) = $2;
$null = "0" unless $null =~ /^[Ii]t$/; # force all nulls to be 0 (except expletives)
die "$0:$location internal error: indexed trace as a head, should already have discarded this sentence: $_" if $slashdir == 0 && $traceindex;
if ($traceindex) { $boundnulls++ } else { $unboundnulls++ }
local($slash) = ($slashdir < 0 ? "\\" : "/") . "$basictag$traceindex";
$fulltag .= $slash;
$kidtext = " \@(-NONE- \@$null)";
@export = ($slash, $traceindex) if $traceindex; # pass the slash up if it's indexed
} elsif (/^\@?\(/) { # if tag is followed by at least one subconstituent (possibly marked with @)
local($subslashdir) = -1; # haven't seen head yet
until (/^\)/) { # eat all the subconstits recursively
local($headmark);
$headmark = "@" if s/^\@//;
$subslashdir++ if $headmark;
local($subtext, @subexport) = &constit($subslashdir);
$kidtext .= " $headmark$subtext";
# In the lines below, we're a little careful to get a nice order
# for the various knobs and slashes, sort of reflecting their
# obliqueness. Not perfect though!
if ($subslashdir <= 0) {
unshift(@export, @subexport);
} else {
push(@export, @subexport);
}
$subslashdir++ if $headmark;
}
# Duplicate slashes can arise in across-the-board gapping. "Unify" them
# by removing one. They may have different forms, such as /X-i and \Y-i;
# pick one arbitrarily. Similarly check for duplicate knobs, though we don't
# expect these to ever occur.
for ($i=$#export; $i >= 1; $i-=2) {
for ($j=$#export; $j > $i; $j-=2) {
if ($export[$j] == $export[$i]) { # duplication
warn "$0:$location astonished by duplicate knobs $export[$i-1] and $export[$j-1] (removing one)\n" if $export[$j] > 0;
splice(@export, $j-1, 2); # remove $j, but not $i
}
}
}
# Now cancel knobs against gaps.
for ($i=$#export; $i >= 1; $i-=2) {
local($matched) = 0;
for ($j=$#export; $j > $i; $j-=2) {
if ($export[$j] == -$export[$i]) { # cancellation
splice(@export, $j-1, 2); # remove $j
$matched = 1; # and remember to remove $i
}
}
splice(@export, $i-1, 2) if $matched; # now delete $i
}
# add the remaining knobs and slashes to the tag.
for ($i=0; $i < @export; $i += 2) {
$fulltag .= $export[$i];
if ($export[$i+1] > 0) {
}
}
} else { # tag is followed by lexical item
s/^(\@$token)\s*//o || die "$0:$location no lex item or lex item not marked as head: $_";
$kidtext = " $1";
@export = ();
}
s/^\)\s*// || die "$0:$location close paren expected to start $_";
# this constituent itself provides a knob if indexed. Before adding
# it to @export, we'll check that it doesn't match anything already
# in @export, which would be an error.
if ($index) {
local($knob) = "+$basictag$index";
for ($i=1; $i<@export; $i+=2) {
if ($index == $export[$i] || $index == -$export[$i]) {
warn "$0:$location i-within-i condition violated by $knob and $export[$i-1] (will ignore higher one)\n";
undef $knob, last;
}
}
unshift(@export, $knob, -$index) if defined $knob;
}
# return value.
("($fulltag$kidtext)", @export);
}