Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ptb.py (new file), convert.py (lines of code added inside) #8

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 282 additions & 0 deletions semstr/conversion/ptb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
from ucca import core, layer0, layer1
from ucca.layer1 import EdgeTags
from .format import FormatConverter

import sys
import re


class PtbConverter(FormatConverter):
def __init__(self):
self.node_by_id = {}
self.terminals = []
self.pending_nodes = []
self.node_ids_with_children = set()
self.sentence_id = 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is self.sentence_id used anywhere? If not, please remove it.

self.passage_id = 3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better initialize self.passage_id to None or remove it if it's not needed.


def _build_passage(self, stream):
# p = core.Passage(self.sentence_id or self.passage_id)
p = core.Passage(self.passage_id)
l0 = layer0.Layer0(p)
l1 = layer1.Layer1(p)
paragraph = 1

next(self.parse(stream))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line seems to parse the first expression from stream but does not assign it to anything. What is it for then? To initialize self.pending_nodes? Please document it appropriately.


# add normal nodes
self.pending_nodes = list(reversed(self.pending_nodes))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason for reversing the nodes? Please document appropriately.

while self.pending_nodes:
for i in reversed(range(len(self.pending_nodes))):
parent_id, edge_tag, node_id = self.pending_nodes[i]
parent = self.node_by_id.get(parent_id, -1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to use None instead of -1 here. Note that None is the default in the dict get method, so you can just call self.node_by_id.get(parent_id) without a second argument to get.

if parent != -1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And then this line will be if parent is not None.

del self.pending_nodes[i]
implicit = node_id not in self.node_ids_with_children
node = l1.add_fnode(parent, edge_tag, implicit=implicit)
if edge_tag == EdgeTags.Punctuation:
node.tag = layer1.NodeTags.Punctuation
self.node_by_id[node_id] = node

# add terminals
for text, tag, edge_tag, parent_id in self.terminals:
punctuation = (tag == layer0.NodeTags.Punct)
terminal = l0.add_terminal(text=text, punct=punctuation, paragraph=paragraph)
try:
parent = self.node_by_id[parent_id]
except KeyError as e:
raise ValueError("Terminal ('%s') with bad parent (%s) in passage %s" % (text, parent_id, p.ID)) from e
if parent is None:
print("Terminal is a child of the root: '%s'" % text, file=sys.stderr)
parent = l1.add_fnode(parent, edge_tag)
if edge_tag != EdgeTags.Terminal:
print("Terminal with incoming %s edge: '%s'" % (edge_tag, text), file=sys.stderr)
parent.add(EdgeTags.Terminal, terminal)
return p

def from_format(self, stream, passage_id, return_original=False, **kwargs):
self.passage_id = passage_id
passage = self._build_passage(stream)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if there is more than one tree in the stream? Shouldn't we have a loop and then yield a passage for each one?

passage.extra["format"] = kwargs.get("format") or "ptb"
# yield (passage, self.lines_read, passage.ID) if return_original else passage
yield passage
self.node_by_id = None
self.lines_read = []

def parse(self, line_or_lines):
def istok(t, i):
return getattr(t, 'token_id', None) is i

stack = []
node_id = 0
for tok in lex(line_or_lines):
if tok.token_id is LPAREN_TOKEN:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The content of these two if is the same, so better join them as if tok.token_id is LPAREN_TOKEN or tok.token_id is STRING_TOKEN.
Also, why is here and not ==?

stack.append(tok)
elif tok.token_id is STRING_TOKEN:
stack.append(tok)
# right parentheses, wrap up tree node
else:
# if terminal
if (istok(stack[-1], STRING_TOKEN) and
istok(stack[-2], STRING_TOKEN) and
istok(stack[-3], LPAREN_TOKEN)):
w = Leaf(stack[-1].value, stack[-2].value, parent_id=node_id)
stack.pop()
stack.pop()
stack.pop()
stack.append(TExpr(w, node_id=node_id))
self.terminals.append((w.word, w.tag, w.edge_tag, node_id))
# if regular node
else:
tx = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does tx stand for? Maybe use a more meaningful name.

tail = None
peers = []
while not istok(stack[-1], LPAREN_TOKEN):
head = stack.pop()
# if it is the edge tag
if istok(head, STRING_TOKEN):
tx = TExpr(
Symbol(head.value), first_child=tail, next_sibling=None, node_id=node_id
)
# if it is a TExp object
else:
peers.append(head)
head.next_sibling = tail
tail = head
stack.pop()

if tx is None:
tx = TExpr(None, node_id=node_id)

tx.child_list = peers
for child in tx.children():
child.parent_node = tx
# if the child is a terminal
if isinstance(child.head, Leaf):
self.node_ids_with_children.add(child.node_id)
self.pending_nodes.append((child.parent_node.node_id, str(child.head.pos), child.node_id))
self.node_ids_with_children.add(child.parent_node.node_id)
# if the child is a regular node
else:
self.pending_nodes.append((child.parent_node.node_id, str(child.head), child.node_id))
self.node_ids_with_children.add(child.parent_node.node_id)
# node is root
if child.parent_node.node_id is None:
self.node_by_id[child.node_id] = None
if not stack:
self.node_by_id[node_id] = None
yield tx
else:
stack.append(tx)
node_id += 1


#######
# Utils
#######

def gensym():
return object()


##################
# Lexer
##################


LPAREN_TOKEN = gensym()
RPAREN_TOKEN = gensym()
STRING_TOKEN = gensym()

class Token(object):
_token_ids = {LPAREN_TOKEN:"(", RPAREN_TOKEN:")", STRING_TOKEN:"STRING"}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just use the string itself as the value?
I mean, something like LPAREN = "(" etc.


def __init__(self, token_id, value=None, lineno=None):
self.token_id = token_id
self.value = value
self.lineno = lineno

def __str__(self):
return "Token:'{tok}'{ln}".format(
tok=(self.value if self.value is not None else self._token_ids[self.token_id]),
ln=(':{}'.format(self.lineno) if self.lineno is not None else '')
)


_token_pat = re.compile(r'\(|\)|[^()\s]+')
def lex(line_or_lines):
"""
Create a generator which returns tokens parsed from the input.
The input can be either a single string or a sequence of strings.
"""

if isinstance(line_or_lines, str):
line_or_lines = [line_or_lines]

counter = 0
for n, line in enumerate(line_or_lines):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the enumerate necessary? I don't see that n is used.

line.strip()
for m in _token_pat.finditer(line):
if m.group() == '(':
# if there is an extra '(' at the beginning of the sentence
if counter == 1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't it be an error if there is an extra ( at the beginning?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some sentences starts with an extra ( that plays the role of 'ROOT', so there's shouldn't be an error. It doesn't get in count here in order to "ignore" the additional ROOT node that will appear if it will be taken into acount.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine, but please document this in a comment.

continue
counter += 1
yield Token(LPAREN_TOKEN)
elif m.group() == ')':
counter += 1
yield Token(RPAREN_TOKEN)
else:
counter += 1
yield Token(STRING_TOKEN, value=m.group())


##################
# Parser
##################


class Symbol:
_pat = re.compile(r'(?P<label>^[^0-9=-]+)|(?:-(?P<tag>[^0-9=-]+))|(?:=(?P<parind>[0-9]+))|(?:-(?P<coind>[0-9]+))')

def __init__(self, label):
self.label = label
self.tags = []
self.coindex = None
self.parindex = None
for m in self._pat.finditer(label):
if m.group('label'):
self.label = m.group('label')
elif m.group('tag'):
self.tags.append(m.group('tag'))
elif m.group('parind'):
self.parindex = m.group('parind')
elif m.group('coind'):
self.coindex = m.group('coind')

def simplify(self):
self.tags = []
self.coindex = None
self.parindex = None

def __str__(self):
return '{}{}{}{}'.format(
self.label,
''.join('-{}'.format(t) for t in self.tags),
('={}'.format(self.parindex) if self.parindex is not None else ''),
('-{}'.format(self.coindex) if self.coindex is not None else '')
)


PUNC = [',', '.']
class Leaf:
def __init__(self, word, pos, parent_id, edge_tag = "Terminal", node_id = None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use the constant layer1.EdgeTags.Terminal

self.word = word
self.pos = pos
self.parent_id = parent_id
self.edge_tag = edge_tag
self.tag = self.is_punc(word)
self.node_id = node_id
# self.is_punc = (self.pos == layer0.NodeTags.Punct)

def __str__(self):
return '({} {})'.format(self.pos, self.word)

def is_punc(self, word):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method's name is a little misleading, since it implies that it returns True or False. Maybe rename it to get_node_tag or something. Also, I think it's better to use the PTB tag to determine if a token is punctuation. See the list of tags: https://www.clips.uantwerpen.be/pages/mbsp-tags

if word in PUNC:
return "Punctuation"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use the constants from layer0.NodeTags

else:
return "Word"

class TExpr:
def __init__(self, head, first_child = None, next_sibling=None, node_id=None):
self.head = head
self.child_list = []
self.parent_node = None
self.node_id = node_id
self.first_child = first_child
self.next_sibling = next_sibling

def symbol(self):
if hasattr(self.head, 'label'):
return self.head
else:
return None

def parent(self):
return self.parent_node

def children(self):
return self.child_list

def leaf(self):
if hasattr(self.head, 'pos'):
return self.head
else:
return None

def rule(self):
if self.leaf():
return '{} -> {}'.format(self.leaf().pos, self.leaf().word)
else:
return '{} -> {}'.format(self.symbol(), ' '.join(str(c.symbol() or c.leaf().pos) for c in self.children()))
30 changes: 30 additions & 0 deletions semstr/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,34 @@ def to_sdp(passage, test=False, tree=False, mark_aux=False, **kwargs):
return SdpConverter(mark_aux=mark_aux, tree=tree).to_format(passage, test=test, format=kwargs.get("format"))


def from_ptb(lines, passage_id=None, return_original=False, **kwargs):
#def from_ptb(passage, passage_id=None, return_original=False, **kwargs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove commented-out lines unless there's a reason to keep them.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed.

"""Converts from parsed text in Penn Treebank mrg format to a Passage object.

:param lines: iterable of lines in Penn Treebank mrg format, describing a single passage.
:param passage_id: ID to set for passage, overriding the ID from the file
:param return_original: return triple of (UCCA passage, Mrg string, sentence ID)

:return generator of Passage objects
"""
from semstr.conversion.ptb import PtbConverter
return PtbConverter().from_format(lines, passage_id=passage_id, return_original=return_original,
format=kwargs.get("format"))


def to_ptb(passage, test=False, tree=False, **kwargs):
""" Convert from a Passage object to a string in Penn TreeBank mrg format (export)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not export, that's the name of another format.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad, forgot to change it after copying it. I removed it.


:param passage: the Passage object to convert
:param test: whether to omit the edge and parent columns. Defaults to False
:param tree: whether to omit columns for non-primary parents. Defaults to False

:return list of lines representing a (discontinuous) tree structure constructed from the passage
"""
from semstr.conversion.ptb import PtbConverter
return PtbConverter().to_format(passage, test=test, tree=tree, format=kwargs.get("format"))


CONVERTERS = {
None: (None, None),
"json": (from_json, to_json),
Expand All @@ -180,6 +208,8 @@ def to_sdp(passage, test=False, tree=False, mark_aux=False, **kwargs):
"export": (from_export, to_export),
"amr": (from_amr, to_amr),
"txt": (from_text, to_text),
"ptb": (from_ptb, to_ptb),
"mrg": (from_ptb, to_ptb),
}
FROM_FORMAT = {f: c[0] for f, c in CONVERTERS.items() if c[0] is not None}
TO_FORMAT = {f: c[1] for f, c in CONVERTERS.items() if c[1] is not None}
Expand Down