Skip to content

Commit

Permalink
Produce dependency parses of SST
Browse files Browse the repository at this point in the history
  • Loading branch information
kaishengtai committed May 29, 2015
1 parent 33da5a5 commit b451977
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 175 deletions.
3 changes: 3 additions & 0 deletions fetch_and_preprocess.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash
python2.7 scripts/download.py

CLASSPATH="lib:lib/stanford-parser/stanford-parser.jar:lib/stanford-parser/stanford-parser-3.5.1-models.jar"
javac -cp $CLASSPATH lib/*.java
python2.7 scripts/preprocess-sick.py
python2.7 scripts/preprocess-sst.py

Expand Down
274 changes: 179 additions & 95 deletions lib/ConstituencyParse.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
Expand All @@ -21,118 +27,197 @@

public class ConstituencyParse {

private boolean tokenize;
private BufferedWriter tokWriter, parentWriter;
private LexicalizedParser parser;
private TreeBinarizer binarizer;
private CollapseUnaryTransformer transformer;
private GrammaticalStructureFactory gsf;

private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException {
this.tokenize = tokenize;
if (tokPath != null) {
tokWriter = new BufferedWriter(new FileWriter(tokPath));
}
parentWriter = new BufferedWriter(new FileWriter(parentPath));
parser = LexicalizedParser.loadModel(PCFG_PATH);
binarizer = TreeBinarizer.simpleTreeBinarizer(
parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
transformer = new CollapseUnaryTransformer();

// set up to produce dependency representations from constituency trees
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
gsf = tlp.grammaticalStructureFactory();
}

public List<HasWord> sentenceToTokens(String line) {
List<HasWord> tokens = new ArrayList<>();
if (tokenize) {
PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), "");
for (Word label; tokenizer.hasNext(); ) {
tokens.add(tokenizer.next());
}
} else {
for (String word : line.split(" ")) {
tokens.add(new Word(word));
}
}

return tokens;
}

public Tree parse(List<HasWord> tokens) {
Tree tree = parser.apply(tokens);
return tree;
}

public int[] constTreeParents(Tree tree) {
Tree binarized = binarizer.transformTree(tree);
Tree collapsedUnary = transformer.transformTree(binarized);
Trees.convertToCoreLabels(collapsedUnary);
collapsedUnary.indexSpans();
List<Tree> leaves = collapsedUnary.getLeaves();
int size = collapsedUnary.size() - leaves.size();
int[] parents = new int[size];
HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

int idx = leaves.size();
int leafIdx = 0;
for (Tree leaf : leaves) {
Tree cur = leaf.parent(collapsedUnary); // go to preterminal
int curIdx = leafIdx++;
boolean done = false;
while (!done) {
Tree parent = cur.parent(collapsedUnary);
if (parent == null) {
parents[curIdx] = 0;
break;
}

int parentIdx;
int parentNumber = parent.nodeNumber(collapsedUnary);
if (!index.containsKey(parentNumber)) {
parentIdx = idx++;
index.put(parentNumber, parentIdx);
} else {
parentIdx = index.get(parentNumber);
done = true;
}

parents[curIdx] = parentIdx + 1;
cur = parent;
curIdx = parentIdx;
}
}

return parents;
}

// convert constituency parse to a dependency representation and return the
// parent pointer representation of the tree
public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
Collection<TypedDependency> tdl = gs.typedDependencies();
int len = tokens.size();
int[] parents = new int[len];
for (int i = 0; i < len; i++) {
// if a node has a parent of -1 at the end of parsing, then the node
// has no parent.
parents[i] = -1;
}

for (TypedDependency td : tdl) {
// let root have index 0
int child = td.dep().index();
int parent = td.gov().index();
parents[child - 1] = parent;
}

return parents;
}

public void printTokens(List<HasWord> tokens) throws IOException {
int len = tokens.size();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < len - 1; i++) {
if (tokenize) {
sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
} else {
sb.append(tokens.get(i).word());
}
sb.append(' ');
}

if (tokenize) {
sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
} else {
sb.append(tokens.get(len - 1).word());
}

sb.append('\n');
tokWriter.write(sb.toString());
}

public void printParents(int[] parents) throws IOException {
StringBuilder sb = new StringBuilder();
int size = parents.length;
for (int i = 0; i < size - 1; i++) {
sb.append(parents[i]);
sb.append(' ');
}
sb.append(parents[size - 1]);
sb.append('\n');
parentWriter.write(sb.toString());
}

public void close() throws IOException {
if (tokWriter != null) tokWriter.close();
parentWriter.close();
}

public static void main(String[] args) throws Exception {
Properties props = StringUtils.argsToProperties(args);
if (!props.containsKey("tokpath") ||
!props.containsKey("parentpath")) {
if (!props.containsKey("parentpath")) {
System.err.println(
"usage: java ConstituencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
"usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
System.exit(1);
}

// whether to tokenize input sentences
boolean tokenize = false;
if (props.containsKey("tokenize")) {
tokenize = true;
}

String tokPath = props.getProperty("tokpath");
String parentPath = props.getProperty("parentpath");

BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
// whether to produce dependency trees from the constituency parse
boolean deps = false;
if (props.containsKey("deps")) {
deps = true;
}

LexicalizedParser parser = LexicalizedParser.loadModel(
"edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(
parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
String parentPath = props.getProperty("parentpath");
ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize);

Scanner stdin = new Scanner(System.in);
int count = 0;
long start = System.currentTimeMillis();
while (stdin.hasNextLine()) {
String line = stdin.nextLine();
List<HasWord> tokens = new ArrayList<>();
if (tokenize) {
PTBTokenizer<Word> tokenizer = new PTBTokenizer(
new StringReader(line), new WordTokenFactory(), "");
for (Word label; tokenizer.hasNext(); ) {
tokens.add(tokenizer.next());
}
} else {
for (String word : line.split(" ")) {
tokens.add(new Word(word));
}
}

Tree tree = parser.apply(tokens);
Tree binarized = binarizer.transformTree(tree);
Tree collapsedUnary = transformer.transformTree(binarized);
Trees.convertToCoreLabels(collapsedUnary);
collapsedUnary.indexSpans();

List<Tree> leaves = collapsedUnary.getLeaves();
int size = collapsedUnary.size() - leaves.size();
int[] parents = new int[size];
HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

int idx = leaves.size();
int leafIdx = 0;
for (Tree leaf : leaves) {
Tree cur = leaf.parent(collapsedUnary); // go to preterminal
int curIdx = leafIdx++;
boolean done = false;
while (!done) {
Tree parent = cur.parent(collapsedUnary);
if (parent == null) {
parents[curIdx] = 0;
break;
}

int parentIdx;
int parentNumber = parent.nodeNumber(collapsedUnary);
if (!index.containsKey(parentNumber)) {
parentIdx = idx++;
index.put(parentNumber, parentIdx);
} else {
parentIdx = index.get(parentNumber);
done = true;
}

parents[curIdx] = parentIdx + 1;
cur = parent;
curIdx = parentIdx;
}
}

// print tokens
int len = tokens.size();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < len - 1; i++) {
if (tokenize) {
sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
} else {
sb.append(tokens.get(i).word());
}
sb.append(' ');
}
if (tokenize) {
sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
} else {
sb.append(tokens.get(len - 1).word());
List<HasWord> tokens = processor.sentenceToTokens(line);
Tree parse = processor.parse(tokens);

// produce parent pointer representation
int[] parents = deps ? processor.depTreeParents(parse, tokens)
: processor.constTreeParents(parse);

// print
if (tokPath != null) {
processor.printTokens(tokens);
}
sb.append('\n');
tokWriter.write(sb.toString());

// print parent pointers
sb = new StringBuilder();
for (int i = 0; i < size - 1; i++) {
sb.append(parents[i]);
sb.append(' ');
}
sb.append(parents[size - 1]);
sb.append('\n');
parentWriter.write(sb.toString());
processor.printParents(parents);

count++;
if (count % 1000 == 0) {
Expand All @@ -144,7 +229,6 @@ public static void main(String[] args) throws Exception {
long totalTimeMillis = System.currentTimeMillis() - start;
System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count);
tokWriter.close();
parentWriter.close();
processor.close();
}
}
Loading

0 comments on commit b451977

Please sign in to comment.