Produce dependency parses of SST

stanfordnlp · May 29, 2015 · b451977 · b451977
1 parent 33da5a5
commit b451977
Show file tree

Hide file tree

Showing 5 changed files with 267 additions and 175 deletions.
diff --git a/fetch_and_preprocess.sh b/fetch_and_preprocess.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
 python2.7 scripts/download.py
+
+# Compile the Java helpers in lib/ against the Stanford parser jars.
+CLASSPATH="lib:lib/stanford-parser/stanford-parser.jar:lib/stanford-parser/stanford-parser-3.5.1-models.jar"
+javac -cp "$CLASSPATH" lib/*.java
 python2.7 scripts/preprocess-sick.py
 python2.7 scripts/preprocess-sst.py
 

diff --git a/lib/ConstituencyParse.java b/lib/ConstituencyParse.java
@@ -6,12 +6,18 @@
 import edu.stanford.nlp.util.StringUtils;
 import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
 import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
 import edu.stanford.nlp.trees.Tree;
 import edu.stanford.nlp.trees.Trees;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
 
 import java.io.BufferedWriter;
 import java.io.FileWriter;
 import java.io.StringReader;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
@@ -21,118 +27,197 @@
 
 public class ConstituencyParse {
 
+  /** Whether input lines are PTB-tokenized (true) or split on single spaces (false). */
+  private final boolean tokenize;
+  /** Writer for the token file; remains null when no token path is given. */
+  private BufferedWriter tokWriter;
+  /** Writer for the parent-pointer file. */
+  private final BufferedWriter parentWriter;
+  /** Constituency parser loaded from {@link #PCFG_PATH}. */
+  private final LexicalizedParser parser;
+  /** Binarizer applied to parse trees before parent pointers are computed. */
+  private final TreeBinarizer binarizer;
+  /** Transformer applied to binarized trees before parent pointers are computed. */
+  private final CollapseUnaryTransformer transformer;
+  /** Factory that builds grammatical (dependency) structures from constituency trees. */
+  private final GrammaticalStructureFactory gsf;
+
+  /** Classpath location of the serialized English PCFG model. */
+  private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
+
+  /**
+   * Opens the output files and loads the parsing components.
+   *
+   * @param tokPath path of the token output file, or null to skip opening one
+   * @param parentPath path of the parent-pointer output file
+   * @param tokenize whether input lines are PTB-tokenized instead of split on spaces
+   * @throws IOException if an output file cannot be opened
+   */
+  public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException {
+    this.tokenize = tokenize;
+    if (tokPath != null) {
+      this.tokWriter = new BufferedWriter(new FileWriter(tokPath));
+    }
+    this.parentWriter = new BufferedWriter(new FileWriter(parentPath));
+
+    this.parser = LexicalizedParser.loadModel(PCFG_PATH);
+    this.binarizer = TreeBinarizer.simpleTreeBinarizer(
+      this.parser.getTLPParams().headFinder(), this.parser.treebankLanguagePack());
+    this.transformer = new CollapseUnaryTransformer();
+
+    // Factory used later to derive dependency structures from constituency trees.
+    TreebankLanguagePack pennPack = new PennTreebankLanguagePack();
+    this.gsf = pennPack.grammaticalStructureFactory();
+  }
+
+  /**
+   * Converts one input line into the token list handed to the parser.
+   *
+   * When tokenization is enabled the line is run through {@code PTBTokenizer}; otherwise
+   * it is split on single spaces, so consecutive spaces yield empty-string tokens.
+   *
+   * @param line one sentence of input text
+   * @return the tokens of the sentence, in order
+   */
+  public List<HasWord> sentenceToTokens(String line) {
+    List<HasWord> tokens = new ArrayList<>();
+    if (tokenize) {
+      // Diamond instead of the raw type keeps the construction type-checked.
+      PTBTokenizer<Word> tokenizer =
+        new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
+      while (tokenizer.hasNext()) {
+        tokens.add(tokenizer.next());
+      }
+    } else {
+      for (String word : line.split(" ")) {
+        tokens.add(new Word(word));
+      }
+    }
+
+    return tokens;
+  }
+
+  /**
+   * Applies the loaded parser to a tokenized sentence.
+   *
+   * @param tokens the sentence tokens
+   * @return the tree returned by the parser
+   */
+  public Tree parse(List<HasWord> tokens) {
+    return parser.apply(tokens);
+  }
+
+  /**
+   * Computes the parent-pointer representation of a constituency tree after it has been
+   * binarized and passed through the unary-collapsing transformer.
+   *
+   * Leaf nodes themselves are not given slots: slots {@code 0 .. leaves-1} belong to the
+   * parents of the leaves, in leaf order, and the remaining nodes receive increasing slots
+   * in the order they are first reached while walking upward from each leaf. Each entry
+   * stores the parent's slot plus one; the root stores 0.
+   *
+   * @param tree the parse tree to convert
+   * @return the parent pointers, one entry per non-leaf node
+   */
+  public int[] constTreeParents(Tree tree) {
+    Tree root = transformer.transformTree(binarizer.transformTree(tree));
+    Trees.convertToCoreLabels(root);
+    root.indexSpans();
+
+    List<Tree> leaves = root.getLeaves();
+    int leafCount = leaves.size();
+    int[] parents = new int[root.size() - leafCount];
+    // Maps a node number within the tree to the slot already assigned to that node.
+    HashMap<Integer, Integer> assigned = new HashMap<Integer, Integer>();
+
+    int nextSlot = leafCount;
+    for (int leafPos = 0; leafPos < leafCount; leafPos++) {
+      // Start one level above the leaf (the preterminal), which takes the leaf's slot.
+      Tree node = leaves.get(leafPos).parent(root);
+      int nodeSlot = leafPos;
+      while (true) {
+        Tree parent = node.parent(root);
+        if (parent == null) {
+          // Reached the root: it has no parent.
+          parents[nodeSlot] = 0;
+          break;
+        }
+
+        int parentNumber = parent.nodeNumber(root);
+        Integer known = assigned.get(parentNumber);
+        boolean alreadySeen = known != null;
+        int parentSlot;
+        if (alreadySeen) {
+          parentSlot = known;
+        } else {
+          parentSlot = nextSlot++;
+          assigned.put(parentNumber, parentSlot);
+        }
+
+        parents[nodeSlot] = parentSlot + 1;
+        if (alreadySeen) {
+          // Everything above this node was filled in during an earlier upward walk.
+          break;
+        }
+        node = parent;
+        nodeSlot = parentSlot;
+      }
+    }
+
+    return parents;
+  }
+
+  /**
+   * Converts a constituency parse into typed dependencies and returns them as parent
+   * pointers.
+   *
+   * Entry {@code i} holds the governor index reported for the dependent with index
+   * {@code i + 1}; the root is expected to carry index 0. Tokens that never appear as a
+   * dependent keep the value -1.
+   *
+   * @param tree the constituency parse of the sentence
+   * @param tokens the sentence tokens; only their count is used, to size the result
+   * @return one parent pointer per token
+   */
+  public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
+    GrammaticalStructure structure = gsf.newGrammaticalStructure(tree);
+    Collection<TypedDependency> dependencies = structure.typedDependencies();
+
+    int[] parents = new int[tokens.size()];
+    // -1 marks "no parent assigned"; overwritten below for every dependent seen.
+    for (int slot = 0; slot < parents.length; slot++) {
+      parents[slot] = -1;
+    }
+
+    for (TypedDependency dependency : dependencies) {
+      // Dependent indices are shifted down by one to become array positions.
+      int dependent = dependency.dep().index();
+      int governor = dependency.gov().index();
+      parents[dependent - 1] = governor;
+    }
+
+    return parents;
+  }
+
+  /**
+   * Writes the tokens of one sentence to the token file as a single space-separated line.
+   *
+   * When tokenization is enabled each token is passed through
+   * {@code PTBTokenizer.ptbToken2Text} before being written. An empty token list produces
+   * an empty line instead of failing, so the token file stays line-aligned with the input.
+   * The token writer must exist, i.e. the instance was constructed with a non-null token path.
+   *
+   * @param tokens the sentence tokens to write
+   * @throws IOException if writing to the token file fails
+   */
+  public void printTokens(List<HasWord> tokens) throws IOException {
+    StringBuilder sb = new StringBuilder();
+    boolean first = true;
+    for (HasWord token : tokens) {
+      // Separator goes before every token except the first, so no trailing space is left.
+      if (!first) {
+        sb.append(' ');
+      }
+      first = false;
+
+      if (tokenize) {
+        sb.append(PTBTokenizer.ptbToken2Text(token.word()));
+      } else {
+        sb.append(token.word());
+      }
+    }
+
+    sb.append('\n');
+    tokWriter.write(sb.toString());
+  }
+
+  /**
+   * Writes one parent-pointer array to the parent file as a single space-separated line.
+   *
+   * An empty array produces an empty line instead of failing, so the parent file stays
+   * line-aligned with the input.
+   *
+   * @param parents the parent pointers to write
+   * @throws IOException if writing to the parent file fails
+   */
+  public void printParents(int[] parents) throws IOException {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < parents.length; i++) {
+      // Separator goes before every entry except the first, so no trailing space is left.
+      if (i > 0) {
+        sb.append(' ');
+      }
+      sb.append(parents[i]);
+    }
+    sb.append('\n');
+    parentWriter.write(sb.toString());
+  }
+
+  /**
+   * Closes the output writers.
+   *
+   * The parent writer is closed even when closing the token writer fails, so one failure
+   * cannot leave the other file open.
+   *
+   * @throws IOException if closing either writer fails
+   */
+  public void close() throws IOException {
+    try {
+      if (tokWriter != null) {
+        tokWriter.close();
+      }
+    } finally {
+      parentWriter.close();
+    }
+  }
+
   public static void main(String[] args) throws Exception {
     Properties props = StringUtils.argsToProperties(args);
-    if (!props.containsKey("tokpath") ||
-        !props.containsKey("parentpath")) {
+    if (!props.containsKey("parentpath")) {
       System.err.println(
-        "usage: java ConstituencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
+        "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
       System.exit(1);
     }
 
+    // whether to tokenize input sentences
     boolean tokenize = false;
     if (props.containsKey("tokenize")) {
       tokenize = true;
     }
 
-    String tokPath = props.getProperty("tokpath");
-    String parentPath = props.getProperty("parentpath");
-
-    BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
-    BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
+    // whether to produce dependency trees from the constituency parse
+    boolean deps = false;
+    if (props.containsKey("deps")) {
+      deps = true;
+    }
 
-    LexicalizedParser parser = LexicalizedParser.loadModel(
-      "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
-    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(
-      parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
-    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
+    String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
+    String parentPath = props.getProperty("parentpath");
+    ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize);
 
     Scanner stdin = new Scanner(System.in);
     int count = 0;
     long start = System.currentTimeMillis();
     while (stdin.hasNextLine()) {
       String line = stdin.nextLine();
-      List<HasWord> tokens = new ArrayList<>();
-      if (tokenize) {
-        PTBTokenizer<Word> tokenizer = new PTBTokenizer(
-          new StringReader(line), new WordTokenFactory(), "");
-        for (Word label; tokenizer.hasNext(); ) {
-          tokens.add(tokenizer.next());
-        }
-      } else {
-        for (String word : line.split(" ")) {
-          tokens.add(new Word(word));
-        }
-      }
-
-      Tree tree = parser.apply(tokens);
-      Tree binarized = binarizer.transformTree(tree);
-      Tree collapsedUnary = transformer.transformTree(binarized);
-      Trees.convertToCoreLabels(collapsedUnary);
-      collapsedUnary.indexSpans();
-
-      List<Tree> leaves = collapsedUnary.getLeaves();
-      int size = collapsedUnary.size() - leaves.size();
-      int[] parents = new int[size];
-      HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();
-
-      int idx = leaves.size();
-      int leafIdx = 0;
-      for (Tree leaf : leaves) {
-        Tree cur = leaf.parent(collapsedUnary); // go to preterminal
-        int curIdx = leafIdx++;
-        boolean done = false;
-        while (!done) {
-          Tree parent = cur.parent(collapsedUnary);
-          if (parent == null) {
-            parents[curIdx] = 0;
-            break;
-          }
-
-          int parentIdx;
-          int parentNumber = parent.nodeNumber(collapsedUnary);
-          if (!index.containsKey(parentNumber)) {
-            parentIdx = idx++;
-            index.put(parentNumber, parentIdx);
-          } else {
-            parentIdx = index.get(parentNumber);
-            done = true;
-          }
-
-          parents[curIdx] = parentIdx + 1;
-          cur = parent;
-          curIdx = parentIdx;
-        }
-      }
-
-      // print tokens
-      int len = tokens.size();
-      StringBuilder sb = new StringBuilder();
-      for (int i = 0; i < len - 1; i++) {
-        if (tokenize) {
-          sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
-        } else {
-          sb.append(tokens.get(i).word());
-        }
-        sb.append(' ');
-      }
-      if (tokenize) {
-        sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
-      } else {
-        sb.append(tokens.get(len - 1).word());
+      List<HasWord> tokens = processor.sentenceToTokens(line);
+      Tree parse = processor.parse(tokens);
+
+      // produce parent pointer representation
+      int[] parents = deps ? processor.depTreeParents(parse, tokens)
+                           : processor.constTreeParents(parse);
+
+      // print
+      if (tokPath != null) {
+        processor.printTokens(tokens);
       }
-      sb.append('\n');
-      tokWriter.write(sb.toString());
-
-      // print parent pointers
-      sb = new StringBuilder();
-      for (int i = 0; i < size - 1; i++) {
-        sb.append(parents[i]);
-        sb.append(' ');
-      }
-      sb.append(parents[size - 1]);
-      sb.append('\n');
-      parentWriter.write(sb.toString());
+      processor.printParents(parents);
 
       count++;
       if (count % 1000 == 0) {
@@ -144,7 +229,6 @@ public static void main(String[] args) throws Exception {
     long totalTimeMillis = System.currentTimeMillis() - start;
     System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
       count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count);
-    tokWriter.close();
-    parentWriter.close();
+    processor.close();
   }
 }