Skip to content

Commit

Permalink
Bunch of improvements, including tag dictionary and model-specific rules
Browse files Browse the repository at this point in the history
  • Loading branch information
Mike Lewis committed May 12, 2014
1 parent 0dd6b47 commit 00e1f9a
Show file tree
Hide file tree
Showing 48 changed files with 384 additions and 38 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added bin/uk/ac/ed/easyccg/main/EasyCCG.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Category$AtomicCategory.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Category$FunctorCategory.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Category.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardApplication.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardComposition.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator$Conjunction.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardApplication.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardComposition.class
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuation.class
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/Combinator.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/InputReader$InputToParser.class
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/InputReader.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar$1.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar$AgendaItem.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar$Cell1Best.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar$CellNBest.class
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar$ChartCell.class
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar$SeenRules.class
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/ParserAStar.class
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.class
Binary file not shown.
Binary file added bin/uk/ac/ed/easyccg/syntax/TagDict$1.class
Binary file not shown.
Binary file added bin/uk/ac/ed/easyccg/syntax/TagDict.class
Binary file not shown.
Binary file not shown.
Binary file modified bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.class
Binary file not shown.
Binary file modified easyccg.jar
Binary file not shown.
34 changes: 29 additions & 5 deletions src/uk/ac/ed/easyccg/main/EasyCCG.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,23 @@
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.Collection;
import java.util.InputMismatchException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.concurrent.TimeUnit;

import uk.ac.ed.easyccg.syntax.Category;
import uk.ac.ed.easyccg.syntax.InputReader;
import uk.ac.ed.easyccg.syntax.ParsePrinter;
import uk.ac.ed.easyccg.syntax.Parser;
import uk.ac.ed.easyccg.syntax.ParserAStar;
import uk.ac.ed.easyccg.syntax.ParserAStar.SuperTaggingResults;
import uk.ac.ed.easyccg.syntax.Parser;
import uk.ac.ed.easyccg.syntax.SyntaxTreeNode;
import uk.ac.ed.easyccg.syntax.SyntaxTreeNode.SyntaxTreeNodeFactory;
import uk.ac.ed.easyccg.syntax.TagDict;
import uk.ac.ed.easyccg.syntax.TaggerEmbeddings;
import uk.co.flamingpenguin.jewel.cli.ArgumentValidationException;
import uk.co.flamingpenguin.jewel.cli.CliFactory;
Expand Down Expand Up @@ -53,11 +59,18 @@ public interface CommandLineArguments
@Option(shortName="s", description = "(Optional) Allow rules not involving category combinations seen in CCGBank. Slows things down by around 20%.")
boolean getUnrestrictedRules();

@Option(defaultValue="0.0001", description = "(Optional) Prunes lexical categories whose probability is less than this ratio of the best category. Defaults to 0.0001.")
double getSupertaggerbeam();

@Option(defaultValue="0.0", description = "(Optional) If using N-best parsing, filter parses whose probability is lower than this fraction of the probability of the best parse. Defaults to 0.0")
double getNbestBeam();
double getNbestbeam();

@Option(helpRequest = true, description = "Display this message", shortName = "h")
boolean getHelp();

@Option(description = "(Optional) Make a tag dictionary")
boolean getMakeTagDict();

}

public enum InputFormat {
Expand All @@ -78,17 +91,26 @@ public static void main(String[] args) throws IOException, ArgumentValidationExc
CommandLineArguments parsedArgs = CliFactory.parseArguments(CommandLineArguments.class, args);
InputFormat input = InputFormat.valueOf(parsedArgs.getInputFormat().toUpperCase());

if (parsedArgs.getMakeTagDict()) {
InputReader reader = InputReader.make(input, new SyntaxTreeNodeFactory(parsedArgs.getMaxLength(), 0));
Map<String, Collection<Category>> tagDict = TagDict.makeDict(reader.readFile(parsedArgs.getInputFile()));
TagDict.writeTagDict(tagDict, parsedArgs.getModel());
System.exit(0);
}


if (!parsedArgs.getModel().exists()) throw new InputMismatchException("Couldn't load model from from: " + parsedArgs.getModel());
System.err.println("Loading model...");

Parser parser = new ParserAStar(
new TaggerEmbeddings(parsedArgs.getModel(), parsedArgs.getMaxLength()),
new TaggerEmbeddings(parsedArgs.getModel(), parsedArgs.getMaxLength(), parsedArgs.getSupertaggerbeam()),
parsedArgs.getMaxLength(),
parsedArgs.getNbest(),
parsedArgs.getNbestBeam(),
parsedArgs.getNbestbeam(),
input,
parsedArgs.getRootCategories(),
new File(parsedArgs.getModel(), "unaryRules"),
new File(parsedArgs.getModel(), "binaryRules"),
parsedArgs.getUnrestrictedRules() ? null : new File(parsedArgs.getModel(), "seenRules")
);

Expand All @@ -98,6 +120,8 @@ public static void main(String[] args) throws IOException, ArgumentValidationExc
if ((outputFormat == OutputFormat.PROLOG || outputFormat == OutputFormat.EXTENDED) && input != InputFormat.POSANDNERTAGGED) throw new Error("Must use \"-i POSandNERtagged\" for this output");

if (!parsedArgs.getInputFile().getName().isEmpty()) {
System.err.println("Parsing...");

// Read from file
Stopwatch t = Stopwatch.createStarted();
SuperTaggingResults results = new SuperTaggingResults();
Expand All @@ -115,7 +139,7 @@ public static void main(String[] args) throws IOException, ArgumentValidationExc
}

System.err.println("Sentences parsed: " + results.parsedSentences);
System.err.println("Speed: " + twoDP.format((double) results.parsedSentences / t.elapsed(TimeUnit.SECONDS)) + " sentences per second");
System.err.println("Speed: " + twoDP.format(1000.0 * results.parsedSentences / t.elapsed(TimeUnit.MILLISECONDS)) + " sentences per second");
} else {
// Read from stdin
Scanner sc = new Scanner(System.in,"UTF-8");
Expand Down
4 changes: 3 additions & 1 deletion src/uk/ac/ed/easyccg/syntax/Category.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import com.google.common.collect.ImmutableSet;

abstract class Category {
public abstract class Category {
private final String cat;
private final int id;
private final static String WILDCARD_FEATURE = "X";
Expand Down Expand Up @@ -62,6 +62,8 @@ public boolean matches(Slash other)
public static final Category SEMICOLON = Category.valueOf(";");
public static final Category CONJ = Category.valueOf("conj");
public final static Category N = valueOf("N");
public static final Category LQU = Category.valueOf("LQU");
public static final Category LRB = Category.valueOf("LRB");

public static Category valueOf(String cat) {

Expand Down
97 changes: 91 additions & 6 deletions src/uk/ac/ed/easyccg/syntax/Combinator.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package uk.ac.ed.easyccg.syntax;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;

import uk.ac.ed.easyccg.syntax.Category.Slash;

import com.google.common.collect.ImmutableList;

public abstract class Combinator
{
public enum RuleType {
Expand All @@ -33,17 +33,41 @@ public RuleProduction(RuleType ruleType, Category result, final boolean headIsLe

public abstract boolean headIsLeft(Category left, Category right);

public final static Collection<Combinator> combinators = ImmutableList.copyOf(Arrays.asList(
public final static Collection<Combinator> STANDARD_COMBINATORS = new ArrayList<Combinator>(Arrays.asList(
new ForwardApplication(),
new BackwardApplication(),
new ForwardComposition(Slash.FWD, Slash.FWD, Slash.FWD),
new BackwardComposition(Slash.FWD, Slash.BWD, Slash.FWD),
new GeneralizedForwardComposition(Slash.FWD, Slash.FWD, Slash.FWD),
new GeneralizedBackwardComposition(Slash.FWD, Slash.BWD, Slash.FWD),
new Conjunction(),
new RemovePunctuation(false)
new RemovePunctuation(false),
new RemovePunctuationLeft()
));

/**
 * Loads extra binary rules from a text file, one rule per line.
 *
 * <p>Each rule line holds four whitespace-separated fields: a head marker ("l" marks the left
 * child as the head, any other value marks the right child), the left category, the right
 * category and the result category, e.g. {@code l , S[to]\NP NP\NP}. Everything from a
 * {@code #} to the end of the line is ignored, as are blank lines. Fields beyond the fourth
 * are ignored.
 *
 * @param file the rules file to read
 * @return one combinator per rule line, in the order the lines are iterated
 * @throws IOException if reading the file fails (presumably raised by Util.readFile)
 * @throws IllegalArgumentException if a rule line has fewer than four fields
 */
public static Collection<Combinator> loadSpecialCombinators(File file) throws IOException {
    Collection<Combinator> newCombinators = new ArrayList<Combinator>();
    for (String line : Util.readFile(file)) {
        // Strip a trailing comment, if any.
        int commentStart = line.indexOf('#');
        if (commentStart > -1) {
            line = line.substring(0, commentStart);
        }

        line = line.trim();
        if (line.isEmpty()) {
            continue;
        }

        // Split on any run of whitespace so that tabs or repeated spaces do not
        // produce empty fields.
        String[] fields = line.split("\\s+");
        if (fields.length < 4) {
            throw new IllegalArgumentException(
                "Malformed rule in " + file + " (expected 4 fields: head left right result): " + line);
        }

        boolean headIsLeft = fields[0].equals("l");
        Category left = Category.valueOf(fields[1]);
        Category right = Category.valueOf(fields[2]);
        Category result = Category.valueOf(fields[3]);
        newCombinators.add(new SpecialCombinator(left, right, result, headIsLeft));
    }
    return newCombinators;
}


private final RuleType ruleType;
public abstract boolean canApply(Category left, Category right);
Expand All @@ -62,9 +86,9 @@ private static Category correctWildCardFeatures(Category toCorrect, Category mat
/**
* Returns a set of rules that can be applied to a pair of categories.
*/
static Collection<RuleProduction> getRules(Category left, Category right) {
static Collection<RuleProduction> getRules(Category left, Category right, Collection<Combinator> rules) {
Collection<RuleProduction> result = new ArrayList<RuleProduction>(2);
for (Combinator c : combinators) {
for (Combinator c : rules) {
if (c.canApply(left, right)) {
result.add(new RuleProduction(c.ruleType, c.apply(left, right), c.headIsLeft(left, right)));
}
Expand Down Expand Up @@ -139,8 +163,69 @@ public boolean headIsLeft(Category left, Category right)
return !punctuationIsLeft;
}
}

/**
 * Rule for open brackets and quotations: absorbs the punctuation on the left and
 * keeps the right-hand category as the result.
 */
private static class RemovePunctuationLeft extends Combinator {
    private RemovePunctuationLeft() {
        super(RuleType.RP);
    }

    /** Applies when the left category is LQU or LRB; the right category is not inspected. */
    @Override
    public boolean canApply(Category left, Category right) {
        if (left == Category.LQU) {
            return true;
        }
        return left == Category.LRB;
    }

    /** The result is the right category, returned unchanged. */
    @Override
    public Category apply(Category left, Category right) {
        return right;
    }

    /** The head is never the left (punctuation) child. */
    @Override
    public boolean headIsLeft(Category left, Category right) {
        return false;
    }
}

/**
 * A fixed rule: when the left and right categories match the stored ones, it yields the
 * stored result category with the stored head direction. Registered under RuleType.NOISE.
 */
private static class SpecialCombinator extends Combinator {
    private final Category left;
    private final Category right;
    private final Category result;
    private final boolean headIsLeft;

    private SpecialCombinator(Category left, Category right, Category result, boolean headIsLeft) {
        super(RuleType.NOISE);
        this.headIsLeft = headIsLeft;
        this.result = result;
        this.right = right;
        this.left = left;
    }

    /** True only if both stored categories match their respective arguments. */
    @Override
    public boolean canApply(Category left, Category right) {
        // Check the left side first; the right side is only consulted if it matches.
        if (!this.left.matches(left)) {
            return false;
        }
        return this.right.matches(right);
    }

    /** Always yields the stored result, regardless of the arguments. */
    @Override
    public Category apply(Category left, Category right) {
        return this.result;
    }

    /** Reports the head direction stored at construction. */
    @Override
    public boolean headIsLeft(Category left, Category right) {
        return this.headIsLeft;
    }
}

private static class ForwardApplication extends Combinator {
private ForwardApplication()
{
Expand Down
2 changes: 1 addition & 1 deletion src/uk/ac/ed/easyccg/syntax/InputReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ public String getWordsAsString()
result.append(word.word + " ");
}

return words.toString().trim();
return result.toString().trim();
}

public static InputToParser fromTokens(List<String> tokens) {
Expand Down
15 changes: 12 additions & 3 deletions src/uk/ac/ed/easyccg/syntax/ParserAStar.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class ParserAStar implements Parser

public ParserAStar(Tagger tagger, int maxSentenceLength, int nbest, double nbestBeam, InputFormat inputFormat, List<String> validRootCategories,
File unaryRulesFile,
File extraCombinatorsFile,
File seenRulesFile)
throws IOException {
this.tagger = tagger;
Expand All @@ -45,7 +46,14 @@ public ParserAStar(Tagger tagger, int maxSentenceLength, int nbest, double nbest
this.unaryRules = loadUnaryRules(unaryRulesFile);
this.seenRules = new SeenRules(seenRulesFile);
this.nbestBeam = Math.log(nbestBeam);


List<Combinator> combinators = new ArrayList<Combinator>(Combinator.STANDARD_COMBINATORS);

if (extraCombinatorsFile.exists()) {
combinators.addAll(Combinator.loadSpecialCombinators(extraCombinatorsFile));
}
this.binaryRules = ImmutableList.copyOf(combinators);

List<Category> cats = new ArrayList<Category>();
for (String cat : validRootCategories) {
cats.add(Category.valueOf(cat));
Expand All @@ -55,6 +63,7 @@ public ParserAStar(Tagger tagger, int maxSentenceLength, int nbest, double nbest

private final int maxLength;

private final Collection<Combinator> binaryRules;
private final Multimap<Category, Category> unaryRules;
private final int nbest;
private final double nbestBeam;
Expand Down Expand Up @@ -260,7 +269,7 @@ public int compareTo(AgendaItem o)
}

if (result == 0) {
// All other things being equal, prefer the parse with shorter dependencies (i.e. right-branching).
// All other things being equal, it works best to prefer parses with longer dependencies (i.e. non-local attachment).
return parse.totalDependencyLength - o.parse.totalDependencyLength;
} else {
return result;
Expand Down Expand Up @@ -443,7 +452,7 @@ private Collection<RuleProduction> getRules(Category left, Category right) {

Collection<RuleProduction> result = rightToRules.get(right);
if (result == null) {
result = Combinator.getRules(left, right);
result = Combinator.getRules(left, right, binaryRules);
rightToRules.put(right, ImmutableList.copyOf(result));
}

Expand Down
28 changes: 27 additions & 1 deletion src/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,19 @@ SyntaxTreeNodeLeaf getHead()
{
return headIsLeft ? leftChild.getHead() : rightChild.getHead();
}
/**
 * Combines the children's dimensions: one more than their common value when they are
 * equal, otherwise the larger of the two.
 */
@Override
int dimension()
{
    final int leftDim = leftChild.dimension();
    final int rightDim = rightChild.dimension();
    return leftDim == rightDim ? leftDim + 1 : Math.max(leftDim, rightDim);
}

}

Expand Down Expand Up @@ -127,6 +140,11 @@ SyntaxTreeNodeLeaf getHead()
{
return this;
}
/** A leaf has dimension 0. */
@Override
int dimension()
{
return 0;
}
}

static class SyntaxTreeNodeUnary extends SyntaxTreeNode {
Expand Down Expand Up @@ -162,12 +180,20 @@ SyntaxTreeNodeLeaf getHead()
{
return child.getHead();
}

/** A unary node has the same dimension as its single child. */
@Override
int dimension()
{
return child.dimension();
}
}

/**
 * Renders this tree using ParsePrinter.CCGBANK_PRINTER.
 *
 * @return the printer's output for this node (called with -1 as its second argument;
 *         NOTE(review): the meaning of -1 is defined by ParsePrinter — confirm there)
 */
@Override
public String toString() {
    return ParsePrinter.CCGBANK_PRINTER.print(this, -1);
}

/**
 * Returns an integer measure of the subtree rooted at this node. Each node type supplies
 * its own implementation.
 */
abstract int dimension();

/**
 * Returns the stored head index of this node.
 * NOTE(review): presumably the sentence position of the head word — confirm against where headIndex is assigned.
 */
public int getHeadIndex() {
return headIndex;
}
Expand All @@ -181,7 +207,7 @@ public int compareTo(SyntaxTreeNode o)
/**
* Factory for SyntaxTreeNode. Using a factory so we can have different hashing/caching behaviour when N-best parsing.
*/
static class SyntaxTreeNodeFactory {
public static class SyntaxTreeNodeFactory {
private final int[][] categoryHash;
private final int[][] dependencyHash;
private final boolean hashWords;
Expand Down
Loading

0 comments on commit 00e1f9a

Please sign in to comment.