diff --git a/bin/uk/ac/ed/easyccg/main/EasyCCG$CommandLineArguments.class b/bin/uk/ac/ed/easyccg/main/EasyCCG$CommandLineArguments.class new file mode 100644 index 0000000..1985b86 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/main/EasyCCG$CommandLineArguments.class differ diff --git a/bin/uk/ac/ed/easyccg/main/EasyCCG$InputFormat.class b/bin/uk/ac/ed/easyccg/main/EasyCCG$InputFormat.class new file mode 100644 index 0000000..f359372 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/main/EasyCCG$InputFormat.class differ diff --git a/bin/uk/ac/ed/easyccg/main/EasyCCG$OutputFormat.class b/bin/uk/ac/ed/easyccg/main/EasyCCG$OutputFormat.class new file mode 100644 index 0000000..85c4893 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/main/EasyCCG$OutputFormat.class differ diff --git a/bin/uk/ac/ed/easyccg/main/EasyCCG.class b/bin/uk/ac/ed/easyccg/main/EasyCCG.class new file mode 100644 index 0000000..1fe8411 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/main/EasyCCG.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Category$AtomicCategory.class b/bin/uk/ac/ed/easyccg/syntax/Category$AtomicCategory.class index 7302fae..d20ef1d 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Category$AtomicCategory.class and b/bin/uk/ac/ed/easyccg/syntax/Category$AtomicCategory.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Category$FunctorCategory.class b/bin/uk/ac/ed/easyccg/syntax/Category$FunctorCategory.class index 745fcef..25ba902 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Category$FunctorCategory.class and b/bin/uk/ac/ed/easyccg/syntax/Category$FunctorCategory.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Category.class b/bin/uk/ac/ed/easyccg/syntax/Category.class index 0a583de..86c639f 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Category.class and b/bin/uk/ac/ed/easyccg/syntax/Category.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardApplication.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardApplication.class index ec930f7..79e6cf9 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardApplication.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardApplication.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardComposition.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardComposition.class index f985998..39b4da2 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardComposition.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$BackwardComposition.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$Conjunction.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$Conjunction.class index 7a00f8a..7bee92c 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$Conjunction.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$Conjunction.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardApplication.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardApplication.class index 7b2da9b..e932e5e 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardApplication.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardApplication.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardComposition.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardComposition.class index 8673e90..e4f57b7 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardComposition.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$ForwardComposition.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedBackwardComposition.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedBackwardComposition.class index 5218cc8..196da76 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedBackwardComposition.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedBackwardComposition.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedForwardComposition.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedForwardComposition.class index 8d074a7..f5fe464 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedForwardComposition.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$GeneralizedForwardComposition.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuation.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuation.class index 646960b..9c5bb80 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuation.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuation.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuationLeft.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuationLeft.class new file mode 100644 index 0000000..24ee517 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/syntax/Combinator$RemovePunctuationLeft.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator$SpecialCombinator.class b/bin/uk/ac/ed/easyccg/syntax/Combinator$SpecialCombinator.class new file mode 100644 index 0000000..616b404 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/syntax/Combinator$SpecialCombinator.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/Combinator.class b/bin/uk/ac/ed/easyccg/syntax/Combinator.class index 17eefc6..fe474cd 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/Combinator.class and b/bin/uk/ac/ed/easyccg/syntax/Combinator.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/InputReader$InputToParser.class b/bin/uk/ac/ed/easyccg/syntax/InputReader$InputToParser.class index 1127f26..640c2fc 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/InputReader$InputToParser.class and b/bin/uk/ac/ed/easyccg/syntax/InputReader$InputToParser.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/InputReader$SuperTaggedInputReader.class b/bin/uk/ac/ed/easyccg/syntax/InputReader$SuperTaggedInputReader.class index 6a8a174..05fa2f9 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/InputReader$SuperTaggedInputReader.class and b/bin/uk/ac/ed/easyccg/syntax/InputReader$SuperTaggedInputReader.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/InputReader.class b/bin/uk/ac/ed/easyccg/syntax/InputReader.class index 6906f2f..35e11ac 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/InputReader.class and b/bin/uk/ac/ed/easyccg/syntax/InputReader.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$1.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$1.class index 399cb1c..e1ec088 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$1.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$1.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$AgendaItem.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$AgendaItem.class index c3d703d..db091f6 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$AgendaItem.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$AgendaItem.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$Cell1Best.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$Cell1Best.class index 9a43c62..5bef352 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$Cell1Best.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$Cell1Best.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$CellNBest.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$CellNBest.class index 5e8706c..b006048 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$CellNBest.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$CellNBest.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$ChartCell.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$ChartCell.class index 5996056..939dbc5 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$ChartCell.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$ChartCell.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$GetSupertagsVisitor.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$GetSupertagsVisitor.class index e6116cc..9684244 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$GetSupertagsVisitor.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$GetSupertagsVisitor.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SeenRules.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SeenRules.class index 119d26b..2838cb1 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SeenRules.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SeenRules.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SuperTaggingResults.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SuperTaggingResults.class index ff1768a..41f42bd 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SuperTaggingResults.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar$SuperTaggingResults.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/ParserAStar.class b/bin/uk/ac/ed/easyccg/syntax/ParserAStar.class index 46fbc52..92046cc 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/ParserAStar.class and b/bin/uk/ac/ed/easyccg/syntax/ParserAStar.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeBinary.class b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeBinary.class index 7b60f24..43cf411 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeBinary.class and b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeBinary.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeFactory.class b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeFactory.class index 24c9b59..f7a652f 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeFactory.class and b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeFactory.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeLeaf.class b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeLeaf.class index 74ea7d4..66f3c29 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeLeaf.class and b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeLeaf.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeUnary.class b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeUnary.class index d61ab10..e1b5e34 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeUnary.class and b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode$SyntaxTreeNodeUnary.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.class b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.class index ac1fa1e..fce9813 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.class and b/bin/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/TagDict$1.class b/bin/uk/ac/ed/easyccg/syntax/TagDict$1.class new file mode 100644 index 0000000..4cd3dd1 Binary files /dev/null and b/bin/uk/ac/ed/easyccg/syntax/TagDict$1.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/TagDict.class b/bin/uk/ac/ed/easyccg/syntax/TagDict.class new file mode 100644 index 0000000..66d9c3e Binary files /dev/null and b/bin/uk/ac/ed/easyccg/syntax/TagDict.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings$ScoredCategory.class b/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings$ScoredCategory.class index 05e529a..5f8f27a 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings$ScoredCategory.class and b/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings$ScoredCategory.class differ diff --git a/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.class b/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.class index 06acd34..ee496de 100644 Binary files a/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.class and b/bin/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.class differ diff --git a/easyccg.jar b/easyccg.jar index 4bfa14c..0f2ef54 100644 Binary files a/easyccg.jar and b/easyccg.jar differ diff --git a/src/uk/ac/ed/easyccg/main/EasyCCG.java b/src/uk/ac/ed/easyccg/main/EasyCCG.java index 08711f8..42ddc98 100644 --- a/src/uk/ac/ed/easyccg/main/EasyCCG.java +++ b/src/uk/ac/ed/easyccg/main/EasyCCG.java @@ -3,17 +3,23 @@ import java.io.File; import java.io.IOException; import java.text.DecimalFormat; +import java.util.Collection; import java.util.InputMismatchException; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Scanner; import java.util.concurrent.TimeUnit; +import uk.ac.ed.easyccg.syntax.Category; +import uk.ac.ed.easyccg.syntax.InputReader; import uk.ac.ed.easyccg.syntax.ParsePrinter; +import uk.ac.ed.easyccg.syntax.Parser; import uk.ac.ed.easyccg.syntax.ParserAStar; import uk.ac.ed.easyccg.syntax.ParserAStar.SuperTaggingResults; -import uk.ac.ed.easyccg.syntax.Parser; import uk.ac.ed.easyccg.syntax.SyntaxTreeNode; +import uk.ac.ed.easyccg.syntax.SyntaxTreeNode.SyntaxTreeNodeFactory; +import uk.ac.ed.easyccg.syntax.TagDict; import uk.ac.ed.easyccg.syntax.TaggerEmbeddings; import uk.co.flamingpenguin.jewel.cli.ArgumentValidationException; import uk.co.flamingpenguin.jewel.cli.CliFactory; @@ -53,11 +59,18 @@ public interface CommandLineArguments @Option(shortName="s", description = "(Optional) Allow rules not involving category combinations seen in CCGBank. Slows things down by around 20%.") boolean getUnrestrictedRules(); + @Option(defaultValue="0.0001", description = "(Optional) Prunes lexical categories whose probability is less than this ratio of the best category. Defaults to 0.0001.") + double getSupertaggerbeam(); + @Option(defaultValue="0.0", description = "(Optional) If using N-best parsing, filter parses whose probability is lower than this fraction of the probability of the best parse. Defaults to 0.0") - double getNbestBeam(); + double getNbestbeam(); @Option(helpRequest = true, description = "Display this message", shortName = "h") boolean getHelp(); + + @Option(description = "(Optional) Make a tag dictionary") + boolean getMakeTagDict(); + } public enum InputFormat { @@ -78,17 +91,26 @@ public static void main(String[] args) throws IOException, ArgumentValidationExc CommandLineArguments parsedArgs = CliFactory.parseArguments(CommandLineArguments.class, args); InputFormat input = InputFormat.valueOf(parsedArgs.getInputFormat().toUpperCase()); + if (parsedArgs.getMakeTagDict()) { + InputReader reader = InputReader.make(input, new SyntaxTreeNodeFactory(parsedArgs.getMaxLength(), 0)); + Map> tagDict = TagDict.makeDict(reader.readFile(parsedArgs.getInputFile())); + TagDict.writeTagDict(tagDict, parsedArgs.getModel()); + System.exit(0); + } + + if (!parsedArgs.getModel().exists()) throw new InputMismatchException("Couldn't load model from from: " + parsedArgs.getModel()); System.err.println("Loading model..."); Parser parser = new ParserAStar( - new TaggerEmbeddings(parsedArgs.getModel(), parsedArgs.getMaxLength()), + new TaggerEmbeddings(parsedArgs.getModel(), parsedArgs.getMaxLength(), parsedArgs.getSupertaggerbeam()), parsedArgs.getMaxLength(), parsedArgs.getNbest(), - parsedArgs.getNbestBeam(), + parsedArgs.getNbestbeam(), input, parsedArgs.getRootCategories(), new File(parsedArgs.getModel(), "unaryRules"), + new File(parsedArgs.getModel(), "binaryRules"), parsedArgs.getUnrestrictedRules() ? null : new File(parsedArgs.getModel(), "seenRules") ); @@ -98,6 +120,8 @@ public static void main(String[] args) throws IOException, ArgumentValidationExc if ((outputFormat == OutputFormat.PROLOG || outputFormat == OutputFormat.EXTENDED) && input != InputFormat.POSANDNERTAGGED) throw new Error("Must use \"-i POSandNERtagged\" for this output"); if (!parsedArgs.getInputFile().getName().isEmpty()) { + System.err.println("Parsing..."); + // Read from file Stopwatch t = Stopwatch.createStarted(); SuperTaggingResults results = new SuperTaggingResults(); @@ -115,7 +139,7 @@ public static void main(String[] args) throws IOException, ArgumentValidationExc } System.err.println("Sentences parsed: " + results.parsedSentences); - System.err.println("Speed: " + twoDP.format((double) results.parsedSentences / t.elapsed(TimeUnit.SECONDS)) + " sentences per second"); + System.err.println("Speed: " + twoDP.format(1000.0 * results.parsedSentences / t.elapsed(TimeUnit.MILLISECONDS)) + " sentences per second"); } else { // Read from stdin Scanner sc = new Scanner(System.in,"UTF-8"); diff --git a/src/uk/ac/ed/easyccg/syntax/Category.java b/src/uk/ac/ed/easyccg/syntax/Category.java index 84b62b0..075c766 100644 --- a/src/uk/ac/ed/easyccg/syntax/Category.java +++ b/src/uk/ac/ed/easyccg/syntax/Category.java @@ -8,7 +8,7 @@ import com.google.common.collect.ImmutableSet; -abstract class Category { +public abstract class Category { private final String cat; private final int id; private final static String WILDCARD_FEATURE = "X"; @@ -62,6 +62,8 @@ public boolean matches(Slash other) public static final Category SEMICOLON = Category.valueOf(";"); public static final Category CONJ = Category.valueOf("conj"); public final static Category N = valueOf("N"); + public static final Category LQU = Category.valueOf("LQU"); + public static final Category LRB = Category.valueOf("LRB"); public static Category valueOf(String cat) { diff --git a/src/uk/ac/ed/easyccg/syntax/Combinator.java b/src/uk/ac/ed/easyccg/syntax/Combinator.java index 6f2478d..2dcb678 100644 --- a/src/uk/ac/ed/easyccg/syntax/Combinator.java +++ b/src/uk/ac/ed/easyccg/syntax/Combinator.java @@ -1,13 +1,13 @@ package uk.ac.ed.easyccg.syntax; +import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import uk.ac.ed.easyccg.syntax.Category.Slash; -import com.google.common.collect.ImmutableList; - public abstract class Combinator { public enum RuleType { @@ -33,7 +33,7 @@ public RuleProduction(RuleType ruleType, Category result, final boolean headIsLe public abstract boolean headIsLeft(Category left, Category right); - public final static Collection combinators = ImmutableList.copyOf(Arrays.asList( + public final static Collection STANDARD_COMBINATORS = new ArrayList(Arrays.asList( new ForwardApplication(), new BackwardApplication(), new ForwardComposition(Slash.FWD, Slash.FWD, Slash.FWD), @@ -41,9 +41,33 @@ public RuleProduction(RuleType ruleType, Category result, final boolean headIsLe new GeneralizedForwardComposition(Slash.FWD, Slash.FWD, Slash.FWD), new GeneralizedBackwardComposition(Slash.FWD, Slash.BWD, Slash.FWD), new Conjunction(), - new RemovePunctuation(false) + new RemovePunctuation(false), + new RemovePunctuationLeft() )); + public static Collection loadSpecialCombinators(File file) throws IOException { + Collection newCombinators = new ArrayList(); + for (String line : Util.readFile(file)) { + // l , S[to]\NP NP\NP + if (line.indexOf("#") > -1) { + line = line.substring(0, line.indexOf("#")); + } + + line = line.trim(); + if (line.isEmpty()) { + continue ; + } + + String[] fields = line.split(" "); + boolean headIsLeft = fields[0].equals("l"); + Category left = Category.valueOf(fields[1]); + Category right = Category.valueOf(fields[2]); + Category result = Category.valueOf(fields[3]); + newCombinators.add(new SpecialCombinator(left, right, result, headIsLeft)); + } + return newCombinators; + } + private final RuleType ruleType; public abstract boolean canApply(Category left, Category right); @@ -62,9 +86,9 @@ private static Category correctWildCardFeatures(Category toCorrect, Category mat /** * Returns a set of rules that can be applied to a pair of categories. */ - static Collection getRules(Category left, Category right) { + static Collection getRules(Category left, Category right, Collection rules) { Collection result = new ArrayList(2); - for (Combinator c : combinators) { + for (Combinator c : rules) { if (c.canApply(left, right)) { result.add(new RuleProduction(c.ruleType, c.apply(left, right), c.headIsLeft(left, right))); } @@ -139,8 +163,69 @@ public boolean headIsLeft(Category left, Category right) return !punctuationIsLeft; } } + + /** + * Open Brackets and Quotations + */ + private static class RemovePunctuationLeft extends Combinator { + private RemovePunctuationLeft() + { + super(RuleType.RP); + } + + @Override + public boolean canApply(Category left, Category right) + { + return left == Category.LQU || left == Category.LRB; + } + @Override + public Category apply(Category left, Category right) + { + return right; + } + + @Override + public boolean headIsLeft(Category left, Category right) + { + return false; + } + } + private static class SpecialCombinator extends Combinator { + private final Category left; + private final Category right; + private final Category result; + private final boolean headIsLeft; + + private SpecialCombinator(Category left, Category right, Category result, boolean headIsLeft) + { + super(RuleType.NOISE); + this.left = left; + this.right = right; + this.result = result; + this.headIsLeft = headIsLeft; + } + + @Override + public boolean canApply(Category left, Category right) + { + return this.left.matches(left) && this.right.matches(right); + } + + @Override + public Category apply(Category left, Category right) + { + return result; + } + + @Override + public boolean headIsLeft(Category left, Category right) + { + return headIsLeft; + } + } + private static class ForwardApplication extends Combinator { private ForwardApplication() { diff --git a/src/uk/ac/ed/easyccg/syntax/InputReader.java b/src/uk/ac/ed/easyccg/syntax/InputReader.java index dc3d5d7..704d720 100644 --- a/src/uk/ac/ed/easyccg/syntax/InputReader.java +++ b/src/uk/ac/ed/easyccg/syntax/InputReader.java @@ -128,7 +128,7 @@ public String getWordsAsString() result.append(word.word + " "); } - return words.toString().trim(); + return result.toString().trim(); } public static InputToParser fromTokens(List tokens) { diff --git a/src/uk/ac/ed/easyccg/syntax/ParserAStar.java b/src/uk/ac/ed/easyccg/syntax/ParserAStar.java index c04239a..afecc7d 100644 --- a/src/uk/ac/ed/easyccg/syntax/ParserAStar.java +++ b/src/uk/ac/ed/easyccg/syntax/ParserAStar.java @@ -35,6 +35,7 @@ public class ParserAStar implements Parser public ParserAStar(Tagger tagger, int maxSentenceLength, int nbest, double nbestBeam, InputFormat inputFormat, List validRootCategories, File unaryRulesFile, + File extraCombinatorsFile, File seenRulesFile) throws IOException { this.tagger = tagger; @@ -45,7 +46,14 @@ public ParserAStar(Tagger tagger, int maxSentenceLength, int nbest, double nbest this.unaryRules = loadUnaryRules(unaryRulesFile); this.seenRules = new SeenRules(seenRulesFile); this.nbestBeam = Math.log(nbestBeam); - + + List combinators = new ArrayList(Combinator.STANDARD_COMBINATORS); + + if (extraCombinatorsFile.exists()) { + combinators.addAll(Combinator.loadSpecialCombinators(extraCombinatorsFile)); + } + this.binaryRules = ImmutableList.copyOf(combinators); + List cats = new ArrayList(); for (String cat : validRootCategories) { cats.add(Category.valueOf(cat)); @@ -55,6 +63,7 @@ public ParserAStar(Tagger tagger, int maxSentenceLength, int nbest, double nbest private final int maxLength; + private final Collection binaryRules; private final Multimap unaryRules; private final int nbest; private final double nbestBeam; @@ -260,7 +269,7 @@ public int compareTo(AgendaItem o) } if (result == 0) { - // All other things being equal, prefer the parse with shorter dependencies (i.e. right-branching). + // All other things being equal, it works best to prefer parser with longer dependencies (i.e. non-local attachment). return parse.totalDependencyLength - o.parse.totalDependencyLength; } else { return result; @@ -443,7 +452,7 @@ private Collection getRules(Category left, Category right) { Collection result = rightToRules.get(right); if (result == null) { - result = Combinator.getRules(left, right); + result = Combinator.getRules(left, right, binaryRules); rightToRules.put(right, ImmutableList.copyOf(result)); } diff --git a/src/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.java b/src/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.java index bda9917..db4d629 100644 --- a/src/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.java +++ b/src/uk/ac/ed/easyccg/syntax/SyntaxTreeNode.java @@ -67,6 +67,19 @@ SyntaxTreeNodeLeaf getHead() { return headIsLeft ? leftChild.getHead() : rightChild.getHead(); } + @Override + int dimension() + { + int left = leftChild.dimension(); + int right = rightChild.dimension(); + if (left == right) { + return 1 + left; + } else { + return Math.max(left, right); + } + + + } } @@ -127,6 +140,11 @@ SyntaxTreeNodeLeaf getHead() { return this; } + @Override + int dimension() + { + return 0; + } } static class SyntaxTreeNodeUnary extends SyntaxTreeNode { @@ -162,12 +180,20 @@ SyntaxTreeNodeLeaf getHead() { return child.getHead(); } + + @Override + int dimension() + { + return child.dimension(); + } } public String toString() { return ParsePrinter.CCGBANK_PRINTER.print(this, -1); } + abstract int dimension(); + public int getHeadIndex() { return headIndex; } @@ -181,7 +207,7 @@ public int compareTo(SyntaxTreeNode o) /** * Factory for SyntaxTreeNode. Using a factory so we can have different hashing/caching behaviour when N-best parsing. */ - static class SyntaxTreeNodeFactory { + public static class SyntaxTreeNodeFactory { private final int[][] categoryHash; private final int[][] dependencyHash; private final boolean hashWords; diff --git a/src/uk/ac/ed/easyccg/syntax/TagDict.java b/src/uk/ac/ed/easyccg/syntax/TagDict.java new file mode 100644 index 0000000..f34356f --- /dev/null +++ b/src/uk/ac/ed/easyccg/syntax/TagDict.java @@ -0,0 +1,141 @@ +package uk.ac.ed.easyccg.syntax; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import uk.ac.ed.easyccg.syntax.InputReader.InputToParser; + +import com.google.common.collect.HashMultiset; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multiset; +import com.google.common.collect.Multiset.Entry; + +public class TagDict +{ + + private static final int MIN_OCCURENCES_OF_WORD = 500; + /** + * Key used in the tag dictionary for infrequent words + */ + public static final String OTHER_WORDS = "*other_words*"; + private final static String fileName = "tagdict"; + + /** + * Saves a tag dictionary to the model folder + */ + public static void writeTagDict(Map> tagDict, File modelFolder) throws FileNotFoundException, UnsupportedEncodingException { + PrintWriter writer = new PrintWriter(new File(modelFolder, fileName), "UTF-8"); + for (java.util.Map.Entry> entry : tagDict.entrySet()) { + writer.print(entry.getKey()); + for (Category c : entry.getValue()) { + writer.print("\t" + c.toString()); + } + writer.println(); + } + + writer.close(); + } + + /** + * Loads a tag dictionary from the model folder + */ + public static Map> readDict(File modelFolder) throws IOException { + Map> result = new HashMap>(); + File file = new File(modelFolder, fileName); + if (!file.exists()) return null; + for (String line : Util.readFile(file)) { + String[] fields = line.split("\t"); + List cats = new ArrayList(); + for (int i=1; i> comparator = new Comparator>() + { + @Override + public int compare(Entry arg0, Entry arg1) + { + return arg1.getCount() - arg0.getCount(); + } + }; + + /** + * Finds the set of categories used for each word in a corpus + */ + public static Map> makeDict(Iterable input) { + Multiset wordCounts = HashMultiset.create(); + Map> wordToCatToCount = new HashMap>(); + + // First, count how many times each word occurs with each category + for (InputToParser sentence : input) { + for (int i=0; i tmp = HashMultiset.create(); + wordToCatToCount.put(word, tmp); + } + + wordToCatToCount.get(word).add(cat); + } + } + + + // Now, save off a sorted list of categories + Multiset countsForOtherWords = HashMultiset.create(); + + Map> result = new HashMap>(); + for (Entry wordAndCount : wordCounts.entrySet()) { + Multiset countForCategory = wordToCatToCount.get(wordAndCount.getElement()); + if (wordAndCount.getCount() > MIN_OCCURENCES_OF_WORD) { + // Frequent word + addEntryForWord(countForCategory, result, wordAndCount.getElement()); + } else { + // Group stats for all rare words together. + + for (Entry catToCount : countForCategory.entrySet()) { + countsForOtherWords.add(catToCount.getElement(), catToCount.getCount()); + } + } + } + addEntryForWord(countsForOtherWords, result, OTHER_WORDS); + + + return ImmutableMap.copyOf(result); + } + + private static void addEntryForWord(Multiset countForCategory, + Map> result, String word) + { + List> cats = new ArrayList>(); + for (Entry catToCount : countForCategory.entrySet()) { + cats.add(catToCount); + } + + Collections.sort(cats, comparator); + List cats2 = new ArrayList(); + for (Entry entry : cats) { + cats2.add(entry.getElement()); + } + + result.put(word, cats2); + } +} diff --git a/src/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.java b/src/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.java index abd0fc9..dd344ed 100644 --- a/src/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.java +++ b/src/uk/ac/ed/easyccg/syntax/TaggerEmbeddings.java @@ -4,10 +4,12 @@ import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import no.uib.cipr.matrix.DenseMatrix; import no.uib.cipr.matrix.DenseVector; @@ -16,6 +18,8 @@ import uk.ac.ed.easyccg.syntax.InputReader.InputWord; import uk.ac.ed.easyccg.syntax.SyntaxTreeNode.SyntaxTreeNodeFactory; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.MinMaxPriorityQueue; import com.google.common.io.PatternFilenameFilter; import com.google.common.primitives.Doubles; @@ -49,37 +53,78 @@ public class TaggerEmbeddings implements Tagger private final static String suffixPad="*suffix_pad*"; private final static String unknownSuffix="*unknown_suffix*"; + private final List weightMatrixRows; + /** * Number of supertags to consider for each word. Choosing 50 means it's effectively unpruned, * but saves us having to sort the complete list of categories. */ private final static int beam = 50; + + private final double beta; + + private final SyntaxTreeNodeFactory terminalFactory; + private final Map> tagDict; - public TaggerEmbeddings(File file, int maxSentenceLength) { + public TaggerEmbeddings(File modelFolder, int maxSentenceLength, double beta) { try { FilenameFilter embeddingsFileFilter = new PatternFilenameFilter("embeddings.*"); - embeddingsFeatures = loadEmbeddings(true, file.listFiles(embeddingsFileFilter)); + embeddingsFeatures = loadEmbeddings(true, modelFolder.listFiles(embeddingsFileFilter)); discreteFeatures = new HashMap(); - discreteFeatures.putAll(loadEmbeddings(false, new File(file, "capitals"))); - discreteFeatures.putAll(loadEmbeddings(false, new File(file, "suffix"))); + discreteFeatures.putAll(loadEmbeddings(false, new File(modelFolder, "capitals"))); + discreteFeatures.putAll(loadEmbeddings(false, new File(modelFolder, "suffix"))); totalFeatures = (embeddingsFeatures.get(unknownLower).length + discreteFeatures.get(unknownSuffix).length + discreteFeatures.get(capsLower).length)*(2 * contextWindow + 1); - lexicalCategories = loadCategories(new File(file, "categories")); + lexicalCategories = loadCategories(new File(modelFolder, "categories")); weightMatrix = new DenseMatrix(lexicalCategories.size(), totalFeatures); - loadMatrix(weightMatrix, new File(file, "classifier")); + loadMatrix(weightMatrix, new File(modelFolder, "classifier")); + + weightMatrixRows = new ArrayList(lexicalCategories.size()); + for (int i=0; i catToIndex = new HashMap(); + int maxCategoryID = 0; + int index = 0; for (Category c : lexicalCategories) { maxCategoryID = Math.max(maxCategoryID, c.getID()); + catToIndex.put(c, index); + index++; + } + + Map> dict = TagDict.readDict(modelFolder); + Map> tagDict = new HashMap>(); + if (dict == null) { + dict = new HashMap>(); + dict.put(TagDict.OTHER_WORDS, lexicalCategories); + } + for (Entry> entry : dict.entrySet()) { + List catIndices = new ArrayList(entry.getValue().size()); + for (Category cat : entry.getValue()) { + catIndices.add(catToIndex.get(cat)); + } + tagDict.put(entry.getKey(), ImmutableList.copyOf(catIndices)); } + this.tagDict = ImmutableMap.copyOf(tagDict); + + terminalFactory = new SyntaxTreeNodeFactory(maxSentenceLength, maxCategoryID); - loadVector(bias, new File(file, "bias")); + loadVector(bias, new File(modelFolder, "bias")); + } catch (Exception e) { throw new Error(e); @@ -304,25 +349,39 @@ public int compareTo(ScoredCategory o) */ private List getTagsForWord(final Vector vector, final InputWord word, final int wordIndex) { - // Fixed length priority queue, used to sort candidate tags. - MinMaxPriorityQueue queue = MinMaxPriorityQueue.expectedSize(beam).maximumSize(beam).create(); - double total = 0.0; - Vector tmpVector = bias.copy(); - weightMatrix.multAdd(vector, tmpVector); - for (int i=0; i < weightMatrix.numRows(); i++) { - double score = Math.exp(tmpVector.get(i)); - - if (i < beam || score > queue.peekLast().score) { - // Surprisingly, this 'if' condition makes things faster. - queue.add(new ScoredCategory(i, score)); + Collection cats = tagDict.get(word.word); + if (cats == null) { + cats = tagDict.get(TagDict.OTHER_WORDS); + } + + double threshold = 0.0; + + final int size = Math.min(beam, cats.size()); + // Fixed length priority queue, used to sort candidate tags. + MinMaxPriorityQueue queue = MinMaxPriorityQueue.maximumSize(size).create(); + + double bestScore = 0.0; + + for (Integer cat : cats) { + double score = Math.exp(weightMatrixRows.get(cat).dot(vector) + bias.get(cat)); + if (score >= threshold) { + queue.add(new ScoredCategory(cat, score)); + + if (score > bestScore) { + bestScore = score; + threshold = beta * bestScore; + while (queue.peekLast().score < threshold) { + queue.pollLast(); + } + } } total += score; } - + // Convert the queue into a sorted list of SyntaxTreeNode terminals. - List result = new ArrayList(beam); + List result = new ArrayList(queue.size()); while (queue.size() > 0) { ScoredCategory cat = queue.poll(); double probability = cat.score / total;