From 727a307af0c64747619531c3aa355535edbf4632 Mon Sep 17 00:00:00 2001 From: Louis Mullie Date: Sun, 2 Jun 2013 21:42:56 -0400 Subject: [PATCH] Add basic support for OpenNLP. --- lib/treat/config/data/core.rb | 2 +- lib/treat/config/data/languages/english.rb | 2 +- lib/treat/config/data/libraries.rb | 4 ++ lib/treat/helpers/string.rb | 2 +- lib/treat/loaders/bind_it.rb | 48 +++++++++++++++++ lib/treat/loaders/open_nlp.rb | 12 +++++ lib/treat/loaders/stanford.rb | 53 +++---------------- lib/treat/proxies/proxy.rb | 5 +- lib/treat/workers/groupable.rb | 2 +- .../tokenizers/{maxent.rb => open_nlp.rb} | 9 ++-- spec/helper.rb | 8 ++- 11 files changed, 89 insertions(+), 58 deletions(-) create mode 100644 lib/treat/loaders/bind_it.rb create mode 100644 lib/treat/loaders/open_nlp.rb rename lib/treat/workers/processors/tokenizers/{maxent.rb => open_nlp.rb} (74%) diff --git a/lib/treat/config/data/core.rb b/lib/treat/config/data/core.rb index 9a1f2e9d..a4c3f7d3 100644 --- a/lib/treat/config/data/core.rb +++ b/lib/treat/config/data/core.rb @@ -4,7 +4,7 @@ 'abw', 'doc', 'yaml', 'uea', 'lda', 'pdf', 'ptb', 'dot', 'ai', 'id3', 'svo', 'mlp', - 'svm', 'srx'], + 'svm', 'srx', 'nlp'], encodings: {language_to_code: { diff --git a/lib/treat/config/data/languages/english.rb b/lib/treat/config/data/languages/english.rb index aeaebaff..b51baecc 100755 --- a/lib/treat/config/data/languages/english.rb +++ b/lib/treat/config/data/languages/english.rb @@ -33,7 +33,7 @@ processors: { parsers: [:stanford], segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford], - tokenizers: [:ptb, :stanford, :punkt] + tokenizers: [:ptb, :stanford, :punkt, :open_nlp] } }, stop_words: diff --git a/lib/treat/config/data/libraries.rb b/lib/treat/config/data/libraries.rb index 4da1f47a..ec0faf6b 100644 --- a/lib/treat/config/data/libraries.rb +++ b/lib/treat/config/data/libraries.rb @@ -8,5 +8,9 @@ stanford: { jar_path: nil, model_path: nil + }, + open_nlp: { + jar_path: nil, + model_path: nil } } \ No newline at end of file diff --git a/lib/treat/helpers/string.rb b/lib/treat/helpers/string.rb index a364d054..3f3ab791 100644 --- a/lib/treat/helpers/string.rb +++ b/lib/treat/helpers/string.rb @@ -54,7 +54,7 @@ def camel_case if @@cc_cache[o_phrase] return @@cc_cache[o_phrase] end - if Treat.core.acronyms.include?(phrase) + if Treat.core.acronyms.include?(phrase.downcase) phrase = phrase.upcase else phrase.gsub!(Regex) { |a| a.upcase } diff --git a/lib/treat/loaders/bind_it.rb b/lib/treat/loaders/bind_it.rb new file mode 100644 index 00000000..48125114 --- /dev/null +++ b/lib/treat/loaders/bind_it.rb @@ -0,0 +1,48 @@ +class Treat::Loaders::BindIt + + # Keep track of whether its loaded or not. + @@loaded = {} + + # Load CoreNLP package for a given language. + def self.load(klass, name, language = nil) + + return if @@loaded[klass] + + language ||= Treat.core.language.default + + jar_path = Treat.libraries[name].jar_path || + Treat.paths.bin + "#{name}/" + model_path = Treat.libraries[name].model_path || + Treat.paths.models + "#{name}/" + + if !File.directory?(jar_path) + raise Treat::Exception, "Looking for #{klass} " + + "library JAR files in #{jar_path}, but it is " + + "not a directory. Please set the config option " + + "Treat.libraries.#{name}.jar_path to a folder " + + "containing the appropriate JAR files." + end + + if !File.directory?(model_path) + raise Treat::Exception, "Looking for #{klass} " + + "library model files in #{model_path}, but it " + + "is not a directory. Please set the config option " + + "Treat.libraries.#{name}.model_path to a folder " + + "containing the appropriate JAR files." + end + + klass.jar_path = jar_path + klass.model_path = model_path + klass.use language + + if Treat.core.verbosity.silence + klass.log_file = '/dev/null' + end + + klass.bind + + @@loaded[klass] = true + + end + +end diff --git a/lib/treat/loaders/open_nlp.rb b/lib/treat/loaders/open_nlp.rb new file mode 100644 index 00000000..9988e0fa --- /dev/null +++ b/lib/treat/loaders/open_nlp.rb @@ -0,0 +1,12 @@ +require 'treat/loaders/bind_it' + +# A helper class to load the OpenNLP package. +class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt + + require 'open-nlp' + + def self.load(language = nil) + super(OpenNLP, :open_nlp, language) + end + +end \ No newline at end of file diff --git a/lib/treat/loaders/stanford.rb b/lib/treat/loaders/stanford.rb index be83afd1..5e5e754b 100644 --- a/lib/treat/loaders/stanford.rb +++ b/lib/treat/loaders/stanford.rb @@ -1,53 +1,14 @@ +require 'treat/loaders/bind_it' + # A helper class to load the CoreNLP package. -class Treat::Loaders::Stanford +class Treat::Loaders::Stanford < Treat::Loaders::BindIt - # Keep track of whether its loaded or not. - @@loaded = false + require 'stanford-core-nlp' - # Load CoreNLP package for a given language. def self.load(language = nil) - - return if @@loaded - - language ||= Treat.core.language.default - - jar_path = Treat.libraries.stanford.jar_path || - Treat.paths.bin + 'stanford/' - model_path = Treat.libraries.stanford.model_path || - Treat.paths.models + 'stanford/' - - if !File.directory?(jar_path) - raise Treat::Exception, "Looking for Stanford " + - "CoreNLP JAR files in #{jar_path}, but it is " + - "not a directory. Please set the config option " + - "Treat.libraries.stanford.jar_path to a folder " + - "containing the Stanford JAR files." - end - - if !File.directory?(model_path) - raise Treat::Exception, "Looking for Stanford " + - "CoreNLP model files in #{model_path}, but it " + - "is not a directory. Please set the config option " + - "Treat.libraries.stanford.model_path to a folder " + - "containing the Stanford JAR files." - end - - require 'stanford-core-nlp' - - StanfordCoreNLP.jar_path = jar_path - StanfordCoreNLP.model_path = model_path - StanfordCoreNLP.use(language) - - if Treat.core.verbosity.silence - StanfordCoreNLP.log_file = '/dev/null' - end - - StanfordCoreNLP.bind - - @@loaded = true - + super(StanfordCoreNLP, :stanford, language) end - + def self.find_model(name, language) language = language.intern model_file = StanfordCoreNLP::Config::Models[name][language] @@ -57,4 +18,4 @@ def self.find_model(name, language) File.join(model_path, model_dir, model_file) end -end +end \ No newline at end of file diff --git a/lib/treat/proxies/proxy.rb b/lib/treat/proxies/proxy.rb index b1c9ef76..e44ddbfe 100644 --- a/lib/treat/proxies/proxy.rb +++ b/lib/treat/proxies/proxy.rb @@ -11,14 +11,15 @@ module Proxy def method_missing(sym, *args, &block) if [:do, :apply].include?(sym) || Treat::Workers.lookup(sym) - to_entity.send(sym, *args) + to_entity.send(sym, *args) else super(sym, *args, &block) end end + # Create an unknown type of entity by default. def to_entity(builder = nil) - Treat::Entities::Unknown(self.to_s) + Treat::Entities::Unknown.new(self.to_s) end end diff --git a/lib/treat/workers/groupable.rb b/lib/treat/workers/groupable.rb index 8272f796..cbc98c58 100644 --- a/lib/treat/workers/groupable.rb +++ b/lib/treat/workers/groupable.rb @@ -15,7 +15,7 @@ def const_missing(const) require file if not self.const_defined?(const) raise Treat::Exception, - "File #{file} does not define " + + "File #{file}.rb does not define " + "#{self}::#{const}." end const_get(const) diff --git a/lib/treat/workers/processors/tokenizers/maxent.rb b/lib/treat/workers/processors/tokenizers/open_nlp.rb similarity index 74% rename from lib/treat/workers/processors/tokenizers/maxent.rb rename to lib/treat/workers/processors/tokenizers/open_nlp.rb index 0607be4c..35008b60 100644 --- a/lib/treat/workers/processors/tokenizers/maxent.rb +++ b/lib/treat/workers/processors/tokenizers/open_nlp.rb @@ -1,8 +1,10 @@ # Maximum entropy tokenization supplied by OpenNLP. -class Treat::Workers::Processors::Tokenizers::Maxent +class Treat::Workers::Processors::Tokenizers::OpenNlp require 'open-nlp' - OpenNLP.load + Treat::Loaders::OpenNLP.load + + @@tokenizers = {} # Maximum entropy tokenization. def self.tokenize(entity, options = {}) @@ -20,8 +22,7 @@ def self.tokenize(entity, options = {}) tokens = tokenizer.tokenize(str).to_a tokens.each do |token| - entity << Treat::Entities - ::Token.from_string(chunk) + entity << Treat::Entities::Token.from_string(token) end end diff --git a/spec/helper.rb b/spec/helper.rb index 4a0bc4fe..3ec65f34 100644 --- a/spec/helper.rb +++ b/spec/helper.rb @@ -7,17 +7,21 @@ module Treat::Specs require 'rspec' # Some configuration options for devel. -=begin + Treat.databases.mongo.db = 'treat_test' Treat.libraries.stanford.model_path = '/ruby/stanford-core-nlp-minimal/models/' Treat.libraries.stanford.jar_path = '/ruby/stanford-core-nlp-minimal/bin/' + Treat.libraries.open_nlp.jar_path = + '/ruby/open-nlp-english/bin/' + Treat.libraries.open_nlp.model_path = + '/ruby/open-nlp-english/models/' Treat.libraries.punkt.model_path = '/ruby/punkt/models/' Treat.libraries.reuters.model_path = '/ruby/reuters/models/' -=end + # Mimic the ./lib structure. module Entities; end module Workers; end