Add a generic, Yomu-backed document reader and make it the default reader.

NOTE(review): the line structure and indentation of this patch were restored
from a whitespace-collapsed copy. Verify the indentation of context lines
against the target tree, or apply with `git apply --ignore-whitespace`.
The new reader in document.rb was extended (nil-safe MIME type lookup), so the
blob id on its `index` line no longer corresponds to the file content.

diff --git a/Gemfile b/Gemfile
index d3be8560..00b57f32 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,6 +4,7 @@ gemspec
 
 gem 'birch'
 gem 'schiphol'
+gem 'yomu'
 
 group :test do
   gem 'rspec'
diff --git a/lib/treat/config/data/languages/german.rb b/lib/treat/config/data/languages/german.rb
index ff013246..ed231091 100755
--- a/lib/treat/config/data/languages/german.rb
+++ b/lib/treat/config/data/languages/german.rb
@@ -1,3 +1,5 @@
+#encoding: UTF-8
+
 {
   dependencies: [
     'punkt-segmenter',
diff --git a/lib/treat/workers/formatters/readers/autoselect.rb b/lib/treat/workers/formatters/readers/autoselect.rb
index 20f8ab1b..f3dab48f 100644
--- a/lib/treat/workers/formatters/readers/autoselect.rb
+++ b/lib/treat/workers/formatters/readers/autoselect.rb
@@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
   ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
   ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
   DefaultOptions = {
-    :default_to => 'txt'
+    :default_to => 'document'
   }
 
   # Choose a reader to use.
diff --git a/lib/treat/workers/formatters/readers/document.rb b/lib/treat/workers/formatters/readers/document.rb
new file mode 100644
index 00000000..a40a9478
--- /dev/null
+++ b/lib/treat/workers/formatters/readers/document.rb
@@ -0,0 +1,27 @@
+require 'yomu'
+
+# This class is a wrapper for Yomu.
+# Yomu is a library for extracting text and metadata from files and documents
+# using the Apache Tika content analysis toolkit.
+class Treat::Workers::Formatters::Readers::Document
+  # Extract the readable text from any document.
+  #
+  # Stores the extracted text as the document's value and records
+  # the first file extension of the detected MIME type as :format
+  # (nil when no MIME type or extension could be resolved).
+  #
+  # Options: none.
+  def self.read(document, options = {})
+    yomu = Yomu.new(document.file)
+
+    document.value = yomu.text
+
+    # yomu.mimetype can be nil when the detected content type is
+    # not in the MIME registry; guard so that such files are still
+    # read instead of raising NoMethodError.
+    mimetype = yomu.mimetype
+    extension = mimetype && mimetype.extensions.first
+    document.set :format, extension
+    document
+  end
+end
diff --git a/spec/entities/document.rb b/spec/entities/document.rb
index f8c34e00..7a15b465 100644
--- a/spec/entities/document.rb
+++ b/spec/entities/document.rb
@@ -9,10 +9,10 @@ module Treat::Specs::Entities
       it "opens the file and reads its " +
       "content into a document" do
         f = Treat.paths.spec +
-        'workers/examples/english/mathematicians/leibniz.txt'
+        'workers/examples/english/mathematicians/pythagoras.docx'
         d = Treat::Entities::Document.build(f)
         d.should be_an_instance_of Treat::Entities::Document
-        d.to_s.index('Gottfried Leibniz').should_not eql nil
+        d.to_s.index('Pythagoras of Samos').should_not eql nil
       end
 
     end
diff --git a/spec/workers/examples/english/mathematicians/pythagoras.docx b/spec/workers/examples/english/mathematicians/pythagoras.docx
new file mode 100644
index 00000000..340dd6a2
Binary files /dev/null and b/spec/workers/examples/english/mathematicians/pythagoras.docx differ
diff --git a/treat.gemspec b/treat.gemspec
index 56921df5..aa8d8527 100644
--- a/treat.gemspec
+++ b/treat.gemspec
@@ -25,6 +25,7 @@ Gem::Specification.new do |s|
   # Runtime dependencies
   s.add_runtime_dependency 'schiphol'
   s.add_runtime_dependency 'birch'
+  s.add_runtime_dependency 'yomu'
 
   # Development dependencies
   s.add_development_dependency 'rspec'
@@ -36,4 +37,4 @@ Gem::Specification.new do |s|
   To complete the installation, run `require treat` in an IRB terminal,
   followed by `Treat::Core::Installer.install`.
   }
-end
\ No newline at end of file
+end