Skip to content

Commit

Permalink
Unite stemmer and stopword filters into whitespace tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
piroor committed Jun 30, 2017
1 parent 60f9b1d commit f256337
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
7 changes: 1 addition & 6 deletions lib/classifier-reborn/extensions/hasher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,7 @@ module Hasher
# Return a Hash of strings => ints. Each word in the string is stemmed,
# interned, and indexes to its frequency in the document.
def word_hash(str, language = 'en', enable_stemmer = true, clean: false)
words = Tokenizer::Whitespace.tokenize(str, clean: clean)
words = TokenFilter::Stopword.filter(words, language: language)
if enable_stemmer
words = TokenFilter::Stemmer.filter(words, language: language)
end

words = Tokenizer::Whitespace.tokenize(str, language: language, enable_stemmer: enable_stemmer, clean: clean)
d = Hash.new(0)
words.each do |word|
d[word.intern] += 1
Expand Down
12 changes: 8 additions & 4 deletions lib/classifier-reborn/extensions/tokenizer/whitespace.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@ module Tokenizer
module Whitespace
module_function

# Splits +str+ into Token instances and runs the configured token filters.
#
# Word characters are lowercased and emitted as stemmable, possibly-stopword
# tokens. Unless +clean+ is true, every non-word, non-space character is kept
# as its own non-stemmable, non-stopword token. Stopwords for +language+ are
# then filtered out, and the survivors are stemmed when +enable_stemmer+ is
# set.
#
# str            - the String to tokenize
# language       - stopword/stemmer language code (default: 'en')
# enable_stemmer - apply the stemmer filter when true (default: true)
# clean          - drop symbol characters entirely when true (default: false)
#
# Returns an Array of Token objects.
def tokenize(str, language: 'en', enable_stemmer: true, clean: false)
  normalized = str.gsub(/[^\p{WORD}\s]/, '').downcase
  tokens = normalized.split.map do |word|
    Token.new(word, stemmable: true, maybe_stopword: true)
  end
  unless clean
    # Preserve punctuation/symbols as standalone tokens; they are never
    # stemmed and never treated as stopwords.
    str.scan(/[^\s\p{WORD}]/).each do |symbol|
      tokens << Token.new(symbol, stemmable: false, maybe_stopword: false)
    end
  end
  tokens = TokenFilter::Stopword.filter(tokens, language: language)
  tokens = TokenFilter::Stemmer.filter(tokens, language: language) if enable_stemmer
  tokens
end
end
end
Expand Down

0 comments on commit f256337

Please sign in to comment.