lib/fts_lite/tokenizer.rb in fts_lite-0.1.2 vs lib/fts_lite/tokenizer.rb in fts_lite-0.1.3

- old
+ new

@@ -4,11 +4,12 @@
 module FtsLite
   module Tokenizer
     QUERY_DELIMITER = /[\s　]+/
     SIMPLE_DELIMITER = /[\s　\.\*"',\?!;\(\)。、.,?!「」『』()]+/
-    NEAR = " NEAR/2 "
+    NEAR0 = " NEAR/0 "
+    NEAR2 = " NEAR/2 "
     def self.create(name)
       case name.to_sym
       when :simple
         Simple.new
@@ -26,27 +27,29 @@
     end
     def self.normalize(text)
       NKF::nkf('-wZX', text).downcase
     end
     class Simple
-      def query(text)
+      def query(text, options)
         vector(text)
       end
       def vector(text)
         split(text).join(" ")
       end
       def split(text)
         Tokenizer.normalize(text).split(SIMPLE_DELIMITER)
       end
     end
     class Bigram
-      def query(text)
+      def query(text, options = {})
+        fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+        near = fuzzy ? NEAR2 : NEAR0
         text = Tokenizer.normalize(text)
         text.split(QUERY_DELIMITER).map {|segment|
           segment.split(SIMPLE_DELIMITER).map {|word|
             0.upto(word.size - 2).map {|i| word[i, 2] }
-          }.join(NEAR)
+          }.join(near)
         }.flatten.join(" ")
       end
       def vector(text)
         split(text).join(" ")
       end
@@ -56,16 +59,18 @@
           0.upto(word.size - 2).map {|i| word[i, 2] }
         }.flatten
       end
     end
     class Trigram
-      def query(text)
+      def query(text, options = {})
+        fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+        near = fuzzy ? NEAR2 : NEAR0
         text = Tokenizer.normalize(text)
         text.split(QUERY_DELIMITER).map {|segment|
           segment.split(SIMPLE_DELIMITER).map {|word|
             0.upto(word.size - 3).map {|i| word[i, 3] }
-          }.join(NEAR)
+          }.join(near)
         }.flatten.join(" ")
       end
       def vector(text)
         split(text).join(" ")
       end
@@ -75,16 +80,18 @@
           0.upto(word.size - 3).map {|i| word[i, 3] }
         }.flatten
       end
     end
     class Wakachi
-      def query(text)
+      def query(text, options = {})
+        fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+        near = fuzzy ? NEAR2 : NEAR0
         text = Tokenizer.normalize(text)
         text.split(QUERY_DELIMITER).map {|segment|
           BimyouSegmenter.segment(segment,
                                   :white_space => false,
-                                  :symbol => false).join(NEAR)
+                                  :symbol => false).join(near)
         }.join(" ")
       end
       def vector(text)
         split(text).join(" ")
       end
@@ -93,21 +100,23 @@
                                 :white_space => false,
                                 :symbol => false)
       end
     end
     class WakachiBigram
-      def query(text)
+      def query(text, options = {})
+        fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+        near = fuzzy ? NEAR2 : NEAR0
         text = Tokenizer.normalize(text)
         text.split(QUERY_DELIMITER).map {|segment|
           BimyouSegmenter.segment(segment,
                                   :white_space => false,
                                   :symbol => false).map {|word|
            if (word.size == 1)
              word
            else
-              0.upto(word.size - 2).map {|i| word[i, 2] }.join(NEAR)
+              0.upto(word.size - 2).map {|i| word[i, 2] }.join(near)
            end
-          }.flatten.join(NEAR)
+          }.flatten.join(near)
         }.join(" ")
       end
       def vector(text)
         split(text).join(" ")
       end
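
Summary of the change: every tokenizer's query method now takes an options hash, and the n-grams of a word are joined with NEAR/0 (strictly adjacent) by default instead of the old NEAR/2; passing :fuzzy => true selects NEAR2 and restores the 0.1.2 proximity behaviour. A minimal sketch of the new interface follows. It assumes the gem loads via require 'fts_lite' and that Tokenizer.create accepts :bigram (only the :simple branch is visible in the hunks above); the sample string and the expected return values are illustrative, not taken from the gem's own tests.

  require 'fts_lite'

  # Assumption: :bigram is a valid name for Tokenizer.create, per the class names above.
  bigram = FtsLite::Tokenizer.create(:bigram)

  # 0.1.3 default: bigrams of each word are joined with NEAR/0, i.e. must be adjacent.
  bigram.query("hello world")
  # => "he NEAR/0 el NEAR/0 ll NEAR/0 lo wo NEAR/0 or NEAR/0 rl NEAR/0 ld"

  # :fuzzy => true switches to NEAR/2, the looser matching that 0.1.2 always used.
  bigram.query("hello world", :fuzzy => true)
  # => "he NEAR/2 el NEAR/2 ll NEAR/2 lo wo NEAR/2 or NEAR/2 rl NEAR/2 ld"

NEAR/0 and NEAR/2 are SQLite FTS NEAR operators: the number is the maximum count of tokens allowed between the two terms, so NEAR/0 requires the n-grams to be adjacent while NEAR/2 tolerates up to two intervening tokens, trading precision for recall.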