lib/fts_lite/tokenizer.rb in fts_lite-0.1.2 vs lib/fts_lite/tokenizer.rb in fts_lite-0.1.3
- old
+ new
@@ -4,11 +4,12 @@
module FtsLite
module Tokenizer
QUERY_DELIMITER = /[\s　]+/
SIMPLE_DELIMITER = /[\s　\.\*"',\?!;\(\)。、．，？！「」『』（）]+/
- NEAR = " NEAR/2 "
+ NEAR0 = " NEAR/0 "
+ NEAR2 = " NEAR/2 "
def self.create(name)
case name.to_sym
when :simple
Simple.new
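
The single NEAR constant (" NEAR/2 ") is split into NEAR0 and NEAR2 so the tokenizers can choose between strict and fuzzy matching. In SQLite FTS3/FTS4, NEAR/N matches terms that have at most N tokens between them, so NEAR/0 requires the joined n-grams to be adjacent while NEAR/2 tolerates small gaps. A minimal sketch of the difference, assuming the sqlite3 gem and an SQLite build with FTS4 enabled (the table and data are made up for illustration):

  require 'sqlite3'

  db = SQLite3::Database.new(':memory:')
  db.execute("CREATE VIRTUAL TABLE docs USING fts4(body)")
  db.execute("INSERT INTO docs (body) VALUES ('he el ll lo')")

  # NEAR/0: the two terms must be adjacent in the indexed text.
  db.execute("SELECT body FROM docs WHERE body MATCH 'he NEAR/0 el'")  # => one row
  db.execute("SELECT body FROM docs WHERE body MATCH 'he NEAR/0 ll'")  # => no rows
  # NEAR/2: up to two other tokens may sit between the terms.
  db.execute("SELECT body FROM docs WHERE body MATCH 'he NEAR/2 ll'")  # => one row
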
@@ -26,27 +27,29 @@
end
def self.normalize(text)
NKF::nkf('-wZX', text).downcase
end
class Simple
- def query(text)
+ def query(text, options)
vector(text)
end
def vector(text)
split(text).join(" ")
end
def split(text)
Tokenizer.normalize(text).split(SIMPLE_DELIMITER)
end
end
class Bigram
- def query(text)
+ def query(text, options = {})
+ fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+ near = fuzzy ? NEAR2 : NEAR0
text = Tokenizer.normalize(text)
text.split(QUERY_DELIMITER).map {|segment|
segment.split(SIMPLE_DELIMITER).map {|word|
0.upto(word.size - 2).map {|i| word[i, 2] }
- }.join(NEAR)
+ }.join(near)
}.flatten.join(" ")
end
def vector(text)
split(text).join(" ")
end
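
With the :fuzzy option threaded through, Bigram#query now emits NEAR/0 by default and only falls back to the looser NEAR/2 when the caller asks for it. A rough sketch of the expected query strings, derived by reading the code above rather than by running it:

  tokenizer = FtsLite::Tokenizer::Bigram.new

  tokenizer.query("hello")
  # => "he NEAR/0 el NEAR/0 ll NEAR/0 lo"   (strict: bigrams must be adjacent)

  tokenizer.query("hello", :fuzzy => true)
  # => "he NEAR/2 el NEAR/2 ll NEAR/2 lo"   (fuzzy: the pre-0.1.3 behaviour)
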
@@ -56,16 +59,18 @@
0.upto(word.size - 2).map {|i| word[i, 2] }
}.flatten
end
end
class Trigram
- def query(text)
+ def query(text, options = {})
+ fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+ near = fuzzy ? NEAR2 : NEAR0
text = Tokenizer.normalize(text)
text.split(QUERY_DELIMITER).map {|segment|
segment.split(SIMPLE_DELIMITER).map {|word|
0.upto(word.size - 3).map {|i| word[i, 3] }
- }.join(NEAR)
+ }.join(near)
}.flatten.join(" ")
end
def vector(text)
split(text).join(" ")
end
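
Trigram gets the identical treatment; whitespace-separated segments stay joined by a plain space, so only the trigrams inside a single word are chained with NEAR. Again a sketch derived from the code, not run:

  tokenizer = FtsLite::Tokenizer::Trigram.new

  tokenizer.query("hello world")
  # => "hel NEAR/0 ell NEAR/0 llo wor NEAR/0 orl NEAR/0 rld"

  tokenizer.query("hello world", :fuzzy => true)
  # => "hel NEAR/2 ell NEAR/2 llo wor NEAR/2 orl NEAR/2 rld"
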
@@ -75,16 +80,18 @@
0.upto(word.size - 3).map {|i| word[i, 3] }
}.flatten
end
end
class Wakachi
- def query(text)
+ def query(text, options = {})
+ fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+ near = fuzzy ? NEAR2 : NEAR0
text = Tokenizer.normalize(text)
text.split(QUERY_DELIMITER).map {|segment|
BimyouSegmenter.segment(segment,
:white_space => false,
- :symbol => false).join(NEAR)
+ :symbol => false).join(near)
}.join(" ")
end
def vector(text)
split(text).join(" ")
end
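
Wakachi joins whole BimyouSegmenter tokens with the chosen NEAR operator. The actual token boundaries depend on BimyouSegmenter, so the segmentation below is only hypothetical:

  tokenizer = FtsLite::Tokenizer::Wakachi.new

  # Supposing BimyouSegmenter splits the input into ["今日", "は", "晴れ"]:
  tokenizer.query("今日は晴れ")                  # => "今日 NEAR/0 は NEAR/0 晴れ"
  tokenizer.query("今日は晴れ", :fuzzy => true)  # => "今日 NEAR/2 は NEAR/2 晴れ"
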
@@ -93,21 +100,23 @@
:white_space => false,
:symbol => false)
end
end
class WakachiBigram
- def query(text)
+ def query(text, options = {})
+ fuzzy = options.key?(:fuzzy) ? options[:fuzzy] : false
+ near = fuzzy ? NEAR2 : NEAR0
text = Tokenizer.normalize(text)
text.split(QUERY_DELIMITER).map {|segment|
BimyouSegmenter.segment(segment,
:white_space => false,
:symbol => false).map {|word|
if (word.size == 1)
word
else
- 0.upto(word.size - 2).map {|i| word[i, 2] }.join(NEAR)
+ 0.upto(word.size - 2).map {|i| word[i, 2] }.join(near)
end
- }.flatten.join(NEAR)
+ }.flatten.join(near)
}.join(" ")
end
def vector(text)
split(text).join(" ")
end
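
WakachiBigram combines both ideas: each segmented word longer than one character is expanded into bigrams, and both the bigrams and the words themselves are chained with the selected NEAR operator. A hedged sketch, again with a hypothetical segmentation of ["検索", "エンジン"]:

  tokenizer = FtsLite::Tokenizer::WakachiBigram.new

  tokenizer.query("検索エンジン")
  # => "検索 NEAR/0 エン NEAR/0 ンジ NEAR/0 ジン"
  tokenizer.query("検索エンジン", :fuzzy => true)
  # => "検索 NEAR/2 エン NEAR/2 ンジ NEAR/2 ジン"

In short, the 0.1.3 default behaves like a phrase search over the generated n-grams, while passing :fuzzy => true restores the looser NEAR/2 matching that 0.1.2 always used.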