lib/fts_lite/tokenizer.rb in fts_lite-0.1.0 vs lib/fts_lite/tokenizer.rb in fts_lite-0.1.1
- old
+ new
@@ -2,11 +2,13 @@
require 'nkf'
require 'bimyou_segmenter'
module FtsLite
  module Tokenizer
-    SIMPLE_DELIMITER = /[\s\.,\?!;\(\)。、.,?!「」『』()]+/
+    QUERY_DELIMITER = /[\s　]+/
+    SIMPLE_DELIMITER = /[\s　\.\*"',\?!;\(\)。、.,?!「」『』()]+/
+    NEAR = " NEAR/2 "
    def self.create(name)
      case name.to_sym
      when :simple
        Simple.new
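The new constants set up query construction: QUERY_DELIMITER splits raw user input on whitespace (the character class appears to include the ideographic space U+3000), SIMPLE_DELIMITER now also swallows FTS-significant characters such as the asterisk and quotes, and NEAR is SQLite's FTS NEAR/2 operator, used below to chain adjacent n-grams. A rough sketch of the behaviour, with outputs written by hand from the regexes above (assumes the gem and its nkf / bimyou_segmenter dependencies are installed):

  require 'fts_lite/tokenizer'   # the file shown in this diff

  # normalize downcases and width-normalizes the text via NKF
  FtsLite::Tokenizer.normalize("Hello, World!")                     #=> "hello, world!"

  # the widened delimiter also strips FTS operator characters like * and "
  "hello, \"world\"!*".split(FtsLite::Tokenizer::SIMPLE_DELIMITER)  #=> ["hello", "world"]

  # the query delimiter only splits on whitespace
  "hello world".split(FtsLite::Tokenizer::QUERY_DELIMITER)          #=> ["hello", "world"]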
@@ -24,18 +26,29 @@
    end
    def self.normalize(text)
      NKF::nkf('-wZX', text).downcase
    end
    class Simple
+      def query(text)
+        vector(text)
+      end
      def vector(text)
        split(text).join(" ")
      end
      def split(text)
-        Tokenizer.normalize(text).gsub(/[\.,\?!;:]/, ' ').split(SIMPLE_DELIMITER)
+        Tokenizer.normalize(text).split(SIMPLE_DELIMITER)
      end
    end
    class Bigram
+      def query(text)
+        text = Tokenizer.normalize(text)
+        text.split(QUERY_DELIMITER).map {|segment|
+          segment.split(SIMPLE_DELIMITER).map {|word|
+            0.upto(word.size - 2).map {|i| word[i, 2] }
+          }.join(NEAR)
+        }.flatten.join(" ")
+      end
      def vector(text)
        split(text).join(" ")
      end
      def split(text)
        text = Tokenizer.normalize(text)
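Every tokenizer now pairs vector (the tokens written into the FTS index) with query (an FTS match expression built from user input). For the character n-gram tokenizers, each whitespace-separated term becomes its n-grams chained with NEAR/2, while separate terms remain separate, implicitly ANDed, tokens. Roughly, worked out by hand from the code above rather than taken from the gem's tests:

  simple = FtsLite::Tokenizer::Simple.new
  simple.query("Hello, World!")   #=> "hello world"

  bigram = FtsLite::Tokenizer::Bigram.new
  bigram.vector("hello")          #=> "he el ll lo"
  bigram.query("hello world")
  #=> "he NEAR/2 el NEAR/2 ll NEAR/2 lo wo NEAR/2 or NEAR/2 rl NEAR/2 ld"

Note that Array#join applies its separator to nested arrays recursively, which is why the per-word bigram arrays inside a segment collapse into a single NEAR/2 chain.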
@@ -43,10 +56,18 @@
          0.upto(word.size - 2).map {|i| word[i, 2] }
        }.flatten
      end
    end
    class Trigram
+      def query(text)
+        text = Tokenizer.normalize(text)
+        text.split(QUERY_DELIMITER).map {|segment|
+          segment.split(SIMPLE_DELIMITER).map {|word|
+            0.upto(word.size - 3).map {|i| word[i, 3] }
+          }.join(NEAR)
+        }.flatten.join(" ")
+      end
      def vector(text)
        split(text).join(" ")
      end
      def split(text)
        text = Tokenizer.normalize(text)
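Trigram#query follows the same pattern with three-character windows; a query term shorter than three characters produces no trigrams and effectively drops out of the query. Again by hand from the code above:

  trigram = FtsLite::Tokenizer::Trigram.new
  trigram.query("hello world")
  #=> "hel NEAR/2 ell NEAR/2 llo wor NEAR/2 orl NEAR/2 rld"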
@@ -54,19 +75,41 @@
          0.upto(word.size - 3).map {|i| word[i, 3] }
        }.flatten
      end
    end
    class Wakachi
+      def query(text)
+        text = Tokenizer.normalize(text)
+        text.split(QUERY_DELIMITER).map {|segment|
+          BimyouSegmenter.segment(segment,
+                                  :white_space => false,
+                                  :symbol => false).join(NEAR)
+        }.join(" ")
+      end
      def vector(text)
        split(text).join(" ")
      end
      def split(text)
        BimyouSegmenter.segment(Tokenizer.normalize(text),
                                :white_space => false,
                                :symbol => false)
      end
    end
    class WakachiBigram
+      def query(text)
+        text = Tokenizer.normalize(text)
+        text.split(QUERY_DELIMITER).map {|segment|
+          BimyouSegmenter.segment(segment,
+                                  :white_space => false,
+                                  :symbol => false).map {|word|
+            if (word.size == 1)
+              word
+            else
+              0.upto(word.size - 2).map {|i| word[i, 2] }.join(NEAR)
+            end
+          }.flatten.join(NEAR)
+        }.join(" ")
+      end
      def vector(text)
        split(text).join(" ")
      end
      def split(text)
        BimyouSegmenter.segment(Tokenizer.normalize(text),
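The Wakachi variants delegate word boundaries to BimyouSegmenter, so the exact query depends on its segmentation. A sketch, assuming purely for illustration that 東京都の天気 is segmented into 東京都 / の / 天気 (same require as above):

  wakachi = FtsLite::Tokenizer::Wakachi.new
  wakachi.query("東京都の天気")
  #=> "東京都 NEAR/2 の NEAR/2 天気"

  # WakachiBigram further splits each multi-character segment into
  # NEAR/2-chained bigrams; single-character segments pass through unchanged.
  wakachi_bigram = FtsLite::Tokenizer::WakachiBigram.new
  wakachi_bigram.query("東京都の天気")
  #=> "東京 NEAR/2 京都 NEAR/2 の NEAR/2 天気"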