Sha256: 76fd7e1babd9a38f4eea1245bae90042f5ad256836df8746d2abf59a7946f1b8
Contents?: true
Size: 909 Bytes
Versions: 5
Compression:
Stored size: 909 Bytes
Contents
# Mapper: breaks each incoming string into tokens and emits the ones whose
# length falls inside the configured bounds.
Wukong.processor(:mapper) do
  # Inclusive bounds on the length of an emitted token.
  field :min_length, Integer, default: 1
  field :max_length, Integer, default: 256
  # Pattern the input string is split on.
  field :split_on,   Regexp,  default: /\s+/
  # Pattern whose matches are deleted from every token.
  field :remove,     Regexp,  default: /[^a-zA-Z0-9\']+/
  # When true, tokens are lower-cased before being emitted.
  field :fold_case,  :boolean, default: false

  # Tokenizes +string+ and yields each token that passes the length check.
  def process(string)
    tokenize(string).each { |token| yield token if acceptable?(token) }
  end

  private

  # Splits +string+ on +split_on+, deletes everything matching +remove+ from
  # each piece, and lower-cases the piece when +fold_case+ is set.
  # Returns the resulting array of tokens (pieces may end up empty).
  def tokenize(string)
    string.split(split_on).map do |piece|
      cleaned = piece.gsub(remove, '')
      fold_case ? cleaned.downcase : cleaned
    end
  end

  # True when the token's length lies within min_length..max_length.
  def acceptable?(token)
    (min_length..max_length).cover?(token.length)
  end
end

# Reducer: counts the records seen in each accumulation group and emits one
# tab-separated "key<TAB>count" line per group.
# NOTE(review): +key+ is not defined in this block; presumably it is supplied
# by Wukong::Processor::Accumulator -- confirm against the Wukong docs.
Wukong.processor(:reducer, Wukong::Processor::Accumulator) do
  attr_accessor :count

  # Resets the counter; the record itself is not inspected.
  def start(record)
    self.count = 0
  end

  # Bumps the counter by one; the record itself is not inspected.
  def accumulate(record)
    self.count += 1
  end

  # Yields the key and the final count joined by a tab.
  def finalize
    line = [key, count].join("\t")
    yield line
  end
end
Version data entries
5 entries across 5 versions & 1 rubygems