Sha256: 5c918d851395c0a4e44cb3cfc1612f2ae5338f76a307a45d18cea86292848c2e

Contents?: true

Size: 665 Bytes

Versions: 5

Compression:

Stored size: 665 Bytes

Contents

# Map-only example: a processor declared through Wukong.processor under the
# name :mapper. Each incoming string is broken into tokens, every token is
# cleaned up, and only tokens whose length lies within the configured bounds
# are yielded.
Wukong.processor(:mapper) do

  # Inclusive lower/upper bounds on the length of a token that may be emitted.
  field :min_length, Integer,  :default => 1
  field :max_length, Integer,  :default => 256
  # Pattern handed to String#split to cut the input into raw tokens.
  field :split_on,   Regexp,   :default => /\s+/
  # Pattern whose matches are deleted from every raw token.
  field :remove,     Regexp,   :default => /[^a-zA-Z0-9\']+/
  # When truthy, tokens are downcased after cleaning.
  field :fold_case,  :boolean, :default => false

  # Yields, in input order, each cleaned token of +string+ that passes the
  # length check.
  def process(string)
    tokenize(string).each do |word|
      next unless acceptable?(word)
      yield word
    end
  end

  private

  # Splits +string+ on +split_on+, strips everything matching +remove+ from
  # each piece and, if +fold_case+ is set, lowercases it. Returns an Array of
  # the resulting strings (which may include empty strings).
  def tokenize(string)
    pieces = string.split(split_on)
    pieces.map do |piece|
      cleaned = piece.gsub(remove, '')
      cleaned = cleaned.downcase if fold_case
      cleaned
    end
  end

  # True when the token's length falls inside min_length..max_length
  # (both ends inclusive).
  def acceptable?(token)
    allowed_lengths = min_length..max_length
    allowed_lengths.cover?(token.length)
  end

end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
wukong-hadoop-0.2.0 examples/map_only.rb
wukong-hadoop-0.1.1 examples/map_only.rb
wukong-hadoop-0.1.0 examples/map_only.rb
wukong-hadoop-0.0.2 examples/map_only.rb
wukong-hadoop-0.0.1 examples/map_only.rb