Sha256: 76fd7e1babd9a38f4eea1245bae90042f5ad256836df8746d2abf59a7946f1b8

Contents?: true

Size: 909 Bytes

Versions: 5

Compression:

Stored size: 909 Bytes

Contents

# Word-count mapper: breaks each input string into tokens and emits every
# token whose length lies within the configured bounds.
Wukong.processor(:mapper) do

  # Inclusive bounds on the length of an emitted token.
  field :min_length, Integer,  :default => 1
  field :max_length, Integer,  :default => 256
  # Pattern the input string is split on to obtain raw tokens.
  field :split_on,   Regexp,   :default => /\s+/
  # Pattern whose matches are deleted from every raw token.
  field :remove,     Regexp,   :default => /[^a-zA-Z0-9\']+/
  # When truthy, tokens are downcased before being emitted.
  field :fold_case,  :boolean, :default => false

  # Yields each acceptable token of +string+, in order of appearance.
  # NOTE(review): presumably invoked by Wukong once per input record -- confirm.
  def process(string)
    tokenize(string).each do |word|
      next unless acceptable?(word)

      yield word
    end
  end

  private

  # Returns an Array with one cleaned token per piece of +string+ produced by
  # splitting on +split_on+. Pieces that become empty after cleaning are kept
  # here; filtering is left to +acceptable?+.
  def tokenize(string)
    string.split(split_on).map do |piece|
      cleaned = piece.gsub(remove, '')
      cleaned = cleaned.downcase if fold_case
      cleaned
    end
  end

  # True when the length of +token+ falls within min_length..max_length
  # (both ends inclusive).
  def acceptable?(token)
    allowed_lengths = (min_length..max_length)
    allowed_lengths.include?(token.length)
  end
end

# Word-count reducer: tallies how many records were accumulated and emits the
# tally next to the current key.
# NOTE(review): +key+ and the start/accumulate/finalize call order presumably
# come from Wukong::Processor::Accumulator -- confirm against that class.
Wukong.processor(:reducer, Wukong::Processor::Accumulator) do

  # Running number of records accumulated since the last +start+.
  attr_accessor :count

  # Resets the tally to zero; +record+ itself is not inspected.
  def start(record)
    self.count = 0
  end

  # Adds one to the tally; +record+ itself is not inspected.
  def accumulate(record)
    self.count = count + 1
  end

  # Yields a single tab-separated line made of the key and the tally.
  def finalize
    columns = [key, count]
    yield columns.join("\t")
  end
end

Version data entries

5 entries across 5 versions & 1 rubygem

Version Path
wukong-hadoop-0.2.0 examples/word_count.rb
wukong-hadoop-0.1.1 examples/word_count.rb
wukong-hadoop-0.1.0 examples/word_count.rb
wukong-hadoop-0.0.2 examples/word_count.rb
wukong-hadoop-0.0.1 examples/word_count.rb