#!/usr/bin/env ruby
require 'rubygems'
require 'wukong/script'

#
# Word-count example built on Wukong streamers: a mapper that emits
# [word, 1] for every word of every input line, plus three interchangeable
# reducers that turn those records into one [count, word] record per word.
#
module WordCount
  class Mapper < Wukong::Streamer::LineStreamer
    #
    # Split a string into its constituent words.
    #
    # This is pretty simpleminded:
    # * downcase the string
    # * split at any run of characters that are not ASCII letters, digits or
    #   apostrophes (so '_' and '-' both split words)
    # * however, preserve an apostrophe that introduces a trailing
    #   's, 't, 'd, 'm, 're, 've or 'll; every other apostrophe splits
    # * drop words shorter than three characters
    #
    #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
    #   # => ["ability", "poor", "man's", "wealth", "johnwoodenquote"]
    #
    # Returns an Array of Strings; it is empty when +str+ is blank.
    #
    def tokenize(str)
      # NOTE(review): blank? is not core Ruby -- presumably supplied by the
      # wukong require above; confirm before reusing this method elsewhere.
      return [] if str.blank?

      cleaned = str.downcase.
        # Collapse every run of punctuation/whitespace into one space. This
        # also removes any literal '!', which frees it up as a placeholder.
        gsub(/[^a-zA-Z0-9\']+/, ' ').
        # Temporarily mark the apostrophes worth keeping with '!' ...
        gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').
        # ... turn every remaining apostrophe into a word break ...
        gsub(/\'/, ' ').
        # ... and restore the protected ones.
        gsub(/!/, "'")

      # Busticate at whitespace, then discard the short fragments (including
      # the empty string that split produces for leading whitespace).
      cleaned.split(/\s+/).reject { |word| word.length < 3 }
    end

    #
    # Emit a [word, 1] record for each word in the line.
    #
    def process(line)
      tokenize(line).each { |word| yield [word, 1] }
    end
  end

  #
  # You can stack up all the values in a list then sum them at once.
  #
  # This isn't good style, as it means the whole list is held in memory.
  #
  class Reducer1 < Wukong::Streamer::ListReducer
    # Yields [total, key], where total is the sum of the last field
    # (converted with to_i) of every entry in +values+.
    def finalize
      total = values.inject(0) { |sum, record| sum + record.last.to_i }
      yield [total, key]
    end
  end

  #
  # A bit kinder to your memory manager: keep a running tally instead of a
  # list.
  #
  # Note that this counts records rather than adding up their count field.
  # The two agree here because Mapper always emits a count of 1.
  #
  class Reducer2 < Wukong::Streamer::AccumulatingReducer
    # Reset the tally.
    # NOTE(review): presumably invoked by AccumulatingReducer each time a new
    # key begins -- confirm against the Wukong source.
    def start!(*args)
      @key_count = 0
    end

    # Tally one more record; the record's own fields are ignored.
    def accumulate(*args)
      @key_count += 1
    end

    # Yields [number of records tallied, key].
    def finalize
      yield [@key_count, key]
    end
  end

  #
  # ... easiest of all, though: this is common enough that it's already
  # included.
  #
  require 'wukong/streamer/count_keys'
  class Reducer3 < Wukong::Streamer::CountKeys
  end
end

# Execute the script
Wukong.run(WordCount::Mapper, WordCount::Reducer2)