Sha256: b5b7f875e0937fbf2c6dd35f29eff8b92e98123b2e84a7698944998f0c430ca2

Contents?: true

Size: 1.04 KB

Versions: 9

Compression:

Stored size: 1.04 KB

Contents

#!/usr/bin/env ruby
require 'wukong'

#
# Bigram counts
#
# head -n 100 /usr/share/dict/words | ./examples/corpus/words_to_bigrams.rb --map | sort | ./examples/corpus/words_to_bigrams.rb --reduce
#


#
# Kludge to work in Elastic map reduce:
#
# If your script is ./examples/corpus/words_to_bigrams.rb, symlink it as
# ./examples/corpus/words_to_bigrams__map.rb and
# ./examples/corpus/words_to_bigrams__reduce.rb. When the script is invoked
# under a name ending in __map.rb or __reduce.rb, the matching setting
# (Settings[:map] or Settings[:reduce]) is switched on.
#
# String#[] with a capture index returns the captured phase name, or nil when
# the program name does not match.
phase = $0[/__(map|reduce)\.rb$/, 1]
Settings[phase.to_sym] = true if phase


#
# Given one word per line, emits every pair of adjacent characters
# (character bigram) in that word, left to right.
# eg 'boooo-urns' yields
#   bo oo oo oo o- -u ur rn ns
#
# Words shorter than two characters (including the empty string) and nil
# yield nothing.
#
class WordNGrams < Wukong::Streamer::Base
  # @param word [String, nil] the word to split into bigrams
  # @yield [String] each two-character bigram, in order of appearance
  def process(word)
    # each_cons(2) slides a two-character window across the word and simply
    # produces nothing when fewer than two characters are available. Slicing
    # with word[1..-1] instead returns nil for an empty string, which made the
    # previous zip-based version raise NoMethodError on empty input.
    word.to_s.chars.each_cons(2) do |pair|
      yield pair.join
    end
  end
end

#
# Tallies how many times accumulate is called between start! and finalize,
# then emits the pair [key, count].
#
# NOTE(review): start!, accumulate and finalize are presumably the hooks that
# Wukong::Streamer::AccumulatingReducer invokes at the beginning of, during
# and at the end of each run of identical keys, and `key` is presumably
# supplied by that superclass -- confirm against the Wukong source.
#
class KeyCountStreamer < Wukong::Streamer::AccumulatingReducer
  # Resets the tally to zero; any arguments are ignored.
  def start!(*_record)
    @count = 0
  end

  # Adds one to the tally per call; any arguments are ignored.
  def accumulate(*_record)
    @count = @count + 1
  end

  # Yields a two-element array holding the current key and its tally.
  def finalize
    yield [key, @count]
  end
end

# Build a Wukong script from the two classes defined above and run it.
# NOTE(review): WordNGrams is presumably used as the mapper and
# KeyCountStreamer as the reducer -- confirm against Wukong::Script.
Wukong::Script.new(WordNGrams, KeyCountStreamer).run

Version data entries

9 entries across 9 versions & 1 rubygems

Version Path
wukong-2.0.0 examples/corpus/words_to_bigrams.rb
wukong-1.5.4 examples/corpus/words_to_bigrams.rb
wukong-1.5.3 examples/corpus/words_to_bigrams.rb
wukong-1.5.2 examples/corpus/words_to_bigrams.rb
wukong-1.5.1 examples/corpus/words_to_bigrams.rb
wukong-1.5.0 examples/corpus/words_to_bigrams.rb
wukong-1.4.12 examples/corpus/words_to_bigrams.rb
wukong-1.4.11 examples/corpus/words_to_bigrams.rb
wukong-1.4.10 examples/corpus/words_to_bigrams.rb