Sha256: d720e8cade687a8a9bfd6cdbe01702d7e90cb17a46250a51d40b80fc25eb429b

Contents?: true

Size: 1.03 KB

Versions: 3

Compression:

Stored size: 1.03 KB

Contents

#!/usr/bin/env ruby
$: << File.dirname(__FILE__)
require 'rubygems'
require 'wukong/script'
require 'bucket_counter'

#
# Co-occurrence counts of adjacent words (sentence bigrams)
#

#
# Input is a list of document-idx-sentences, each field is tab-separated
#   title   idx   word_a    word_b    word_c ...
#
# This emits each pair of adjacent words (a bigram) once per position; in the
# case of a three-word sentence the output would be
#
#   word_a  word_b
#   word_b  word_c
#
class SentenceBigrams < Wukong::Streamer::RecordStreamer
  # Emits every pair of adjacent words found in one sentence record.
  #
  # title - first field of the record; not used here.
  # idx   - second field of the record; not used here.
  # words - all remaining fields of the record, one word per field.
  #
  # Yields a two-element array [word_a, word_b] for each consecutive pair of
  # words, in sentence order. A record with fewer than two words yields
  # nothing.
  def process(title, idx, *words)
    # each_cons(2) is a no-op for zero or one word. The earlier
    # words[0..-2].zip(words[1..-1]) form raised TypeError on a record with no
    # words, because slicing an empty array from index 1 returns nil.
    words.each_cons(2) do |word_a, word_b|
      yield [word_a, word_b]
    end
  end
end

#
# Combine all records that share a word pair into a single count for that pair
#
class CombineBuckets < Wukong::Streamer::AccumulatingReducer
  # The grouping key is the first two fields of a record.
  def get_key(*fields)
    fields.first(2)
  end

  # Resets the running count to zero.
  # NOTE(review): presumably invoked by the base class when a new key group
  # begins -- confirm against AccumulatingReducer.
  def start!(*_args)
    @total = 0
  end

  # Counts one more record; the record's own fields are ignored.
  def accumulate(*_fields)
    @total += 1
  end

  # Yields one flat record: the count followed by the elements of +key+.
  # NOTE(review): +key+ is not defined in this class; presumably the base
  # class exposes the value returned by get_key -- confirm.
  def finalize
    counted = [@total, key]
    yield counted.flatten
  end
end

# Run the job with SentenceBigrams and CombineBuckets (passed in that order)
# plus two sort-related options.
# NOTE(review): the option names look like Hadoop sort-buffer settings and the
# two classes are presumably the mapper and reducer -- confirm against the
# Wukong documentation.
Wukong.run(SentenceBigrams, CombineBuckets,
           :io_sort_record_percent => 0.3,
           :io_sort_mb => 300)

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
wukong-3.0.0.pre old/examples/corpus/sentence_bigrams.rb
wukong-2.0.2 examples/corpus/sentence_bigrams.rb
wukong-2.0.1 examples/corpus/sentence_bigrams.rb