Sha256: 2561c4f5d3c1090a79617fb0134e4ba55527d2f03f9ded5e9f8c4518d185597d

Contents?: true

Size: 1.37 KB

Versions: 1

Compression:

Stored size: 1.37 KB

Contents

#!/usr/bin/env ruby
$: << File.dirname(__FILE__)
require 'rubygems'
require 'wukong/script'
require 'bucket_counter'

#
# Co-occurrence counts
#

#
# Input is a list of document-idx-sentences, each field is tab-separated
#   title   idx   word_a    word_b    word_c ...
#
# This emits each co-occurring pair exactly once; in the case of a three-word
# sentence the output would be
#
#   word_a  word_b
#   word_a  word_c
#   word_b  word_c
#
class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
  # Builds the streamer and the in-memory bucket that accumulates pair counts.
  #
  # All arguments are forwarded unchanged to the superclass constructor.
  def initialize(*args)
    super(*args)
    @bucket = BucketCounter.new
  end

  # Adds every ordered-by-position word pair of one sentence to the bucket,
  # then flushes the bucket if it reports itself full.
  #
  # title:: first field of the record (unused here)
  # idx::   second field of the record (unused here)
  # words:: remaining fields; each word is paired with every word after it
  def process(title, idx, *words)
    # The block parameter is deliberately NOT called idx: that name would
    # shadow (or, on Ruby 1.8, overwrite) the method's own idx parameter.
    words.each_with_index do |word_a, position|
      # Only look at words to the right of word_a, so a given pair of
      # positions is visited once.
      words[(position + 1)..-1].each do |word_b|
        @bucket << [word_a, word_b]
      end
    end
    dump_bucket if @bucket.full?
  end

  # Emits one [pair_key, count] record per bucket entry, writes the bucket's
  # stats to stderr, and empties the bucket.
  def dump_bucket
    @bucket.each do |pair_key, count|
      emit([pair_key, count])
    end
    $stderr.puts "bucket stats: #{@bucket.stats.inspect}"
    @bucket.clear
  end

  # Flushes whatever is still held in the bucket.
  # NOTE(review): presumably Wukong calls this hook once the input is
  # exhausted -- confirm against Wukong::Streamer.
  def after_stream
    dump_bucket
  end
end

#
# Combine multiple bucket counts into a single one
#
class CombineBuckets < Wukong::Streamer::AccumulatingReducer
  # A key is only output when its summed count is strictly greater than this.
  COUNT_THRESHOLD = 20

  # Resets the running total.
  # NOTE(review): presumably invoked by AccumulatingReducer whenever a new
  # key begins -- confirm against the superclass.
  def start!(*args)
    @total = 0
  end

  # Adds one partial count to the running total. The count is converted with
  # to_i, so text that does not start with a number contributes 0.
  #
  # NOTE(review): the mapper in this file emits [pair_key, count], where
  # pair_key is itself a two-word array. Confirm that the record reaches this
  # method as exactly two fields; if the pair is flattened into two columns,
  # this two-argument signature (and the reducer key) need revisiting.
  def accumulate(word, count)
    @total += count.to_i
  end

  # Yields [total, key] when the total exceeds COUNT_THRESHOLD; otherwise
  # yields nothing. `key` is not defined in this class (presumably supplied
  # by the superclass).
  def finalize
    yield [@total, key] if @total > COUNT_THRESHOLD
  end
end

# Hand both classes to Wukong (presumably mapper first, reducer second --
# confirm against Wukong.run) together with the two io_sort_* options.
Wukong.run(SentenceCoocurrence, CombineBuckets,
           :io_sort_record_percent => 0.3,
           :io_sort_mb             => 300)

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wukong-2.0.0 examples/corpus/sentence_coocurrence.rb