Sha256: 2561c4f5d3c1090a79617fb0134e4ba55527d2f03f9ded5e9f8c4518d185597d
Contents?: true
Size: 1.37 KB
Versions: 1
Compression:
Stored size: 1.37 KB
Contents
#!/usr/bin/env ruby
$LOAD_PATH << File.dirname(__FILE__)
require 'rubygems'
require 'wukong/script'
require 'bucket_counter'

#
# Cooccurrence counts
#
#
# Input is a list of document-idx-sentences, each field is tab-separated
#   title   idx   word_a    word_b    word_c ...
#
# This emits each co-occurring pair exactly once; in the case of a three-word
# sentence the output would be
#
#   word_a  word_b
#   word_a  word_c
#   word_b  word_c
#
# Pairs are not emitted one by one: they are tallied in a BucketCounter and
# written out as [pair, count] records whenever the bucket reports itself
# full, and once more at the end of the stream.
#
class SentenceCoocurrence < Wukong::Streamer::RecordStreamer
  # Forwards all arguments to the parent streamer and sets up an empty
  # counter for word pairs.
  def initialize(*args)
    super(*args)
    @bucket = BucketCounter.new
  end

  # Handles one input record.
  #
  # title - first field of the record (not used here)
  # idx   - second field of the record (not used here)
  # words - all remaining fields, treated as the words of the sentence
  #
  # Every word is paired with each word that follows it in the record, so a
  # pair of positions is counted once. The bucket is flushed afterwards if it
  # reports that it is full.
  def process(title, idx, *words)
    # The block index is named `pos` so that it does not shadow the `idx`
    # method parameter.
    words.each_with_index do |word_a, pos|
      words[(pos + 1)..-1].each do |word_b|
        @bucket << [word_a, word_b]
      end
    end
    dump_bucket if @bucket.full?
  end

  # Emits every [pair_key, count] entry held in the bucket, logs the bucket's
  # stats to stderr, then clears the bucket.
  def dump_bucket
    @bucket.each do |pair_key, count|
      emit [pair_key, count]
    end
    $stderr.puts "bucket stats: #{@bucket.stats.inspect}"
    @bucket.clear
  end

  # Flushes whatever is still in the bucket.
  # NOTE(review): presumably invoked by the Wukong streamer once input is
  # exhausted -- confirm against the base class.
  def after_stream
    dump_bucket
  end
end

#
# Combine multiple bucket counts into a single one
#
class CombineBuckets < Wukong::Streamer::AccumulatingReducer
  # A total must be strictly greater than this to be yielded by #finalize.
  MIN_TOTAL = 20

  # Resets the running total.
  # NOTE(review): presumably called by the base class when a new key
  # begins -- confirm.
  def start!(*args)
    @total = 0
  end

  # Adds one partial count (converted with to_i) to the running total.
  # The `word` argument is not used.
  def accumulate(word, count)
    @total += count.to_i
  end

  # Yields [total, key] when the total exceeds MIN_TOTAL; otherwise yields
  # nothing.
  # NOTE(review): `key` is supplied by the AccumulatingReducer base class;
  # verify that it identifies the full word pair and not only its first field.
  def finalize
    yield [@total, key] if @total > MIN_TOTAL
  end
end

Wukong.run(
  SentenceCoocurrence,
  CombineBuckets,
  :io_sort_record_percent => 0.3,
  :io_sort_mb => 300
)
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
wukong-2.0.0 | examples/corpus/sentence_coocurrence.rb |