Sha256: c3e5d3efacdcc3492bde895b4096ec435cc27f73897bc17ce9f6f9f7d52592a8

Contents?: true

Size: 1.17 KB

Versions: 4

Compression:

Stored size: 1.17 KB

Contents

#!/usr/bin/env ruby
$: << File.dirname(__FILE__)+'/../lib'
require 'wukong'

#
# Probabilistically emit a random fraction of the input records (lines)
#
# Set the sampling fraction at the command line using the
#   --sampling_fraction=
# option: for example, to take a random 1/1000th of the lines in huge_files,
#  ./examples/sample_records.rb --sampling_fraction=0.001 --go huge_files sampled_files
#
class Mapper < Wukong::Streamer::LineStreamer
  # NOTE(review): presumably Filter is what consults #emit? for each line --
  # confirm against Wukong::Streamer::Filter.
  include Wukong::Streamer::Filter

  #
  # Floating-point number between 0 and 1 giving the fraction of lines to emit:
  # at sampling_fraction=1 all records are emitted, at 0 none are.
  #
  # Takes its value from the mandatory :sampling_fraction option and memoizes
  # the parsed result.
  #
  # The value is parsed strictly with Kernel#Float rather than String#to_f, so
  # a malformed value (e.g. "abc") is rejected instead of being silently read
  # as 0.0 -- which would make the job emit nothing without any warning.
  #
  # Raises RuntimeError when the option is missing or is not a valid number.
  #
  def sampling_fraction
    return @sampling_fraction if @sampling_fraction

    raw = options[:sampling_fraction]
    @sampling_fraction =
      begin
        # Float(nil) and Float(true) raise TypeError; Float("abc") raises
        # ArgumentError. Both are reported with the same usage message.
        Float(raw)
      rescue ArgumentError, TypeError
        raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1"
      end
  end

  #
  # Randomly decide whether to emit a line: true when a fresh rand draw is
  # below +sampling_fraction+. The +line+ content itself is not inspected.
  #
  def emit?(line)
    rand < sampling_fraction
  end
end

class Script < Wukong::Script
  #
  # The inherited default options, with :reduce_tasks overridden to 0.
  # NOTE(review): presumably this makes the job map-only (no reduce phase) --
  # confirm against Wukong::Script.
  #
  def default_options
    overrides = { :reduce_tasks => 0 }
    super.merge(overrides)
  end
end

#
# Build the script with Mapper as its first argument and nil as its second,
# then run it.
#
Script.new(Mapper, nil).run

Version data entries

4 entries across 4 versions & 2 rubygems

Version Path
mrflip-wukong-0.1.0 examples/sample_records.rb
wukong-1.4.0 examples/sample_records.rb
wukong-0.1.4 examples/sample_records.rb
wukong-0.1.1 examples/sample_records.rb