Sha256: c6b815034d32de715bb86646775e3b594b2e9dab375ccf1ef8d7a6606fe8ea8c

Contents?: true

Size: 928 Bytes

Versions: 4

Compression:

Stored size: 928 Bytes

Contents

#!/usr/bin/env ruby
require 'rubygems'
require 'wukong/script'

# Declares the required --sampling_fraction command-line setting (a Float).
Settings.define(
  :sampling_fraction,
  :type        => Float,
  :required    => true,
  :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
)

#
# Probabilistically emit some fraction of records (lines)
#
# Set the sampling fraction at the command line using the
#   --sampling_fraction=
# option: for example, to take a random 1/1000th of the lines in huge_files,
#  ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
#
class Mapper < Wukong::Streamer::LineStreamer
  include Wukong::Streamer::Filter

  #
  # Sampling predicate: draws a fresh random float in [0, 1) on every call
  # and answers true when that draw is below +Settings.sampling_fraction+.
  # The +line+ argument itself is never inspected.
  #
  def emit?(line)
    draw = rand
    draw < Settings.sampling_fraction
  end
end

#
# Launch the script: Mapper is passed first and nil second (presumably
# meaning "no reducer" -- confirm against Wukong.run), with zero reduce
# tasks and JVM reuse requested.
#
Wukong.run(Mapper, nil, :reduce_tasks => 0, :reuse_jvms => true)

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
wukong-3.0.0.pre old/examples/sample_records.rb
wukong-2.0.2 examples/sample_records.rb
wukong-2.0.1 examples/sample_records.rb
wukong-2.0.0 examples/sample_records.rb