Sha256: e6a4ddcaeef0c4be90dfb5ab6b8e7fecff8ccb3cf67dd951ece10c43e48caa61

Contents?: true

Size: 1.15 KB

Versions: 8

Compression:

Stored size: 1.15 KB

Contents

#!/usr/bin/env ruby
$: << File.dirname(__FILE__)+'/../lib'
require 'rubygems'
require 'wukong'

#
# Probabilistically emit a random fraction of the input records (lines)
#
# Set the sampling fraction at the command line using the
#   --sampling_fraction=
# option: for example, to take a random 1/1000th of the lines in huge_files,
#  ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
#
class Mapper < Wukong::Streamer::LineStreamer
  include Wukong::Streamer::Filter

  #
  # Floating-point fraction of lines to emit: at sampling_fraction=1 all
  # records are emitted, at 0 none are.
  #
  # Read from the mandatory --sampling_fraction= command-line option and
  # memoized after the first successful parse.
  #
  # The value is parsed strictly with Kernel#Float rather than String#to_f:
  # to_f quietly turns malformed input ("", "abc", "0.5x") into 0.0 or a
  # truncated number, which would make the job silently drop every line or
  # sample at an unintended rate.
  #
  # Out-of-range numbers are not rejected: anything >= 1 emits every line and
  # anything <= 0 emits none (see #emit?).
  #
  # Raises RuntimeError if the option was not supplied, and ArgumentError if
  # it was supplied but is not a valid number.
  #
  def sampling_fraction
    @sampling_fraction ||= begin
      raw = options[:sampling_fraction]
      raise "Please supply a --sampling_fraction= argument, a decimal number between 0 and 1" unless raw
      Float(raw)
    rescue ArgumentError, TypeError
      # Float raises ArgumentError for unparseable strings and TypeError for
      # non-numeric objects; report both as a bad option value.
      raise ArgumentError,
            "--sampling_fraction must be a decimal number between 0 and 1, got #{raw.inspect}"
    end
  end

  #
  # Randomly decide whether to emit +line+, so that on average a
  # +sampling_fraction+ fraction of lines is kept.
  #
  # The line's content is ignored; only the random draw matters. Kernel#rand
  # with no argument returns a float in [0, 1).
  #
  def emit?(line)
    rand < sampling_fraction
  end
end

#
# Build the job around Mapper and run it.
#
# No second class is supplied (nil) and :reduce_tasks is 0 -- presumably this
# makes the job map-only; confirm against Wukong::Script.
#
sampling_job = Wukong::Script.new(Mapper, nil, :reduce_tasks => 0, :reuse_jvms => true)
sampling_job.run

Version data entries

8 entries across 8 versions & 1 rubygems

Version Path
wukong-1.5.4 examples/sample_records.rb
wukong-1.5.3 examples/sample_records.rb
wukong-1.5.2 examples/sample_records.rb
wukong-1.5.1 examples/sample_records.rb
wukong-1.5.0 examples/sample_records.rb
wukong-1.4.12 examples/sample_records.rb
wukong-1.4.11 examples/sample_records.rb
wukong-1.4.10 examples/sample_records.rb