Sha256: b82466e56d81817103e3698b23a6198366ce615ed18f601e2813c5f7b5c78716

Contents?: true

Size: 1.97 KB

Versions: 1

Compression:

Stored size: 1.97 KB

Contents

require 'gorillib'
require 'gorillib/data_munging'
require 'configliere'

# Name of the S3 bucket used to build S3_DATA_ROOT.
S3_BUCKET = 'bigdata.chimpy.us'
# s3n:// URI of the "data" directory inside S3_BUCKET.
S3_DATA_ROOT = "s3n://#{S3_BUCKET}/data"
# Data root path used as the default for the *_data_root settings defined in this file.
HDFS_DATA_ROOT = '/data'

# Settings declarations for the helpers in this file.
# The three *_data_root settings all default to HDFS_DATA_ROOT.
Settings.define :orig_data_root, default: HDFS_DATA_ROOT, description: "directory root for input data"
Settings.define :scratch_data_root, default: HDFS_DATA_ROOT, description: "directory root for scratch data"
Settings.define :results_data_root, default: HDFS_DATA_ROOT, description: "directory root for results data"
# Boolean flag, off by default; it drives the fallback value of :universe below.
Settings.define :mini, description: 'Run in mini mode - operate inside the mini version of the specified universe',type: :boolean, default: false
# The finally hook fills in universe only when it is unset: "mini" if c.mini? is truthy, "full" otherwise.
# NOTE(review): presumably the hook runs when configliere resolves the settings -- confirm against configliere docs.
Settings.define :universe, description: 'Universe to draw data from', finally: ->(c){ c.universe ||= (c.mini? ? "mini" : "full") }
# Path of the pig executable that pig() puts at the front of its command line.
Settings.define :pig_path, default: '/usr/local/bin/pig'
# Boolean flag, off by default; read by Settings.wu_run_cmd and dir_exists?.
Settings.define :local, type: :boolean, default: false

# Predicate form of the :mini setting: always returns a strict true or false,
# whatever value Settings.mini holds.
def Settings.mini?
  Settings.mini ? true : false
end
# Returns the run flag handed to wukong scripts: '--run=local' when the
# :local setting is truthy, plain '--run' otherwise.
def Settings.wu_run_cmd
  return '--run=local' if local

  '--run'
end

# Reports whether +dir+ already exists.
#
# When Settings.local is truthy the local filesystem is checked; otherwise
# `hadoop fs -test -e` is run against the path and its exit status decides.
#
# dir - a path (String or anything whose #to_s yields the path).
#
# Returns true if the path exists, false otherwise.
# Raises a SystemCallError (e.g. Errno::ENOENT) if the hadoop executable
# cannot be started.
def dir_exists?(dir)
  # File.exists? was deprecated for years and removed in Ruby 3.2; File.exist?
  # is the supported spelling and behaves the same.
  return File.exist?(dir) if Settings.local

  # The argument-array form runs hadoop directly, with no shell in between, so
  # paths containing spaces or shell metacharacters are passed through intact.
  # The block form waits for the process to finish and sets $?.
  IO.popen(['hadoop', 'fs', '-test', '-e', dir.to_s], &:read)
  $?.exitstatus == 0
end

# Runs a wukong script over +input+, writing to +output+.
#
# Both paths are passed through Pathname.of. If the output path already
# exists (per dir_exists?) a notice is printed and nothing is run; the method
# then returns nil.
#
# script  - the script handed to `ruby`.
# input   - input path.
# output  - output path.
# options - extra options; each pair becomes a "--key=value" argument.
#
# Returns whatever `ruby` returns when the script is run.
def wukong(script, input, output, options={})
  source = Pathname.of(input)
  target = Pathname.of(output)

  if dir_exists?(target)
    puts "#{target} exists. Assuming that this job has already run..."
    return
  end

  flags = options.each_pair.map { |key, value| "--#{key}=#{value}" }
  # Argument order: run mode, --rm, the option flags, then input and output.
  ruby(script, Settings.wu_run_cmd, '--rm', *flags, source, target)
end

# Convenience wrapper around wukong that passes +split_tag+ as the
# split_on_xml_tag option.
def wukong_xml(script, input, output, split_tag)
  xml_options = { split_on_xml_tag: split_tag }
  wukong(script, input, output, xml_options)
end

# Builds and runs a pig command line for +script_name+.
#
# Each option becomes " -param KEY=value" with the key upcased. Symbol values
# are first passed through Pathname.of. If an option whose name contains
# '_out' points at a path that already exists (per dir_exists?), a notice is
# printed and the method returns nil without running pig.
#
# Returns whatever `sh` returns when the command is run.
def pig(script_name, options={})
  pieces = [Settings.pig_path]

  options.each_pair do |name, value|
    value = Pathname.of(value) if value.is_a?(Symbol)

    if name.to_s.include?('_out') && dir_exists?(value)
      puts "#{value} already exists. Assuming that this job has already run..."
      return
    end

    pieces << " -param #{name.upcase}=#{value}"
  end

  pieces << " #{script_name}"
  # Fold with + from left to right: executable, params in order, then the script.
  sh pieces.inject(:+)
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
wukong-3.0.0.pre2 examples/munging/rake_helper.rb