#!/usr/bin/env ruby
# Submits every Mandy job defined in a script to Hadoop streaming, one after
# another.
#
# Usage:
#   mandy-hadoop my_script.rb input_file_or_folder_on_hdfs output_folder_on_hdfs cluster-config.xml [payload]
#
# The script is required locally, then each entry of Mandy::Job.jobs is
# submitted in order. Every job writes to its own numbered sub-folder of the
# output folder, and that sub-folder becomes the input of the next job.
require "rubygems"
require "mandy"

# Returns +path+ unchanged when it is already absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  # \A anchors at the start of the string; ^ would also match after a newline.
  path =~ %r{\A/} ? path : File.join(Dir.pwd, path)
end

# The first four positional arguments are all required; running with fewer
# would pass nil into File.join and into the hadoop command line.
if ARGV.size < 4
  puts "USAGE: mandy-hadoop my_script.rb input_file_or_folder_on_hdfs output_folder_on_hdfs cluster-config.xml [payload]"
  # No arguments at all is treated as a plain help request (status 0, as
  # before); a partial argument list is an error.
  exit(ARGV.empty? ? 0 : 1)
end

file          = ARGV[0]
filename      = File.basename(file)
input         = ARGV[1]
output_folder = ARGV[2]
config        = ARGV[3]
# When a payload is given it is packed via Mandy::Packer; otherwise the script
# itself is the file shipped with the job.
payload       = ARGV[4] ? Mandy::Packer.pack(ARGV[4]) : ARGV[0]

# Loading the script is what populates Mandy::Job.jobs (presumably via the
# Mandy DSL - confirm against the gem).
require absolute_path(file)

Mandy::Job.jobs.each_with_index do |job, i|
  # Per-job settings are passed to Hadoop as -D key='value' pairs.
  jobconf = job.settings.map { |key, value| %(-D #{key}='#{value}') }.join(' ')
  # Output sub-folder: 1-based position plus the job name with every
  # non-word character replaced by '-'.
  output = File.join(output_folder, "#{i + 1}-#{job.name.downcase.gsub(/\W/, '-')}")

  command = [
    "$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar",
    jobconf,
    %(-conf '#{config}'),
    %(-input "#{input}"),
    %(-mapper "mandy-map #{filename} '#{job.name}' #{payload}"),
    %(-reducer "mandy-reduce #{filename} '#{job.name}' #{payload}"),
    %(-file "#{payload}"),
    %(-output "#{output}")
  ].join(' ')

  # puts "#{command}"
  captured = `#{command}`

  # Later jobs read the previous job's output, so stop at the first failure
  # instead of submitting jobs against a folder that may not exist.
  unless $?.success?
    $stderr.puts captured unless captured.empty?
    abort "mandy-hadoop: job '#{job.name}' failed (exit status #{$?.exitstatus}); remaining jobs were not run"
  end

  # Chain the jobs: this job's output is the next job's input.
  input = output
end