#!/usr/bin/env bash
#
# Launch a Hadoop Streaming job that partitions and sorts map output on the
# leading tab-separated key fields (KeyFieldBasedPartitioner).
#
# Remove a previous run's output first if necessary, e.g.:
#   hadoop dfs -rmr out/parsed-followers

input_file=${1}                  ; shift
output_file=${1}                 ; shift
map_script=${1-/bin/cat}         ; shift
reduce_script=${1-/usr/bin/uniq} ; shift
partfields=${1-2}                ; shift
sortfields=${1-2}                ; shift

if [ "$output_file" == "" ] ; then
  echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]"
  exit 1
fi

HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

# Build the streaming command: partition on the first $partfields fields,
# treat the first $sortfields fields as the map output key.
cmd="${HADOOP_HOME}/bin/hadoop \
  jar ${HADOOP_HOME}/contrib/streaming/hadoop-*-streaming.jar
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
  -jobconf     num.key.fields.for.partition=\"$partfields\"
  -jobconf     stream.num.map.output.key.fields=\"$sortfields\"
  -jobconf     stream.map.output.field.separator=\"'\t'\"
  -jobconf     mapred.text.key.partitioner.options=\"-k1,$partfields\"
  -mapper      \"$map_script\"
  -reducer     \"$reduce_script\"
  -input       \"$input_file\"
  -output      \"$output_file\"
  $@
"
echo "$cmd"
$cmd

# Maybe?
#
#   -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
#   -mapper      org.apache.hadoop.mapred.lib.IdentityMapper      \
#
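
# Example invocation (a sketch; the HDFS paths and the mapper script name are
# hypothetical, and ./hdp-stream stands in for whatever this script is saved as):
#
#   ./hdp-stream input/followers out/parsed-followers ./parse_follower.rb /usr/bin/uniq 2 2
#
# This streams each input record through parse_follower.rb, partitions reducer
# input on the first 2 tab-separated fields, sorts on a 2-field key within each
# partition, and collapses duplicate records with uniq.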