Sha256: 6647f2d4a21963b4b07189366d52c0077c4e1531e03a54ca95dae751808432fa
Contents?: true
Size: 1.34 KB
Versions: 10
Compression:
Stored size: 1.34 KB
Contents
#!/usr/bin/env bash
#
# Launch a Hadoop Streaming job with a key-field-based partitioner.
#
# Usage:
#   $0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] \
#      [partfields=2] [sortfields=2] [extra hadoop args...]
#
# Environment:
#   HADOOP_HOME  Hadoop installation root (default: /usr/lib/hadoop).
#
# Example cleanup of a previous run:
#   hadoop dfs -rmr out/parsed-followers

# Consume positional arguments one by one so that anything left in "$@"
# afterwards is passed straight through to hadoop as extra args.
input_file=${1}                  ; shift
output_file=${1}                 ; shift
map_script=${1-/bin/cat}         ; shift
reduce_script=${1-/usr/bin/uniq} ; shift
partfields=${1-2}                ; shift
sortfields=${1-2}                ; shift

if [[ -z "$output_file" ]] ; then
  # Usage goes to stderr; a missing required argument is an error
  # (the original exited with status 0 here).
  echo "$0 input_file output_file [mapper=/bin/cat] [reducer=/usr/bin/uniq] [partfields=2] [sortfields=2] [extra_args]" >&2
  exit 1
fi

HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}

# Build the command as an argv array so mapper/reducer paths containing
# spaces survive intact. The original flattened everything into a single
# string and re-split it with an unquoted $cmd, which turned the embedded
# escaped quotes into literal argument characters.
cmd=(
  "${HADOOP_HOME}/bin/hadoop"
  jar "${HADOOP_HOME}"/contrib/streaming/hadoop-*streaming*.jar
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
  -jobconf "num.key.fields.for.partition=${partfields}"
  -jobconf "stream.num.map.output.key.fields=${sortfields}"
  # A real TAB character; the original passed the literal string '/t'
  # (wrong slash AND quoted), which Hadoop would use verbatim.
  -jobconf stream.map.output.field.separator=$'\t'
  -jobconf "mapred.text.key.partitioner.options=-k1,${partfields}"
  -mapper  "$map_script"
  -reducer "$reduce_script"
  -input   "$input_file"
  -output  "$output_file"
  "$@"                           # remaining caller args pass through
)

# Log the exact command (shell-quoted so it can be copy-pasted), then run it.
printf '%q ' "${cmd[@]}" ; echo
"${cmd[@]}"

# For a map-side-only job specify
#   -jobconf mapred.reduce.tasks=0 \
# Maybe?
#   -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
#   -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
Version data entries
10 entries across 5 versions & 1 rubygems