#!/bin/bash
# Rendered from an ERB template: the <%= ... %> placeholders are substituted
# before the script runs. Runs a query on a remote host, partitions the output
# with the transpart script, and moves the result into a per-host HDFS
# staging directory.

HOST=<%= host %>
DEBUG=<%= debug %>
TFILE=<%= tfile %>
TOPTS="<%= topts %>"
TOPTS_ODIR=<%= output_dir %>
TMPDIR=<%= tmpdir %>
QUERY="<%= query %>"
HADOOP_HOME=<%= hadoop_home %>
HIVE_HOME=<%= hive_home %>
OVERWRITE=<%= overwrite %>
HIVE_TABLE_NAME=<%= hive_table_name %>
HDFS_TMPDIR=<%= hdfs_tmpdir %>
HDFS_PATH=hdfs://$HDFS_TMPDIR/$HIVE_TABLE_NAME/$HOST

# Print a timestamped message when DEBUG is enabled.
function debug {
  if [ "$DEBUG" == '1' ]; then
    echo "$HOST $(date +'%Y-%m-%d %H:%M:%S'): $1"
  fi
}

# Print a timestamped error message and abort.
function error {
  echo "$HOST $(date +'%Y-%m-%d %H:%M:%S'): ERROR - $1"
  exit 1
}

# Abort with the given message if the previous command failed.
function check_result {
  if [ $? -ne 0 ]; then
    error "$1"
  fi
}

# Run an HDFS command, aborting on failure.
function hdfs_run {
  debug "$2"
  $HADOOP_HOME/bin/hadoop dfs $1
  check_result "$2"
}

# Run a command on the remote host over ssh, aborting on failure.
function remote_exec {
  debug "$2"
  ssh $HOST "$1"
  check_result "$2"
}

# Create the per-host staging directory in HDFS if it does not exist yet.
$HADOOP_HOME/bin/hadoop dfs -test -e $HDFS_PATH || hdfs_run "-mkdir $HDFS_PATH" "Creating $HDFS_PATH"

if [ "$OVERWRITE" == '0' ]; then
  # Leftovers from a previous run indicate a past failure; refuse to continue.
  ssh $HOST "test ! -x $TOPTS_ODIR"
  check_result "The directory $HOST:$TOPTS_ODIR exists, indicating a past failure. Try a full reimport with the -o option."
  FILE_COUNT=$($HADOOP_HOME/bin/hadoop dfs -ls $HDFS_PATH | wc -l)
  if [ "$FILE_COUNT" != "0" ]; then
    error "$HDFS_PATH is not empty, indicating a past failure. Try a full reimport with the -o option."
  fi
else
  # Overwrite mode: clear any previous output on the remote host and in HDFS.
  remote_exec "rm -rf $TOPTS_ODIR" "removing directory $TOPTS_ODIR"
  hdfs_run "-rmr $HDFS_PATH" "Clearing directory $HDFS_PATH"
fi

remote_exec "mkdir -p $TMPDIR" "making directory $HOST:$TMPDIR if it does not exist"

# Ship the transpart script to the remote host.
debug "copying $TFILE to $HOST:$TMPDIR/transpart"
scp -q $TFILE $HOST:$TMPDIR/transpart
check_result "could not copy $TFILE to $HOST:$TMPDIR/transpart"

# Run the query remotely, pipe its output through transpart, move the
# resulting files into HDFS, and clean up the remote output directory.
remote_exec "$QUERY | ruby $TMPDIR/transpart $TOPTS" "executing query: $QUERY"
remote_exec "$HADOOP_HOME/bin/hadoop dfs -moveFromLocal $TOPTS_ODIR $HDFS_PATH" "uploading files in $TOPTS_ODIR to HDFS"
remote_exec "rm -rf $TOPTS_ODIR" "removing directory $TOPTS_ODIR"
debug "finished import"
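
# Rendering sketch (an assumption, not part of the original pipeline): the
# <%= ... %> placeholders suggest this file is rendered with Ruby's ERB before
# execution. All filenames and values below are hypothetical, e.g.:
#
#   require 'erb'
#   script = ERB.new(File.read('import.sh.erb')).result_with_hash(
#     host: 'db1', debug: '1', tfile: 'transpart', topts: '-d ,',
#     output_dir: '/tmp/transpart_out', tmpdir: '/tmp/transpart',
#     query: 'mysql -e "SELECT * FROM events"',
#     hadoop_home: '/usr/lib/hadoop', hive_home: '/usr/lib/hive',
#     overwrite: '0', hive_table_name: 'events',
#     hdfs_tmpdir: 'namenode:9000/tmp')
#   File.write('import.sh', script)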