# -*- coding: utf-8 -*-
module Wukong
  module HadoopCommand

    # ===========================================================================
    #
    # Hadoop Options
    #
    Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :description => "Path to hadoop installation; ENV['HADOOP_HOME'] by default. HADOOP_HOME/bin/hadoop is used to run hadoop.", :env_var => 'HADOOP_HOME', :wukong => true
    Settings.define :hadoop_runner,                            :description => "Path to hadoop script. Usually set --hadoop_home instead of this.", :wukong => true

    #
    # Translate simplified args to their hairy hadoop equivalents
    #
    Settings.define :max_node_map_tasks,     :jobconf => true, :description => 'mapred.tasktracker.map.tasks.maximum',                   :wukong => true
    Settings.define :max_node_reduce_tasks,  :jobconf => true, :description => 'mapred.tasktracker.reduce.tasks.maximum',                :wukong => true
    Settings.define :map_tasks,              :jobconf => true, :description => 'mapred.map.tasks',                                       :wukong => true
    Settings.define :reduce_tasks,           :jobconf => true, :description => 'mapred.reduce.tasks',                                    :wukong => true
    Settings.define :sort_fields,            :jobconf => true, :description => 'stream.num.map.output.key.fields',                       :wukong => true
    Settings.define :key_field_separator,    :jobconf => true, :description => 'map.output.key.field.separator',                         :wukong => true
    Settings.define :partition_fields,       :jobconf => true, :description => 'num.key.fields.for.partition',                           :wukong => true
    Settings.define :output_field_separator, :jobconf => true, :description => 'stream.map.output.field.separator',                      :wukong => true
    Settings.define :map_speculative,        :jobconf => true, :description => 'mapred.map.tasks.speculative.execution',                 :wukong => true
    Settings.define :timeout,                :jobconf => true, :description => 'mapred.task.timeout',                                    :wukong => true
    Settings.define :reuse_jvms,             :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks',                         :wukong => true
    Settings.define :respect_exit_status,    :jobconf => true, :description => 'stream.non.zero.exit.is.failure',                        :wukong => true
    Settings.define :io_sort_mb,             :jobconf => true, :description => 'io.sort.mb',                                             :wukong => true
    Settings.define :io_sort_record_percent, :jobconf => true, :description => 'io.sort.record.percent',                                 :wukong => true
    Settings.define :job_name,               :jobconf => true, :description => 'mapred.job.name',                                        :wukong => true
    Settings.define :max_reduces_per_node,   :jobconf => true, :description => 'mapred.max.reduces.per.node',                            :wukong => true
    Settings.define :max_reduces_per_cluster,:jobconf => true, :description => 'mapred.max.reduces.per.cluster',                         :wukong => true
    Settings.define :max_maps_per_node,      :jobconf => true, :description => 'mapred.max.maps.per.node',                               :wukong => true
    Settings.define :max_maps_per_cluster,   :jobconf => true, :description => 'mapred.max.maps.per.cluster',                            :wukong => true
    Settings.define :max_record_length,      :jobconf => true, :description => 'mapred.linerecordreader.maxlength',                      :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
    Settings.define :min_split_size,         :jobconf => true, :description => 'mapred.min.split.size',                                  :wukong => true
    Settings.define :noempty,                                  :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
    Settings.define :split_on_xml_tag,                         :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true

    # emit a -jobconf hadoop option if the simplified command line arg is present
    # if not, the resulting nil will be elided later
    def jobconf option
      if options[option]
        # "-jobconf %s=%s" % [options.description_for(option), options[option]]
        "-D %s=%s" % [options.description_for(option), options[option]]
      end
    end

    #
    # Assemble the hadoop command to execute
    # and launch the hadoop runner to execute the script across all tasktrackers
    #
    # FIXME: Should add some simple logic to ensure that commands are in the
    # right order or hadoop will complain. ie. -D options MUST come before
    # others
    #
    def execute_hadoop_workflow
      # Input paths join by ','
      input_paths = @input_paths.join(',')
      #
      # Use Settings[:hadoop_home] to set the path your config install.
      hadoop_commandline = [
        hadoop_runner,
        "jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
        hadoop_jobconf_options,
        "-D mapred.job.name='#{job_name}'",
        hadoop_other_args,
        "-mapper  '#{mapper_commandline}'",
        "-reducer '#{reducer_commandline}'",
        "-input   '#{input_paths}'",
        "-output  '#{output_path}'",
        hadoop_recycle_env,
      ].flatten.compact.join(" \t\\\n  ")
      Log.info "  Launching hadoop!"
      execute_command!(hadoop_commandline)
    end

    def hadoop_jobconf_options
      jobconf_options = []
      # Fixup these options
      options[:reuse_jvms] = '-1'             if (options[:reuse_jvms] == true)
      options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
      # If no reducer_klass and no reduce_command, then skip the reduce phase
      options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
      # Fields hadoop should use to distribute records to reducers
      unless options[:partition_fields].blank?
        jobconf_options += [
          jobconf(:partition_fields),
          jobconf(:output_field_separator),
        ]
      end
      jobconf_options += [
        :key_field_separator,  :sort_fields,
        :map_tasks,            :reduce_tasks,
        :max_node_map_tasks,   :max_node_reduce_tasks,
        :max_reduces_per_node, :max_reduces_per_cluster,
        :max_maps_per_node,    :max_maps_per_cluster,
        :min_split_size,
        :map_speculative,
        :timeout,
        :reuse_jvms, :respect_exit_status
      ].map{|opt| jobconf(opt)}
      jobconf_options.flatten.compact
    end

    def hadoop_other_args
      extra_str_args  = [ options[:extra_args] ]
      if Settings.split_on_xml_tag
        extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
      end
      extra_str_args   << ' -lazyOutput' if options[:noempty]  # don't create reduce file if no records
      extra_str_args   << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
      extra_str_args
    end

    def hadoop_recycle_env
      %w[RUBYLIB].map do |var|
        %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var]
      end.compact
    end

    # The path to the hadoop runner script
    def hadoop_runner
      options[:hadoop_runner] || (options[:hadoop_home]+'/bin/hadoop')
    end

    module ClassMethods
      #
      # Via @pskomoroch via @tlipcon,
      #
      #  "there is a little known Hadoop Streaming trick buried in this Python
      #   script. You will notice that the date is not actually in the raw log
      #   data itself, but is part of the filename. It turns out that Hadoop makes
      #   job parameters you would fetch in Java with something like
      #   job.get("mapred.input.file") available as environment variables for
      #   streaming jobs, with periods replaced with underscores:
      #
      #     filepath = os.environ["map_input_file"]
      #     filename = os.path.split(filepath)[-1]
      #   Thanks to Todd Lipcon for directing me to that hack.
      #

      # HDFS pathname to the input file currently being processed.
      def input_file
        ENV['map_input_file']
      end

      # Directory of the input file
      def input_dir
        ENV['mapred_input_dir']
      end

      # Offset of this chunk within the input file
      def map_input_start_offset
        ENV['map_input_start']
      end

      # length of the mapper's input chunk
      def map_input_length
        ENV['map_input_length']
      end

      def attempt_id
        ENV['mapred_task_id']
      end
      def curr_task_id
        ENV['mapred_tip_id']
      end

      def script_cmdline_urlenc
        ENV['stream_map_streamprocessor']
      end
    end

    # Standard ClassMethods-on-include trick
    def self.included base
      base.class_eval do
        extend ClassMethods
      end
    end
  end
end

        # -partitioner                          org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
        # -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
        # -D mapred.text.key.comparator.options=-k2,2nr\
        # -D mapred.text.key.partitioner.options=-k1,2\
        # -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
        # -D stream.num.map.output.key.fields=\"$sortfields\"
        #
        # -D stream.map.output.field.separator=\"'/t'\"
        # -D    map.output.key.field.separator=. \
        # -D       mapred.data.field.separator=. \
        # -D map.output.key.value.fields.spec=6,5,1-3:0- \
        # -D reduce.output.key.value.fields.spec=0-2:5- \

      # "HADOOP_HOME"                             =>"/usr/lib/hadoop-0.20/bin/..",
      # "HADOOP_IDENT_STRING"                     =>"hadoop",
      # "HADOOP_LOGFILE"                          =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
      # "HADOOP_LOG_DIR"                          =>"/usr/lib/hadoop-0.20/bin/../logs",
      # "HOME"                                    =>"/var/run/hadoop-0.20",
      # "JAVA_HOME"                               =>"/usr/lib/jvm/java-6-sun",
      # "LD_LIBRARY_PATH"                         =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
      # "PATH"                                    =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
      # "USER"                                    =>"hadoop",
      #
      # "dfs_block_size"                          =>"134217728",
      # "map_input_start"                         =>"0",
      # "map_input_length"                        =>"125726898",
      # "mapred_output_key_class"                 =>"org.apache.hadoop.io.Text",
      # "mapred_output_value_class"               =>"org.apache.hadoop.io.Text",
      # "mapred_output_format_class"              =>"org.apache.hadoop.mapred.TextOutputFormat",
      # "mapred_output_compression_codec"         =>"org.apache.hadoop.io.compress.DefaultCodec",
      # "mapred_output_compression_type"          =>"BLOCK",
      # "mapred_task_partition"                   =>"0",
      # "mapred_tasktracker_map_tasks_maximum"    =>"4",
      # "mapred_tasktracker_reduce_tasks_maximum" =>"2",
      # "mapred_tip_id"                           =>"task_200910221152_0023_m_000000",
      # "mapred_task_id"                          =>"attempt_200910221152_0023_m_000000_0",
      # "mapred_job_tracker"                      =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
      #
      # "mapred_input_dir"                        =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
      # "map_input_file"                          =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
      # "mapred_working_dir"                      =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
      # "mapred_work_output_dir"                  =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
      # "mapred_output_dir"                       =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
      # "mapred_temp_dir"                         =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
      # "PWD"                                     =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
      # "TMPDIR"                                  =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
      # "stream_map_streamprocessor"              =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
      # "user_name"                               =>"flip",