Sha256: d5af4e02d1a5b483dd9c121e883decd943a705287a28d6d3b47325dc085502f2

Contents?: true

Size: 1.77 KB

Versions: 27

Compression:

Stored size: 1.77 KB

Contents

#!/usr/bin/env ruby
require 'wukong'
# Length of the input record separator ($/). WcMapper adds this to its
# per-line character and byte counts.
# NOTE(review): presumably this compensates for the streamer handing +process+
# lines with the trailing newline already removed -- confirm against
# Wukong::Streamer::LineStreamer.
NEWLINE_LENGTH = $/.length # KLUDGE

#
#
#
# !! The +words+ count comes out higher than that of +wc+ -- don't know
# why. (It's close: on a 10GB, 1M-line dataset it showed 367833839 vs. 367713271.)
#
# Map side of the word count: keeps running totals of lines, tab-separated
# fields, whitespace-separated words, characters and bytes over every line
# passed to +process+, and emits a single record of those totals from
# +after_stream+.
#
# NOTE(review): +before_stream+, +process+, +after_stream+ and +emit+ are
# presumably the hooks/helpers supplied by Wukong::Streamer::LineStreamer --
# confirm against the Wukong version in use.
class WcMapper < Wukong::Streamer::LineStreamer
  attr_accessor :lines, :fields, :words, :chars, :bytes

  # Zero all five counters.
  def before_stream
    self.lines, self.fields, self.words, self.chars, self.bytes = Array.new(5, 0)
  end

  # Fold one input line into the running totals. A nil/false line is skipped.
  def process(line)
    return unless line

    self.lines  = lines + 1
    # A line with no tabs still counts as one field.
    self.fields = fields + line.count("\t") + 1

    # Words = runs of whitespace inside the stripped line, plus one.
    # Blank lines contribute no words.
    unless line.blank?
      self.words = words + line.strip.scan(/\s+/).length + 1
    end

    # NEWLINE_LENGTH is added to both the character and the byte totals.
    char_count = line.chars.to_a.length
    self.chars = chars + char_count + NEWLINE_LENGTH
    byte_count = line.bytesize
    self.bytes = bytes + byte_count + NEWLINE_LENGTH

    # Echo any line whose character and byte lengths differ to stderr.
    $stderr.puts line if char_count != byte_count
  end

  # Emit the accumulated totals as one record.
  def after_stream
    emit([lines, fields, words, chars, bytes])
  end
end

#
#
# Reduce side of the word count: sums the five per-record counts it is given
# (lines, fields, words, chars, bytes) and emits one record with the grand
# totals from +after_stream+.
#
# NOTE(review): the hooks and +emit+ are presumably provided by
# Wukong::Streamer::Base -- confirm against the Wukong version in use.
class WcReducer < Wukong::Streamer::Base
  attr_accessor :lines, :fields, :words, :chars, :bytes

  # Zero all five counters.
  def before_stream
    self.lines, self.fields, self.words, self.chars, self.bytes = Array.new(5, 0)
  end

  # Add one record's counts to the running totals. Each argument is converted
  # with +to_i+ before being added.
  def process(m_lines, m_fields, m_words, m_chars, m_bytes)
    self.lines  = lines  + m_lines.to_i
    self.fields = fields + m_fields.to_i
    self.words  = words  + m_words.to_i
    self.chars  = chars  + m_chars.to_i
    self.bytes  = bytes  + m_bytes.to_i
  end

  # Emit the grand totals as one record.
  def after_stream
    emit([lines, fields, words, chars, bytes])
  end
end

# Build the script from the mapper and reducer classes and run it.
# NOTE(review): :reduce_tasks => 1 presumably forces a single reducer so that
# all mapper totals are summed into one output record -- confirm against the
# Wukong::Script options.
Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run

# class FooScript < Wukong::Script
#   def map_command
#     '/usr/bin/wc'
#   end
#   def reduce_command
#     '/bin/cat'
#   end
# end
# FooScript.new(nil, nil, :reduce_tasks => 1).run
#
#  ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'

Version data entries

27 entries across 27 versions & 3 rubygems

Version Path
mrflip-wukong-0.1.0 bin/hdp-wc
wukong-hadoop-0.2.0 bin/hdp-wc
wukong-hadoop-0.1.1 bin/hdp-wc
wukong-hadoop-0.1.0 bin/hdp-wc
wukong-hadoop-0.0.2 bin/hdp-wc
wukong-hadoop-0.0.1 bin/hdp-wc
wukong-3.0.0.pre bin/hdp-wc
wukong-2.0.2 bin/hdp-wc
wukong-2.0.1 bin/hdp-wc
wukong-2.0.0 bin/hdp-wc
wukong-1.5.4 bin/hdp-wc
wukong-1.5.3 bin/hdp-wc
wukong-1.5.2 bin/hdp-wc
wukong-1.5.1 bin/hdp-wc
wukong-1.5.0 bin/hdp-wc
wukong-1.4.12 bin/hdp-wc
wukong-1.4.11 bin/hdp-wc
wukong-1.4.10 bin/hdp-wc
wukong-1.4.9 bin/hdp-wc
wukong-1.4.7 bin/hdp-wc