Sha256: 81ead02a0917f3cac4a596c4a328896a1c6827ebd57ad67cb3c91bff60708adf

Contents?: true

Size: 1.23 KB

Versions: 3

Compression:

Stored size: 1.23 KB

Contents


module Wukong
  #
  # Helper methods for building Pig relations. The instance methods call
  # +generate+, +foreach+ and +temp_rel+ on the receiver, and +distinct+,
  # +group+, +foreach+ and +relation+ on the objects those calls return.
  # None of these are defined in this file, so the including object must
  # supply them (presumably a Pig relation class -- verify against the
  # includer).
  #
  module AndPig

    #
    # Load the main class definitions
    #
    # Prints the contents of the init_load.pig file found under PIG_DEFS_DIR
    # to standard output.
    #
    def self.init_load
      # File.read opens, reads and closes the file in a single call, so no
      # file handle is left open until the garbage collector gets to it.
      puts File.read(PIG_DEFS_DIR + "/init_load.pig")
    end

    #
    # OK we're going to cheat here:
    # just cat the file in, and treat it as a scalar
    #
    # The real lookup is disabled and the method currently returns a
    # hard-coded placeholder; +path+ is ignored.
    #
    # * path - location the scalar would be read from (unused for now)
    #
    # Returns the String "636".
    #
    def load_scalar(path)
      # Disabled implementation, kept for reference:
      #   `hadoop dfs -cat '#{path}/part-*' | head -n1 `.chomp
      # NOTE(review): path is interpolated straight into a shell command
      # there; escape it if this is ever switched back on.
      "636"
    end

    #
    # Count the distinct values of an attribute within each group
    # * dest_rel - Relation to store
    # * attr     - attribute whose distinct values are counted
    # * group_by - grouping spec, handed to +group+ untouched
    #
    # Projects +attr+, de-duplicates it (with :parallel => 10), groups the
    # result by +group_by+ and emits COUNT(...) AS n_<attr> into +dest_rel+.
    #
    # Returns whatever the final +foreach+ call returns.
    #
    def count_distinct(dest_rel, attr, group_by)
      # temp_rel is called once per intermediate step, in the same order as
      # before, in case it hands out a fresh name (or has other side effects)
      # on each call.
      projected  = generate(temp_rel(dest_rel), attr)
      distincted = projected.distinct(temp_rel(dest_rel), :parallel => 10)
      grouped    = distincted.group(temp_rel(dest_rel), group_by)
      grouped.foreach(dest_rel, "GENERATE COUNT(#{distincted.relation}.#{attr}) AS n_#{attr}")
    end

    #
    # Group a relation into bins, and return the counts for each bin
    # * dest_rel - Relation to store
    #   {<bin_attr>_bin, <bin_attr>_count}
    # * bin_attr - attribute being binned; also used to name the output fields
    # * bin_expr - expression that computes the bin (defaults to bin_attr)
    #
    # Returns whatever the final +foreach+ call returns.
    #
    def histogram(dest_rel, bin_attr, bin_expr=nil)
      bin_expr ||= bin_attr
      bin_name = "#{bin_attr}_bin"
      binned   = foreach(temp_rel(dest_rel), "GENERATE #{bin_expr} AS #{bin_name}")
      grouped  = binned.group(temp_rel(dest_rel), :by => bin_name)
      grouped.foreach(dest_rel, "GENERATE group AS #{bin_name}, COUNT(#{binned.relation}) AS #{bin_attr}_count")
    end

  end
end

Version data entries

3 entries across 3 versions & 2 rubygems

Version Path
mrflip-wukong-0.1.0 lib/wukong/and_pig/junk.rb
wukong-0.1.4 lib/wukong/and_pig/junk.rb
wukong-0.1.1 lib/wukong/and_pig/junk.rb