#!/usr/bin/env ruby
require 'rubygems'
require 'wukong/script'

class Logline < Struct.new(
  :ip, :date, :time, :http_method, :protocol, :path, :response_code, :duration, :referer, :ua, :tz)

  def page_type
    case
    when path =~ /\.(css|js)$/                  then :asset
    when path =~ /\.(png|gif|ico)$/             then :image
    when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/  then :page
    else                                             :other
    end
  end

  def is_page?
    page_type == :page
  end

  def day_hr
    visit.date + visit.time[0..1]
  end
end


#
# Group all visitors, and then troll through all the pages they've visited
# breaking each into distinct visits (where more than an [hour|day|whatever]
# separate subsequent pageviews
#

#
# Mapper parses log files and created a visitor_id from the visitor's user_id,
# cookie or ip. It emits
#
#    <visitor_id>  <datetime>   <url_path>
#
# where the partition key is visitor_id, and we sort by visitor_id and datetime.
#
class VisitorDatePath < Wukong::Streamer::StructStreamer
  def process visit, *args
    yield [visit.ip, visit.day_hr, visit.path]
  end
end

#
# Reducer:
#
# The reducer is given all page requests for the given visitor id, sorted by
# timestamp.
#
# It group by visits (pageviews separated by more than DISTINCT_VISIT_TIMEGAP)
# and emits
#
#     trail        <visitor_id> <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
#
# where the last is a comma-separated string of URL encoded paths (any internal comma is converted to %2C).
#
# You can instead emit
#
#     page_trails  <page1>      <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
#     page_trails  <page2>      <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
#     ....
#     page_trails  <pagen>      <n_pages_in_visit> <duration> <timestamp> < page1,page2,... >
#
# to discover all trails passing through a given page.
class VisitorDatePath < Wukong::Streamer::Reducer
  def get_key ip, day_hr, path, *args
    [ip, day_hr]
  end
  def process_group visit, *args
    yield [visit.ip, visit.day_hr, visit.path]
  end
end