examples/apache_log_parser.rb in wukong-1.4.6 vs examples/apache_log_parser.rb in wukong-1.4.7
- old
+ new
@@ -1,65 +1,74 @@
#!/usr/bin/env ruby
$: << File.dirname(__FILE__)+'/../lib'
+require 'rubygems'
require 'wukong'
+MONTHS = { # three-letter month abbreviation (as found in Apache timestamps) => zero-padded month number
+ 'Jan' => '01',
+ 'Feb' => '02',
+ 'Mar' => '03',
+ 'Apr' => '04',
+ 'May' => '05',
+ 'Jun' => '06',
+ 'Jul' => '07',
+ 'Aug' => '08',
+ 'Sep' => '09',
+ 'Oct' => '10',
+ 'Nov' => '11',
+ 'Dec' => '12',
+}.freeze # frozen so this shared lookup table cannot be mutated by accident
module ApacheLogParser
class Mapper < Wukong::Streamer::LineStreamer
- # regular expression for apache-style log lines
- # note that we strip out the google analytics listener.
- LOG_RE = %r{\A
- (\d+\.\d+\.\d+\.\d+) # IP addr
- \s([^\s]+)\s # -
- \s([^\s]+) # -
- \s\[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] # [07/Jun/2008:20:37:11 +0000]
- \s(\d+) # 400
- \s"([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" # "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1"
- \s(\d+) # 173
- \s"([^\"]*)" "([^\"]*)" "([^\"]*)" # "-" "-" "-"
- \z}x
+ #
+ # Regular expression to parse an apache log line: ip, two ignored fields,
+ # timestamp, request, response code, one numeric field, referer, user agent.
+ #
+ # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
+ #
+ # The timezone accepts either sign (+0000 or -0500) and stops at the closing
+ # bracket. The field after the response code may also be a bare "-"
+ # (NOTE(review): if this is Apache's %b byte count, Apache logs "-" when no
+ # body was sent -- confirm against the LogFormat in use).
+ # The number and order of captures are unchanged, so code that destructures
+ # m.captures positionally keeps working.
+ #
+ LOG_RE = %r{\A
+ (\S+) # ip 83.240.154.3
+ \s(\S+) # j1 -
+ \s(\S+) # j2 -
+ \s\[(\d+)/(\w+)/(\d+) # date part [07/Jun/2008
+ :(\d+):(\d+):(\d+) # time part :20:37:11
+ \s([+-][^\]]*)\] # timezone +0000]
+ \s\"(?:(\S+) # http_method "GET
+ \s(\S+) # path /faq
+ \s(\S+)|-)" # protocol HTTP/1.1"
+ \s(\d+) # response_code 200
+ \s(\d+|-) # duration 569
+ \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
+ \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
+ \z}x
# Use the regex to break line into fields
# Emit each record as flat line
+ # Matches one log line against LOG_RE and yields either
+ #   [:logline, ip, req_date, req_time, http_method, protocol, path,
+ #    response_code, duration, referer, ua, req_tz]
+ # or [:unparseable, line] when the line does not match.
+ # req_date is year + month number + day and req_time is hour + minute + second,
+ # each joined with no separator.
def process line
- line.chomp
+ # String#chomp returns a new string and leaves the receiver untouched; the
+ # bare call threw the result away, so a trailing newline would have
+ # defeated the \z anchor in LOG_RE. Assign the stripped copy back.
+ line = line.chomp
m = LOG_RE.match(line)
if m
- ip, j1, j2, datepart, timepart, tzpart, resp, req, j3, ref, ua, j4 = m.captures
- req_date = DateTime.parse("#{datepart} #{timepart} #{tzpart}").to_flat
- req, method, path, protocol = parse_request(req)
- yield [:logline, method, path, protocol, ip, j1, j2, req_date, resp, req, j3, ref, ua, j4]
+ (ip, j1, j2,
+ ts_day, ts_mo, ts_year,
+ ts_hour, ts_min, ts_sec, req_tz,
+ http_method, path, protocol,
+ response_code, duration,
+ referer, ua, *cruft) = m.captures
+ # DateTime.parse("#{datepart} #{timepart}").to_flat # this takes way too long
+ req_date = [ts_year, MONTHS[ts_mo], ts_day].join("")
+ req_time = [ts_hour, ts_min, ts_sec].join("")
+ yield [:logline, ip, req_date, req_time, http_method, protocol, path, response_code, duration, referer, ua, req_tz]
else
yield [:unparseable, line]
end
end
-
- def parse_request req
- m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
- if m
- [''] + m.captures
- else
- [req, '', '', '']
- end
- end
-
end
+end
+Wukong::Script.new(ApacheLogParser::Mapper, nil, :sort_fields => 7).run # builds the script with the mapper class, nil as second argument, and :sort_fields => 7, then runs it; NOTE(review): presumably nil means "no reducer class" -- confirm against Wukong::Script
- class Reducer < Wukong::Streamer::LineStreamer
- end
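+# NOTE: the sample line below uses the older field order (response code before the request, plus an extra trailing "-" field); LOG_RE above does not match it.
+# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"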
- # Execute the script
- class Script < Wukong::Script
- def reduce_command
- "/usr/bin/uniq"
- end
- def default_options
- super.merge :sort_fields => 8 # , :reduce_tasks => 0
- end
- end
- Script.new(Mapper,nil).run
-end
-# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"
+