examples/apache_log_parser.rb in wukong-1.4.0 vs examples/apache_log_parser.rb in wukong-1.4.1

- old
+ new

@@ -3,25 +3,25 @@ require 'wukong' module ApacheLogParser class Mapper < Wukong::Streamer::LineStreamer + # regular expression for apache-style log lines + # note that we strip out the google analytics listener. + LOG_RE = %r{\A + (\d+\.\d+\.\d+\.\d+) # IP addr + \s([^\s]+)\s # - + \s([^\s]+) # - + \s\[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] # [07/Jun/2008:20:37:11 +0000] + \s(\d+) # 400 + \s"([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" # "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1" + \s(\d+) # 173 + \s"([^\"]*)" "([^\"]*)" "([^\"]*)" # "-" "-" "-" + \z}x - def parse_request req - m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req) - if m - [''] + m.captures - else - [req, '', '', ''] - end - end - - - # regular expression to match on apache-style log lines - # IP addr - - [07/Jun/2008:20:37:11 +0000] 400 "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1" 173 "-" "-" "-" - LOG_RE = %r{\A(\d+\.\d+\.\d+\.\d+) ([^\s]+) ([^\s]+) \[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] (\d+) "([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" (\d+) "([^\"]*)" "([^\"]*)" "([^\"]*)"\z} - + # Use the regex to break line into fields + # Emit each record as flat line def process line line.chomp m = LOG_RE.match(line) if m ip, j1, j2, datepart, timepart, tzpart, resp, req, j3, ref, ua, j4 = m.captures @@ -30,10 +30,22 @@ yield [:logline, method, path, protocol, ip, j1, j2, req_date, resp, req, j3, ref, ua, j4] else yield [:unparseable, line] end end + + + def parse_request req + m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req) + if m + [''] + m.captures + else + [req, '', '', ''] + end + end + end + class Reducer < Wukong::Streamer::LineStreamer end # Execute the script