This class handles the parsing of each line in the log file.
# Class-level logger handle, read from ActiveRecord::Base.logger when this line is evaluated.
# NOTE(review): none of the methods visible here read @@log - confirm it is still needed.
@@log = ActiveRecord::Base.logger
# File lib/vizi/vizi_tracker.rb, line 117
# Stores the classification rules used by parse_line, builds the table of known
# log formats and opens the parser's own warning log.
#
# drop_ips::                 collection searched with #index for the visitor ip; hits are marked as (D)rop lines
# spider_ips::               collection searched with #index for the visitor ip; hits are marked as (S)pider visitors
# spider_names::             list handed to match_partial together with the user agent
# page_extensions::          list handed to match_partial together with the requested uri stem
# homepage::                 string looked up inside the uri stem when accept_only_homepage is set
# accept_only_homepage::     when truthy only the homepage is flagged as a page
# hostname::                 string looked up inside the referer
# drop_refers_by_hostname::  when truthy, lines whose referer contains hostname are dropped
#
# Side effect: opens ./log/parse.log with weekly rotation and limits it to WARN and above.
def initialize(drop_ips, spider_ips, spider_names, page_extensions, homepage, accept_only_homepage, hostname, drop_refers_by_hostname)
  @drops = drop_ips
  @sips = spider_ips
  @snames = spider_names
  @page_extensions = page_extensions
  @homepage = homepage
  @accept_only_homepage = accept_only_homepage
  @hostname = hostname
  @drop_refers_by_hostname = drop_refers_by_hostname
  @log_format = []
  initialize_known_formats
  # The rotation period is Logger.new's second positional argument; the former
  # `shift_age = 'weekly'` only created a throw-away local variable.
  @parselog = Logger.new('./log/parse.log', 'weekly')
  @parselog.level = Logger::WARN
end
# File lib/vizi/vizi_tracker.rb, line 150
# Translates a whitespace-separated field list line into a format string.
#
# The first token of the line is skipped; every following token is looked up in
# FIELDNAMES and emitted as "%<value> " (note the trailing space after each entry).
#
# line:: the field list line
#
# Returns the format string, which is also stored in @format.
#
# NOTE(review): a token without a FIELDNAMES entry is not handled here - presumably the
# lookup yields nil, in which case the string concatenation raises TypeError. Confirm.
def build_format(line)
  @format = ""
  line.split(' ').drop(1).each do |field_name|
    @format << ("%" + FIELDNAMES[field_name] + " ")
  end
  @format
end
Checks which standard format the log file (well, one line of it) follows. Automatically checks the most complex (longest) regex first.
# File lib/vizi/vizi_tracker.rb, line 143
# Identifies which known log format a single log line matches.
#
# Candidates are tried from the longest regex source to the shortest, so the most
# complex pattern wins when several would match.
#
# line:: one line of the log file
#
# Returns the key of the first matching entry of @known_formats, or :unknown when
# no entry matches.
def check_format(line)
  by_regex_length = @known_formats.sort_by { |_name, log_format| log_format.format_regex.source.size }
  # Walk the ascending sort backwards instead of building a reversed copy.
  matched = by_regex_length.reverse_each.find { |_name, log_format| line.match(log_format.format_regex) }
  matched ? matched.first : :unknown
end
Processes each format string into symbols and a test regex, and saves the result using the LogFormat class.
# File lib/vizi/vizi_tracker.rb, line 134
# Rebuilds @known_formats from scratch: every LOG_FORMATS entry is wrapped in a
# Vizi::LogFormat and stored under the entry's own name.
def initialize_known_formats
  @known_formats = {}
  LOG_FORMATS.each { |label, pattern| @known_formats[label] = Vizi::LogFormat.new(label, pattern) }
end
# File lib/vizi/vizi_tracker.rb, line 162
# Looks for +field+ in +fldarray+, first as an exact element and then as a string
# that contains one of the elements.
#
# The previous version returned right after the exact lookup, so the partial-match
# loop below it never ran (and could not have run: it tested an undefined variable
# and searched for the whole array instead of one element).
#
# field::    the value to classify; anything that is not a String can only match exactly
# fldarray:: the list of candidates
#
# Returns the index within +fldarray+ of the matching element, or nil when nothing
# matches. Callers in this file only test the result for truthiness.
def match_partial(field, fldarray)
  exact = fldarray.index(field)
  return exact unless exact.nil?
  return nil unless field.is_a?(String)

  # Empty candidates are skipped: every string contains "", which would turn a
  # blank list entry into a match for everything.
  fldarray.index do |candidate|
    candidate.is_a?(String) && !candidate.empty? && field.include?(candidate)
  end
end
Apache files: match the line against the known regexes to determine the log format name. IIS files: parse the fields string to determine the file contents.
# File lib/vizi/vizi_tracker.rb, line 177
# Parses one log line into a hash and classifies it.
#
# line::      one line of the log file
# logformat:: a log format object to split the line with (the line is then split on
#             whitespace and the format is reported as "temp"), or nil to detect the
#             format with check_format and extract the fields with its regex from the
#             downcased line
#
# Returns a hash with one entry per format symbol plus these bookkeeping keys:
# :p_logformatname:: name of the format used, as a string
# :p_logformat::     the +logformat+ argument, a freshly built format for a "w3c_f"
#                    line, or nil after an embedded comment line
# :p_visitortype::   "H" by default, "S" for spider ips/user agents, "-" where not relevant
# :p_linetype::      "V" by default, "C" for comments, "F" for a field list, "D" for dropped lines
# :p_pageflag::      (data lines only) whether the request counts as a page
# :datetime::        (data lines only) UTC Time built from the parsed wall-clock fields
# :p_searchphrase::  (data lines only) search phrase found in the referer, if any
#
# Raises ArgumentError when no format is given and the line matches none of the known formats.
def parse_line(line, logformat)
  if logformat.nil?
    @format_name = check_format(line) # look for matching formats, check each time
    log_format = @known_formats[@format_name]
    if log_format.nil? || line !~ log_format.format_regex
      raise ArgumentError, "unrecognized log line format: #{line.inspect}"
    end
    data = line.downcase.scan(log_format.format_regex).flatten
  else
    log_format = logformat
    @format_name = "temp"
    data = line.split(' ')
  end

  # Load the data for each format symbol, by position.
  parsed_data = {}
  log_format.format_symbols.each_with_index do |symbol, position|
    parsed_data[symbol] = data[position]
  end

  if parsed_data[:dtstring]
    # Drop the first and last character, then turn the first ":" into a space so
    # the value can be handed to Time.parse further down.
    parsed_data[:dtstring] = parsed_data[:dtstring][1...-1].sub(":", " ")
  end

  parsed_data[:p_logformatname] = @format_name.to_s
  parsed_data[:p_logformat] = logformat
  parsed_data[:p_visitortype] = "H" # default visitor type is (H)uman
  parsed_data[:p_linetype] = "V"    # default line type is (V)isitor
  parsed_data[:p_linetype] = "C" if parsed_data[:ip].nil? # no ip: treat as a comment line

  if @format_name.to_s == "w3c_f" # IIS field list line
    @format = build_format(line)
    parsed_data[:p_logformat] = Vizi::LogFormat.new(:temp, @format) # shuttle the new format to the caller
    parsed_data[:p_logformatname] = "temp"
    parsed_data[:p_linetype] = "F"    # (F)ield list
    parsed_data[:p_visitortype] = "-" # visitor type not relevant
  elsif @format_name.to_s == "w3c_c" # IIS comment line
    parsed_data[:p_linetype] = "C"
    parsed_data[:p_visitortype] = "-"
  elsif parsed_data[:p_linetype] == "C"
    @parselog.warn line
    @parselog.warn "Found comment lines embedded in the log file ... resetting to nil"
    parsed_data[:p_logformat] = nil
  else
    # A separate date/time pair is evaluated first; a combined :dtstring overrides it.
    if parsed_data[:datestring]
      dt = Time.parse(parsed_data[:datestring] + " " + parsed_data[:timestring])
      parsed_data[:datetime] = Time.gm(dt.year, dt.month, dt.day, dt.hour, dt.min, dt.sec)
    end
    if parsed_data[:dtstring]
      dt = Time.parse(parsed_data[:dtstring])
      parsed_data[:datetime] = Time.gm(dt.year, dt.month, dt.day, dt.hour, dt.min, dt.sec)
    end

    if parsed_data[:request]
      # Slashes become separators, so the stem is the second token of the request.
      parsed_data[:csuristem] = parsed_data[:request].gsub("/", " ").split[1]
    end

    # Now classify records based on logger rules ...
    uri_stem = parsed_data[:csuristem]
    parsed_data[:p_pageflag] = false
    if @accept_only_homepage
      # A line without a uri stem used to raise NoMethodError here; it is simply not a page.
      # NOTE(review): position 1 fits a stem with one leading character such as
      # "/index.html"; a stem derived from :request above has its slashes removed - confirm.
      parsed_data[:p_pageflag] = true if uri_stem && uri_stem.downcase.index(@homepage) == 1
    elsif match_partial(uri_stem, @page_extensions)
      parsed_data[:p_pageflag] = true
    end

    parsed_data[:p_linetype] = "D" if @drops.index(parsed_data[:ip])
    parsed_data[:p_visitortype] = "S" if @sips.index(parsed_data[:ip])

    user_agent = parsed_data[:user_agent]
    parsed_data[:p_visitortype] = "S" if user_agent && match_partial(user_agent, @snames)

    referer = parsed_data[:referer]
    if referer
      # Only a phrase that is followed by another "&" parameter is captured.
      phrase = /(search\?\S*?[pq])=(\S*?)(&)/.match(referer)
      parsed_data[:p_searchphrase] = phrase[2] if phrase
      # This check used to be written twice in a row with identical effect.
      parsed_data[:p_linetype] = "D" if @drop_refers_by_hostname && referer.index(@hostname)
    end
  end

  parsed_data
end
Generated with the Darkfish Rdoc Generator 2.