require 'imw/parsers/line_parser'

module IMW
  module Parsers

    # A RegexpParser is a line-oriented parser: each line is matched
    # against a regular expression and the captured groups are
    # collected into a hash (or an object obeying hash semantics).
    #
    # As an example, a flat file with one record per line in the
    # following format (a simplified version of common webserver log
    # formats)
    #
    #   151.199.53.145 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/main.php HTTP/1.0
    #   81.227.179.120 14-Oct-2007:13:34:34-0500 GET /phpmyadmin/libraries/select_lang.lib.php HTTP/1.0
    #   81.3.107.173 14-Oct-2007:13:54:26-0500 GET / HTTP/1.1
    #   ...
    #
    # could be parsed as follows
    #
    #   file   = File.new '/path/to/file.log'
    #   parser = IMW::Parsers::RegexpParser.new :by_regexp   => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
    #                                           :into_fields => [:ip, :timestamp, :verb, :url, :version]
    #   parser.parse file #=> [{:ip => '151.199.53.145', :timestamp => '14-Oct-2007:13:34:34-0500', :verb => 'GET', :url => '/phpmyadmin/main.php', :version => "1.0"}, ... ]
    #
    # Consecutive captures from the regular expression are stored in
    # a hash under the keys given, in order, by the +fields+ property
    # of this parser.
    #
    # If the parser is instantiated with the <tt>:of</tt> keyword then
    # the parsed hash from each line is used to instantiate a new
    # object of the corresponding class:
    #
    #   require 'ostruct'
    #
    #   PageView = Class.new(OpenStruct)
    #
    #   parser = IMW::Parsers::RegexpParser.new :by_regexp   => %r{^([\d\.]+) (\d{2}-\w{3}-\d{4}:\d{2}:\d{2}:\d{2}-\d{4}) (\w+) ([^\s]+) HTTP/([\d.]{3})$},
    #                                           :into_fields => [:ip, :timestamp, :verb, :url, :version],
    #                                           :of          => PageView
    #
    #   parser.parse! file #=> [#<PageView ip="151.199.53.145", ...>, ... ]
    #
    # The option <tt>:strictly</tt> (or <tt>:strict</tt>) can also be
    # set to force the parser to raise an error if it finds a line
    # which doesn't match its regexp.
    class RegexpParser < LineParser

      attr_accessor :regexp, :fields, :strict

      # Build a parser from an options hash.
      #
      # Each setting may be given under either of two keys; the first
      # one listed wins when it is present and truthy:
      #
      # * <tt>:regexp</tt> / <tt>:by_regexp</tt>   -- the pattern each line is matched against
      # * <tt>:fields</tt> / <tt>:into_fields</tt> -- hash keys, one per capture group, in order
      # * <tt>:strict</tt> / <tt>:strictly</tt>    -- raise on lines that do not match
      #
      # The complete options hash is then handed on to LineParser.
      def initialize(options = {})
        @regexp = options[:regexp] || options[:by_regexp]
        @fields = options[:fields] || options[:into_fields]
        @strict = options[:strict] || options[:strictly]
        super(options)
      end

      # Match a single +line+ (with its trailing newline removed)
      # against +regexp+ and return a hash mapping each entry of
      # +fields+ to the capture at the same position.
      #
      # A line that does not match yields an empty hash, unless
      # +strict+ is set, in which case IMW::ParseError is raised.
      def parse_line(line)
        matched = regexp.match(line.chomp)

        unless matched
          raise IMW::ParseError.new("Could not parse the following line:\n\n#{line}\n\nusing regexp\n\n#{regexp.to_s}") if strict
          return {}
        end

        record = {}
        # Captures are paired with field names purely by position.
        matched.captures.each_with_index do |value, position|
          record[fields[position]] = value
        end
        record
      end

    end
  end
end