lib/citrus.rb in citrus-1.8.0 vs lib/citrus.rb in citrus-2.0.0

- old
+ new

@@ -1,21 +1,23 @@ +require 'strscan' + # Citrus is a compact and powerful parsing library for Ruby that combines the # elegance and expressiveness of the language with the simplicity and power of # parsing expressions. # # http://mjijackson.com/citrus module Citrus autoload :File, 'citrus/file' - VERSION = [1, 8, 0] + VERSION = [2, 0, 0] # Returns the current version of Citrus as a string. def self.version VERSION.join('.') end - # A pattern to match any character, including \n. + # A pattern to match any character, including \\n. DOT = /./m Infinity = 1.0 / 0 F = ::File @@ -37,30 +39,82 @@ # This error is raised whenever a parse fails. class ParseError < Exception def initialize(input) @input = input - c = consumed - s = [0, c.length - 40].max - msg = "Failed to parse input at offset %d" % max_offset - msg += ", just after %s" % c[s, c.length].inspect + "\n" + msg = "Failed to parse input at offset %d" % offset + msg << detail super(msg) end # The Input object that was used for the parse. attr_reader :input - # Returns the maximum offset that was reached before the error occurred. - def max_offset + # Returns the 0-based offset at which the error occurred in the input, i.e. + # the maximum offset in the input that was successfully parsed before the + # error occurred. + def offset input.max_offset end - # Returns the portion of the input string that was successfully consumed - # before the parse failed. - def consumed - input[0, max_offset] + # Returns the text of the line on which the error occurred. + def line + lines[line_index] end + + # Returns the 1-based number of the line in the input where the error + # occurred. + def line_number + line_index + 1 + end + + alias lineno line_number + + # Returns the 0-based offset at which the error occurred on the line on + # which it occurred. + def line_offset + pos = 0 + each_line do |line| + len = line.length + return (offset - pos) if pos + len >= offset + pos += len + end + 0 + end + + # Returns a string that, when printed, gives a visual representation of + # exactly where the error occurred on its line in the input. + def detail + "%s\n%s^" % [line, ' ' * line_offset] + end + + private + + def string + input.string + end + + def lines + string.send(string.respond_to?(:lines) ? :lines : :to_s).to_a + end + + def each_line(&block) + string.each_line(&block) + end + + # Returns the 0-based number of the line in the input where the error + # occurred. + def line_index + pos = 0 + idx = 0 + each_line do |line| + pos += line.length + return idx if pos >= offset + idx += 1 + end + 0 + end end # Inclusion of this module into another extends the receiver with the grammar # helper methods in GrammarMethods. Although this module does not actually # provide any methods, constants, or variables to modules that include it, the @@ -82,18 +136,19 @@ # Extends all modules that +include Grammar+ with GrammarMethods and # exposes Module#include. def self.included(mod) mod.extend(GrammarMethods) + # Expose #include so it can be called publicly. class << mod; public :include end end end # Contains methods that are available to Grammar modules at the class level. module GrammarMethods def self.extend_object(obj) - raise ArgumentError, "Grammars must be Ruby modules" unless Module === obj + raise ArgumentError, "Grammars must be Modules" unless Module === obj super end # Returns the name of this grammar as a string. def name @@ -151,13 +206,13 @@ # given, its return value will be used for the value of +obj+. # # It is important to note that this method will also check any included # grammars for a rule with the given +name+ if one cannot be found in this # grammar. - def rule(name, obj=nil) + def rule(name, obj=nil, &block) sym = name.to_sym - obj = Proc.new.call if block_given? + obj = block.call if block if obj rule_names << sym unless has_rule?(sym) rule = Rule.new(obj) @@ -254,13 +309,13 @@ end # Specifies a Module that will be used to extend all matches created with # the given +rule+. A block may also be given that will be used to create # an anonymous module. See Rule#ext=. - def ext(rule, mod=nil) + def ext(rule, mod=nil, &block) rule = Rule.new(rule) - mod = Proc.new if block_given? + mod = block if block rule.extension = mod if mod rule end # Parses the given input +string+ using the given +options+. If no match can @@ -271,13 +326,15 @@ raise 'No root rule specified' unless opts[:root] root_rule = rule(opts[:root]) raise 'No rule named "%s"' % root unless root_rule - input = Input.new(string, opts[:memoize]) - match = input.match(root_rule, opts[:offset]) + input = Input.new(string) + input.memoize! if opts[:memoize] + input.pos = opts[:offset] if opts[:offset] > 0 + match = input.match(root_rule) if match.nil? || (opts[:consume] && input.length != match.length) raise ParseError.new(input) end match @@ -288,14 +345,12 @@ # # offset:: The offset at which the parse should start. Defaults to 0. # root:: The name of the root rule to use for the parse. Defaults # to the name supplied by calling #root. # memoize:: If this is +true+ the matches generated during a parse are - # memoized. This technique (also known as Packrat parsing) - # guarantees parsers will operate in linear time but costs - # significantly more in terms of time and memory required. - # Defaults to +false+. + # memoized. See Input#memoize! for more information. Defaults to + # +false+. # consume:: If this is +true+ a ParseError will be raised during a parse # unless the entire input string is consumed. Defaults to # +false+. def default_parse_options { :offset => 0, @@ -306,65 +361,84 @@ end end # This class represents the core of the parsing algorithm. It wraps the input # string and serves matches to all nonterminals. - class Input - # Takes the input +string+ that is to be parsed. If +memoize+ is +true+ - # a cache is created that holds references to already generated matches. - def initialize(string, memoize=false) - @string = string + class Input < StringScanner + def initialize(string) + super(string) @max_offset = 0 - if memoize - @cache = {} - @cache_hits = 0 - end end - # The input string. - attr_reader :string - - # The maximum offset that has been achieved. + # The maximum offset that has been achieved during a parse. attr_reader :max_offset - # A two-level hash of rule id's and offsets to their respective matches. - # Only present if memoing is enabled. + # A nested hash of rule id's to offsets and their respective matches. Only + # present if memoing is enabled. attr_reader :cache # The number of times the cache was hit. Only present if memoing is enabled. attr_reader :cache_hits - # Sends all arguments to this input's +string+. - def [](*args) - @string.__send__(:[], *args) - end - # Returns the length of this input. def length - @string.length + string.length end - # Returns the match for a given +rule+ at +offset+. If memoing is enabled - # and a match does not already exist for the given rule/offset pair then - # the rule is executed and the result is cached before returning. See - # http://pdos.csail.mit.edu/~baford/packrat/icfp02/ for more information - # on memoing match results (also known as packrat parsing). - def match(rule, offset=0) - @max_offset = offset if offset > @max_offset + # Returns the match for a given +rule+ at the current position in the input. + def match(rule) + offset = pos + match = rule.match(self) - if @cache - c = @cache[rule.id] ||= {} + if match + @max_offset = pos if pos > @max_offset + else + # Reset the position for the next attempt at a match. + self.pos = offset + end - if c.key?(offset) - @cache_hits += 1 - c[offset] - else - c[offset] = rule.match(self, offset) + match + end + + # Returns true if this input uses memoization to cache match results. See + # #memoize!. + def memoized? + !! @cache + end + + # Modifies this object to cache match results during a parse. This technique + # (also known as "Packrat" parsing) guarantees parsers will operate in + # linear time but costs significantly more in terms of time and memory + # required to perform a parse. For more information, please read the paper + # on Packrat parsing at http://pdos.csail.mit.edu/~baford/packrat/icfp02/. + def memoize! + return if memoized? + + # Using +instance_eval+ here preserves access to +super+ within the + # methods we define inside the block. + instance_eval do + def match(rule) + c = @cache[rule.id] ||= {} + + if c.key?(pos) + @cache_hits += 1 + c[pos] + else + c[pos] = super + end end - else - rule.match(self, offset) + + def reset + super + @max_offset = 0 + @cache = {} + @cache_hits = 0 + end end + + @cache = {} + @cache_hits = 0 end end # A Rule is an object that is used by a grammar to create matches on the # Input during parsing. @@ -378,19 +452,18 @@ end # Returns a new Rule object depending on the type of object given. def self.new(obj) case obj - when Rule then obj - when Symbol then Alias.new(obj) - when String then FixedWidth.new(obj) - when Regexp then Expression.new(obj) - when Array then Sequence.new(obj) - when Range then Choice.new(obj.to_a) - when Numeric then FixedWidth.new(obj.to_s) + when Rule then obj + when Symbol then Alias.new(obj) + when String, Regexp then Terminal.new(obj) + when Array then Sequence.new(obj) + when Range then Choice.new(obj.to_a) + when Numeric then Terminal.new(obj.to_s) else - raise ArgumentError, "Invalid rule object: #{obj.inspect}" + raise ArgumentError, "Invalid rule object: %s" % obj.inspect end end @unique_id = 0 @@ -464,13 +537,12 @@ match.extend(extension) if extension match.names << name if name match end - def create_match(data, offset) - match = Match.new(data, offset) - extend_match(match, name) + def create_match(data) + extend_match(Match.new(data), name) end end # A Proxy is a Rule that is a placeholder for another rule. It stores the # name of some other rule in the grammar internally and resolves it to the @@ -494,14 +566,13 @@ # Returns the underlying Rule for this proxy. def rule @rule ||= resolve! end - # Returns the Match for this proxy's #rule on +input+ at the given +offset+, - # +nil+ if no match can be made. - def match(input, offset=0) - m = input.match(rule, offset) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) + m = input.match(rule) extend_match(m, name) if m end end # An Alias is a Proxy for a rule in the same grammar. It is used in rule @@ -556,76 +627,57 @@ rule end end # A Terminal is a Rule that matches directly on the input stream and may not - # contain any other rule. - module Terminal - include Rule - - def initialize(rule) - @rule = rule - end - - # The actual String or Regexp object this rule uses to match. - attr_reader :rule - - # Returns the Citrus notation of this rule as a string. - def to_s - rule.inspect - end - end - - # A FixedWidth is a Terminal that matches based on its length. The Citrus - # notation is any sequence of characters enclosed in either single or double - # quotes, e.g.: + # contain any other rule. Terminals may be created from either a String or a + # Regexp object. When created from strings, the Citrus notation is any + # sequence of characters enclosed in either single or double quotes, e.g.: # # 'expr' # "expr" # - class FixedWidth - include Terminal - - def initialize(rule='') - raise ArgumentError, "FixedWidth must be a String" unless String === rule - super - end - - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) - create_match(rule.dup, offset) if input[offset, rule.length] == rule - end - end - - # An Expression is a Terminal that has the same semantics as a regular - # expression in Ruby. The expression must match at the beginning of the input - # (index 0). The Citrus notation is identical to Ruby's regular expression - # notation, e.g.: + # When created from a regular expression, the Citrus notation is identical to + # Ruby's regular expression notation, e.g.: # # /expr/ # # Character classes and the dot symbol may also be used in Citrus notation for # compatibility with other parsing expression implementations, e.g.: # # [a-zA-Z] # . # - class Expression - include Terminal + class Terminal + include Rule - def initialize(rule=/^/) - raise ArgumentError, "Expression must be a Regexp" unless Regexp === rule - super + def initialize(rule='') + case rule + when String + @string = rule + @rule = Regexp.new(Regexp.escape(rule)) + when Regexp + @rule = rule + else + raise ArgumentError, "Cannot create terminal from object: %s" % + rule.inspect + end end - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) - m = input[offset, input.length - offset].match(rule) - create_match(m, offset) if m && m.begin(0) == 0 + # The actual Regexp object this rule uses to match. + attr_reader :rule + + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) + m = input.scan(@rule) + create_match(m) if m end + + # Returns the Citrus notation of this rule as a string. + def to_s + (@string || @rule).inspect + end end # A Nonterminal is a Rule that augments the matching behavior of one or more # other rules. Nonterminals may not match directly on the input, but instead # invoke the rule(s) they contain to determine if a match can be made from @@ -667,14 +719,13 @@ # &expr # class AndPredicate include Predicate - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) - create_match('', offset) if input.match(rule, offset) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) + create_match('') if input.match(rule) end # Returns the Citrus notation of this rule as a string. def to_s '&' + rule.embed @@ -688,14 +739,13 @@ # !expr # class NotPredicate include Predicate - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) - create_match('', offset) unless input.match(rule, offset) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) + create_match('') unless input.match(rule) end # Returns the Citrus notation of this rule as a string. def to_s '!' + rule.embed @@ -711,23 +761,20 @@ class ButPredicate include Predicate DOT_RULE = Rule.new(DOT) - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) matches = [] - os = offset - while input.match(rule, os).nil? - m = input.match(DOT_RULE, os) + while input.match(rule).nil? + m = input.match(DOT_RULE) break unless m matches << m - os += m.length end # Create a single match from the aggregate text value of all submatches. - create_match(matches.join, offset) if matches.any? + create_match(matches.join) if matches.any? end # Returns the Citrus notation of this rule as a string. def to_s '~' + rule.embed @@ -755,15 +802,15 @@ end # The label this rule adds to all its matches. attr_reader :label - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. When a Label makes a match, it re-names the match to - # the value of its label. - def match(input, offset=0) - m = input.match(rule, offset) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + # When a Label makes a match, it re-names the match to the value of its + # #label. + def match(input) + m = input.match(rule) extend_match(m, label) if m end # Returns the Citrus notation of this rule as a string. def to_s @@ -797,22 +844,19 @@ super(rule) raise ArgumentError, "Min cannot be greater than max" if min > max @range = Range.new(min, max) end - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) matches = [] - os = offset while matches.length < @range.end - m = input.match(rule, os) + m = input.match(rule) break unless m matches << m - os += m.length end - create_match(matches, offset) if @range.include?(matches.length) + create_match(matches) if @range.include?(matches.length) end # The minimum number of times this rule must match. def min @range.begin @@ -844,10 +888,11 @@ # A List is a Nonterminal that contains any number of other rules and tests # them for matches in sequential order. module List include Nonterminal + # See Rule#paren?. def paren? rules.length > 1 end end @@ -857,15 +902,14 @@ # expr | expr # class Choice include List - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) rules.each do |rule| - m = input.match(rule, offset) + m = input.match(rule) return extend_match(m, name) if m end nil end @@ -881,22 +925,19 @@ # expr expr # class Sequence include List - # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if - # no match can be made. - def match(input, offset=0) + # Returns the Match for this rule on +input+, +nil+ if no match can be made. + def match(input) matches = [] - os = offset rules.each do |rule| - m = input.match(rule, os) + m = input.match(rule) break unless m matches << m - os += m.length end - create_match(matches, offset) if matches.length == rules.length + create_match(matches) if matches.length == rules.length end # Returns the Citrus notation of this rule as a string. def to_s rules.map {|r| r.embed }.join(' ') @@ -905,28 +946,23 @@ # The base class for all matches. Matches are organized into a tree where any # match may contain any number of other matches. This class provides several # convenient tree traversal methods that help when examining parse results. class Match < String - def initialize(data, offset=0) + def initialize(data) case data when String super(data) - when MatchData - super(data[0]) - @captures = data.captures when Array super(data.join) @matches = data + else + raise ArgumentError, "Cannot create match from object: %s" % + data.inspect end - - @offset = offset end - # The offset in the input at which this match occurred. - attr_reader :offset - # An array of all names of this match. A name is added to a match object # for each rule that returns that object when matching. These names can then # be used to determine which rules were satisfied by a given match. def names @names ||= [] @@ -943,15 +979,9 @@ end # An array of all sub-matches of this match. def matches @matches ||= [] - end - - # An array of substrings returned by MatchData#captures if this match was - # created by an Expression. - def captures - @captures ||= [] end # Returns an array of all sub-matches with the given +name+. If +deep+ is # +false+, returns only sub-matches that are immediate descendants of this # match.