lib/citrus.rb in citrus-1.8.0 vs lib/citrus.rb in citrus-2.0.0
- old
+ new
@@ -1,21 +1,23 @@
+require 'strscan'
+
# Citrus is a compact and powerful parsing library for Ruby that combines the
# elegance and expressiveness of the language with the simplicity and power of
# parsing expressions.
#
# http://mjijackson.com/citrus
module Citrus
autoload :File, 'citrus/file'
- VERSION = [1, 8, 0]
+ VERSION = [2, 0, 0]
# Returns the current version of Citrus as a string.
def self.version
VERSION.join('.')
end
- # A pattern to match any character, including \n.
+ # A pattern to match any character, including \\n.
DOT = /./m
Infinity = 1.0 / 0
F = ::File
@@ -37,30 +39,82 @@
# This error is raised whenever a parse fails.
class ParseError < Exception
def initialize(input)
@input = input
- c = consumed
- s = [0, c.length - 40].max
- msg = "Failed to parse input at offset %d" % max_offset
- msg += ", just after %s" % c[s, c.length].inspect + "\n"
+ msg = "Failed to parse input at offset %d" % offset
+ msg << detail
super(msg)
end
# The Input object that was used for the parse.
attr_reader :input
- # Returns the maximum offset that was reached before the error occurred.
- def max_offset
+ # Returns the 0-based offset at which the error occurred in the input, i.e.
+ # the maximum offset in the input that was successfully parsed before the
+ # error occurred.
+ def offset
input.max_offset
end
- # Returns the portion of the input string that was successfully consumed
- # before the parse failed.
- def consumed
- input[0, max_offset]
+ # Returns the text of the line on which the error occurred.
+ def line
+ lines[line_index]
end
+
+ # Returns the 1-based number of the line in the input where the error
+ # occurred.
+ def line_number
+ line_index + 1
+ end
+
+ alias lineno line_number
+
+ # Returns the 0-based offset at which the error occurred on the line on
+ # which it occurred.
+ def line_offset
+ pos = 0
+ each_line do |line|
+ len = line.length
+ return (offset - pos) if pos + len >= offset
+ pos += len
+ end
+ 0
+ end
+
+ # Returns a string that, when printed, gives a visual representation of
+ # exactly where the error occurred on its line in the input.
+ def detail
+ "%s\n%s^" % [line, ' ' * line_offset]
+ end
+
+ private
+
+ def string
+ input.string
+ end
+
+ def lines
+ string.send(string.respond_to?(:lines) ? :lines : :to_s).to_a
+ end
+
+ def each_line(&block)
+ string.each_line(&block)
+ end
+
+ # Returns the 0-based number of the line in the input where the error
+ # occurred.
+ def line_index
+ pos = 0
+ idx = 0
+ each_line do |line|
+ pos += line.length
+ return idx if pos >= offset
+ idx += 1
+ end
+ 0
+ end
end
# Inclusion of this module into another extends the receiver with the grammar
# helper methods in GrammarMethods. Although this module does not actually
# provide any methods, constants, or variables to modules that include it, the
@@ -82,18 +136,19 @@
# Extends all modules that +include Grammar+ with GrammarMethods and
# exposes Module#include.
def self.included(mod)
mod.extend(GrammarMethods)
+ # Expose #include so it can be called publicly.
class << mod; public :include end
end
end
# Contains methods that are available to Grammar modules at the class level.
module GrammarMethods
def self.extend_object(obj)
- raise ArgumentError, "Grammars must be Ruby modules" unless Module === obj
+ raise ArgumentError, "Grammars must be Modules" unless Module === obj
super
end
# Returns the name of this grammar as a string.
def name
@@ -151,13 +206,13 @@
# given, its return value will be used for the value of +obj+.
#
# It is important to note that this method will also check any included
# grammars for a rule with the given +name+ if one cannot be found in this
# grammar.
- def rule(name, obj=nil)
+ def rule(name, obj=nil, &block)
sym = name.to_sym
- obj = Proc.new.call if block_given?
+ obj = block.call if block
if obj
rule_names << sym unless has_rule?(sym)
rule = Rule.new(obj)
@@ -254,13 +309,13 @@
end
# Specifies a Module that will be used to extend all matches created with
# the given +rule+. A block may also be given that will be used to create
# an anonymous module. See Rule#ext=.
- def ext(rule, mod=nil)
+ def ext(rule, mod=nil, &block)
rule = Rule.new(rule)
- mod = Proc.new if block_given?
+ mod = block if block
rule.extension = mod if mod
rule
end
# Parses the given input +string+ using the given +options+. If no match can
@@ -271,13 +326,15 @@
raise 'No root rule specified' unless opts[:root]
root_rule = rule(opts[:root])
raise 'No rule named "%s"' % root unless root_rule
- input = Input.new(string, opts[:memoize])
- match = input.match(root_rule, opts[:offset])
+ input = Input.new(string)
+ input.memoize! if opts[:memoize]
+ input.pos = opts[:offset] if opts[:offset] > 0
+ match = input.match(root_rule)
if match.nil? || (opts[:consume] && input.length != match.length)
raise ParseError.new(input)
end
match
@@ -288,14 +345,12 @@
#
# offset:: The offset at which the parse should start. Defaults to 0.
# root:: The name of the root rule to use for the parse. Defaults
# to the name supplied by calling #root.
# memoize:: If this is +true+ the matches generated during a parse are
- # memoized. This technique (also known as Packrat parsing)
- # guarantees parsers will operate in linear time but costs
- # significantly more in terms of time and memory required.
- # Defaults to +false+.
+ # memoized. See Input#memoize! for more information. Defaults to
+ # +false+.
# consume:: If this is +true+ a ParseError will be raised during a parse
# unless the entire input string is consumed. Defaults to
# +false+.
def default_parse_options
{ :offset => 0,
@@ -306,65 +361,84 @@
end
end
# This class represents the core of the parsing algorithm. It wraps the input
# string and serves matches to all nonterminals.
- class Input
- # Takes the input +string+ that is to be parsed. If +memoize+ is +true+
- # a cache is created that holds references to already generated matches.
- def initialize(string, memoize=false)
- @string = string
+ class Input < StringScanner
+ def initialize(string)
+ super(string)
@max_offset = 0
- if memoize
- @cache = {}
- @cache_hits = 0
- end
end
- # The input string.
- attr_reader :string
-
- # The maximum offset that has been achieved.
+ # The maximum offset that has been achieved during a parse.
attr_reader :max_offset
- # A two-level hash of rule id's and offsets to their respective matches.
- # Only present if memoing is enabled.
+ # A nested hash of rule id's to offsets and their respective matches. Only
+ # present if memoing is enabled.
attr_reader :cache
# The number of times the cache was hit. Only present if memoing is enabled.
attr_reader :cache_hits
- # Sends all arguments to this input's +string+.
- def [](*args)
- @string.__send__(:[], *args)
- end
-
# Returns the length of this input.
def length
- @string.length
+ string.length
end
- # Returns the match for a given +rule+ at +offset+. If memoing is enabled
- # and a match does not already exist for the given rule/offset pair then
- # the rule is executed and the result is cached before returning. See
- # http://pdos.csail.mit.edu/~baford/packrat/icfp02/ for more information
- # on memoing match results (also known as packrat parsing).
- def match(rule, offset=0)
- @max_offset = offset if offset > @max_offset
+ # Returns the match for a given +rule+ at the current position in the input.
+ def match(rule)
+ offset = pos
+ match = rule.match(self)
- if @cache
- c = @cache[rule.id] ||= {}
+ if match
+ @max_offset = pos if pos > @max_offset
+ else
+ # Reset the position for the next attempt at a match.
+ self.pos = offset
+ end
- if c.key?(offset)
- @cache_hits += 1
- c[offset]
- else
- c[offset] = rule.match(self, offset)
+ match
+ end
+
+ # Returns true if this input uses memoization to cache match results. See
+ # #memoize!.
+ def memoized?
+ !! @cache
+ end
+
+ # Modifies this object to cache match results during a parse. This technique
+ # (also known as "Packrat" parsing) guarantees parsers will operate in
+ # linear time but costs significantly more in terms of time and memory
+ # required to perform a parse. For more information, please read the paper
+ # on Packrat parsing at http://pdos.csail.mit.edu/~baford/packrat/icfp02/.
+ def memoize!
+ return if memoized?
+
+ # Using +instance_eval+ here preserves access to +super+ within the
+ # methods we define inside the block.
+ instance_eval do
+ def match(rule)
+ c = @cache[rule.id] ||= {}
+
+ if c.key?(pos)
+ @cache_hits += 1
+ c[pos]
+ else
+ c[pos] = super
+ end
end
- else
- rule.match(self, offset)
+
+ def reset
+ super
+ @max_offset = 0
+ @cache = {}
+ @cache_hits = 0
+ end
end
+
+ @cache = {}
+ @cache_hits = 0
end
end
# A Rule is an object that is used by a grammar to create matches on the
# Input during parsing.
@@ -378,19 +452,18 @@
end
# Returns a new Rule object depending on the type of object given.
def self.new(obj)
case obj
- when Rule then obj
- when Symbol then Alias.new(obj)
- when String then FixedWidth.new(obj)
- when Regexp then Expression.new(obj)
- when Array then Sequence.new(obj)
- when Range then Choice.new(obj.to_a)
- when Numeric then FixedWidth.new(obj.to_s)
+ when Rule then obj
+ when Symbol then Alias.new(obj)
+ when String, Regexp then Terminal.new(obj)
+ when Array then Sequence.new(obj)
+ when Range then Choice.new(obj.to_a)
+ when Numeric then Terminal.new(obj.to_s)
else
- raise ArgumentError, "Invalid rule object: #{obj.inspect}"
+ raise ArgumentError, "Invalid rule object: %s" % obj.inspect
end
end
@unique_id = 0
@@ -464,13 +537,12 @@
match.extend(extension) if extension
match.names << name if name
match
end
- def create_match(data, offset)
- match = Match.new(data, offset)
- extend_match(match, name)
+ def create_match(data)
+ extend_match(Match.new(data), name)
end
end
# A Proxy is a Rule that is a placeholder for another rule. It stores the
# name of some other rule in the grammar internally and resolves it to the
@@ -494,14 +566,13 @@
# Returns the underlying Rule for this proxy.
def rule
@rule ||= resolve!
end
- # Returns the Match for this proxy's #rule on +input+ at the given +offset+,
- # +nil+ if no match can be made.
- def match(input, offset=0)
- m = input.match(rule, offset)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
+ m = input.match(rule)
extend_match(m, name) if m
end
end
# An Alias is a Proxy for a rule in the same grammar. It is used in rule
@@ -556,76 +627,57 @@
rule
end
end
# A Terminal is a Rule that matches directly on the input stream and may not
- # contain any other rule.
- module Terminal
- include Rule
-
- def initialize(rule)
- @rule = rule
- end
-
- # The actual String or Regexp object this rule uses to match.
- attr_reader :rule
-
- # Returns the Citrus notation of this rule as a string.
- def to_s
- rule.inspect
- end
- end
-
- # A FixedWidth is a Terminal that matches based on its length. The Citrus
- # notation is any sequence of characters enclosed in either single or double
- # quotes, e.g.:
+ # contain any other rule. Terminals may be created from either a String or a
+ # Regexp object. When created from strings, the Citrus notation is any
+ # sequence of characters enclosed in either single or double quotes, e.g.:
#
# 'expr'
# "expr"
#
- class FixedWidth
- include Terminal
-
- def initialize(rule='')
- raise ArgumentError, "FixedWidth must be a String" unless String === rule
- super
- end
-
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
- create_match(rule.dup, offset) if input[offset, rule.length] == rule
- end
- end
-
- # An Expression is a Terminal that has the same semantics as a regular
- # expression in Ruby. The expression must match at the beginning of the input
- # (index 0). The Citrus notation is identical to Ruby's regular expression
- # notation, e.g.:
+ # When created from a regular expression, the Citrus notation is identical to
+ # Ruby's regular expression notation, e.g.:
#
# /expr/
#
# Character classes and the dot symbol may also be used in Citrus notation for
# compatibility with other parsing expression implementations, e.g.:
#
# [a-zA-Z]
# .
#
- class Expression
- include Terminal
+ class Terminal
+ include Rule
- def initialize(rule=/^/)
- raise ArgumentError, "Expression must be a Regexp" unless Regexp === rule
- super
+ def initialize(rule='')
+ case rule
+ when String
+ @string = rule
+ @rule = Regexp.new(Regexp.escape(rule))
+ when Regexp
+ @rule = rule
+ else
+ raise ArgumentError, "Cannot create terminal from object: %s" %
+ rule.inspect
+ end
end
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
- m = input[offset, input.length - offset].match(rule)
- create_match(m, offset) if m && m.begin(0) == 0
+ # The actual Regexp object this rule uses to match.
+ attr_reader :rule
+
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
+ m = input.scan(@rule)
+ create_match(m) if m
end
+
+ # Returns the Citrus notation of this rule as a string.
+ def to_s
+ (@string || @rule).inspect
+ end
end
# A Nonterminal is a Rule that augments the matching behavior of one or more
# other rules. Nonterminals may not match directly on the input, but instead
# invoke the rule(s) they contain to determine if a match can be made from
@@ -667,14 +719,13 @@
# &expr
#
class AndPredicate
include Predicate
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
- create_match('', offset) if input.match(rule, offset)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
+ create_match('') if input.match(rule)
end
# Returns the Citrus notation of this rule as a string.
def to_s
'&' + rule.embed
@@ -688,14 +739,13 @@
# !expr
#
class NotPredicate
include Predicate
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
- create_match('', offset) unless input.match(rule, offset)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
+ create_match('') unless input.match(rule)
end
# Returns the Citrus notation of this rule as a string.
def to_s
'!' + rule.embed
@@ -711,23 +761,20 @@
class ButPredicate
include Predicate
DOT_RULE = Rule.new(DOT)
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
matches = []
- os = offset
- while input.match(rule, os).nil?
- m = input.match(DOT_RULE, os)
+ while input.match(rule).nil?
+ m = input.match(DOT_RULE)
break unless m
matches << m
- os += m.length
end
# Create a single match from the aggregate text value of all submatches.
- create_match(matches.join, offset) if matches.any?
+ create_match(matches.join) if matches.any?
end
# Returns the Citrus notation of this rule as a string.
def to_s
'~' + rule.embed
@@ -755,15 +802,15 @@
end
# The label this rule adds to all its matches.
attr_reader :label
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made. When a Label makes a match, it re-names the match to
- # the value of its label.
- def match(input, offset=0)
- m = input.match(rule, offset)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ # When a Label makes a match, it re-names the match to the value of its
+ # #label.
+ def match(input)
+ m = input.match(rule)
extend_match(m, label) if m
end
# Returns the Citrus notation of this rule as a string.
def to_s
@@ -797,22 +844,19 @@
super(rule)
raise ArgumentError, "Min cannot be greater than max" if min > max
@range = Range.new(min, max)
end
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
matches = []
- os = offset
while matches.length < @range.end
- m = input.match(rule, os)
+ m = input.match(rule)
break unless m
matches << m
- os += m.length
end
- create_match(matches, offset) if @range.include?(matches.length)
+ create_match(matches) if @range.include?(matches.length)
end
# The minimum number of times this rule must match.
def min
@range.begin
@@ -844,10 +888,11 @@
# A List is a Nonterminal that contains any number of other rules and tests
# them for matches in sequential order.
module List
include Nonterminal
+ # See Rule#paren?.
def paren?
rules.length > 1
end
end
@@ -857,15 +902,14 @@
# expr | expr
#
class Choice
include List
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
rules.each do |rule|
- m = input.match(rule, offset)
+ m = input.match(rule)
return extend_match(m, name) if m
end
nil
end
@@ -881,22 +925,19 @@
# expr expr
#
class Sequence
include List
- # Returns the Match for this rule on +input+ at the given +offset+, +nil+ if
- # no match can be made.
- def match(input, offset=0)
+ # Returns the Match for this rule on +input+, +nil+ if no match can be made.
+ def match(input)
matches = []
- os = offset
rules.each do |rule|
- m = input.match(rule, os)
+ m = input.match(rule)
break unless m
matches << m
- os += m.length
end
- create_match(matches, offset) if matches.length == rules.length
+ create_match(matches) if matches.length == rules.length
end
# Returns the Citrus notation of this rule as a string.
def to_s
rules.map {|r| r.embed }.join(' ')
@@ -905,28 +946,23 @@
# The base class for all matches. Matches are organized into a tree where any
# match may contain any number of other matches. This class provides several
# convenient tree traversal methods that help when examining parse results.
class Match < String
- def initialize(data, offset=0)
+ def initialize(data)
case data
when String
super(data)
- when MatchData
- super(data[0])
- @captures = data.captures
when Array
super(data.join)
@matches = data
+ else
+ raise ArgumentError, "Cannot create match from object: %s" %
+ data.inspect
end
-
- @offset = offset
end
- # The offset in the input at which this match occurred.
- attr_reader :offset
-
# An array of all names of this match. A name is added to a match object
# for each rule that returns that object when matching. These names can then
# be used to determine which rules were satisfied by a given match.
def names
@names ||= []
@@ -943,15 +979,9 @@
end
# An array of all sub-matches of this match.
def matches
@matches ||= []
- end
-
- # An array of substrings returned by MatchData#captures if this match was
- # created by an Expression.
- def captures
- @captures ||= []
end
# Returns an array of all sub-matches with the given +name+. If +deep+ is
# +false+, returns only sub-matches that are immediate descendants of this
# match.