module EBNF::LL1
  require 'ebnf/ll1/scanner' unless defined?(Scanner)

  ##
  # A lexical analyzer
  #
  # @example Tokenizing a Turtle string
  #   terminals = [
  #     [:BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL}))],
  #     ...
  #   ]
  #   ttl = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ."
  #   lexer = EBNF::LL1::Lexer.tokenize(ttl, terminals)
  #   lexer.each_token do |token|
  #     puts token.inspect
  #   end
  #
  # @example Tokenizing and returning a token stream
  #   lexer = EBNF::LL1::Lexer.tokenize(...)
  #   while :some-condition
  #     token = lexer.first  # Get the current token
  #     token = lexer.shift  # Get the current token and shift to the next
  #   end
  #
  # @example Handling error conditions
  #   begin
  #     EBNF::LL1::Lexer.tokenize(query)
  #   rescue EBNF::LL1::Lexer::Error => error
  #     warn error.inspect
  #   end
  #
  # @see http://en.wikipedia.org/wiki/Lexical_analysis
  class Lexer
    include Enumerable

    ESCAPE_CHARS = {
      '\\t'  => "\t",   # \u0009 (tab)
      '\\n'  => "\n",   # \u000A (line feed)
      '\\r'  => "\r",   # \u000D (carriage return)
      '\\b'  => "\b",   # \u0008 (backspace)
      '\\f'  => "\f",   # \u000C (form feed)
      '\\"'  => '"',    # \u0022 (quotation mark, double quote mark)
      "\\'"  => '\'',   # \u0027 (apostrophe-quote, single quote mark)
      '\\\\' => '\\'    # \u005C (backslash)
    }.freeze
    ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/.freeze    # \uXXXX
    ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/.freeze    # \UXXXXXXXX
    ECHAR        = /\\./.freeze                        # More liberal unescaping
    UCHAR        = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/.freeze
    COMMENT      = /#.*/.freeze
    WS           = / |\t|\r|\n/m.freeze
    ML_START     = /\'\'\'|\"\"\"/.freeze              # Beginning of terminals that may span lines

    ##
    # @return [Regexp] defines whitespace, defaults to WS
    attr_reader :whitespace

    ##
    # @return [Regexp] defines single-line comment, defaults to COMMENT
    attr_reader :comment

    ##
    # Returns a copy of the given `input` string with all `\uXXXX` and
    # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
    # unescaped UTF-8 character counterparts.
    #
    # @param  [String] string
    # @return [String]
    # @see    http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
    def self.unescape_codepoints(string)
      string = string.dup
      string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)

      # Decode \uXXXX and \UXXXXXXXX code points:
      string = string.gsub(UCHAR) do |c|
        s = [(c[2..-1]).hex].pack('U*')
        s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
      end

      string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
      string
    end

    ##
    # Returns a copy of the given `input` string with all string escape
    # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
    # character counterparts.
    #
    # @param  [String] input
    # @return [String]
    # @see    http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
    def self.unescape_string(input)
      input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1] }
    end
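
    # A quick, illustrative check of the two unescaping helpers above
    # (these snippets are a sketch, not part of the library's documented
    # examples):
    #
    #   EBNF::LL1::Lexer.unescape_codepoints("\\u0041")  #=> "A"
    #   EBNF::LL1::Lexer.unescape_string("\\n")          #=> "\n"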
    ##
    # Tokenizes the given `input` string or stream.
    #
    # @param [String, #to_s] input
    # @param [Array<Array<Symbol, Regexp>>] terminals
    #   Array of symbol, regexp pairs used to match terminals.
    #   If the symbol is nil, it defines a Regexp to match string terminals.
    # @param [Hash{Symbol => Object}] options
    # @yield  [lexer]
    # @yieldparam [Lexer] lexer
    # @return [Lexer]
    # @raise [Lexer::Error] on invalid input
    def self.tokenize(input, terminals, options = {}, &block)
      lexer = self.new(input, terminals, options)
      block_given? ? block.call(lexer) : lexer
    end

    ##
    # Initializes a new lexer instance.
    #
    # @param [String, #to_s] input
    # @param [Array<Array<Symbol, Regexp>, Terminal>] terminals
    #   Array of symbol, regexp pairs used to match terminals.
    #   If the symbol is nil, it defines a Regexp to match string terminals.
    # @param [Hash{Symbol => Object}] options
    # @option options [Regexp] :whitespace (WS)
    #   Regular expression matching whitespace between tokens
    # @option options [Regexp] :comment (COMMENT)
    #   Regular expression matching single-line comments
    def initialize(input = nil, terminals = nil, options = {})
      @options    = options.dup
      @whitespace = @options[:whitespace] || WS
      @comment    = @options[:comment]    || COMMENT
      # Guard against a nil `terminals` argument so the intended Lexer::Error
      # is raised below, rather than a NoMethodError from nil.map.
      @terminals  = (terminals || []).map do |term|
        term.is_a?(Array) ? Terminal.new(*term) : term
      end
      raise Error, "Terminal patterns not defined" if @terminals.empty?

      @lineno  = 1
      @scanner = Scanner.new(input)
    end

    ##
    # Any additional options for the lexer.
    #
    # @return [Hash]
    attr_reader :options

    ##
    # The current input string being processed.
    #
    # @return [String]
    attr_accessor :input

    ##
    # The current line number (one-based).
    #
    # @return [Integer]
    attr_reader :lineno

    ##
    # Returns `true` if the input string is lexically valid.
    #
    # To be considered valid, the input string must contain more than zero
    # terminals, and must not contain any invalid terminals.
    #
    # @return [Boolean]
    def valid?
      begin
        !count.zero?
      rescue Error
        false
      end
    end

    ##
    # Enumerates each token in the input string.
    #
    # @yield  [token]
    # @yieldparam [Token] token
    # @return [Enumerator]
    def each_token(&block)
      if block_given?
        while token = shift
          yield token
        end
      end
      enum_for(:each_token)
    end
    alias_method :each, :each_token

    ##
    # Returns first token in input stream
    #
    # @return [Token]
    def first
      return nil unless scanner

      @first ||= begin
        {} while !scanner.eos? && skip_whitespace
        return @scanner = nil if scanner.eos?

        token = match_token
        if token.nil?
          lexeme = (scanner.rest.split(/#{@whitespace}|#{@comment}/).first rescue nil) || scanner.rest
          raise Error.new("Invalid token #{lexeme[0..100].inspect}",
            :input => scanner.rest[0..100], :token => lexeme, :lineno => lineno)
        end
        token
      end
    rescue ArgumentError, Encoding::CompatibilityError => e
      raise Error.new(e.message,
        :input => (scanner.rest[0..100] rescue '??'), :token => lexeme, :lineno => lineno)
    rescue Error
      raise
    rescue
      STDERR.puts "Expected ArgumentError, got #{$!.class}"
      raise
    end

    ##
    # Returns first token and shifts to next
    #
    # @return [Token]
    def shift
      cur = first
      @first = nil
      cur
    end

    ##
    # Skip input until a token is matched
    #
    # @return [Token]
    def recover
      until scanner.eos? || (tok = match_token)
        if scanner.skip_until(@whitespace).nil? # Skip past current "token"
          # No whitespace at the end; must be at the end of the string
          scanner.terminate
        else
          skip_whitespace
        end
      end
      # match_token consumed the token it found; put it back so #first re-scans it
      scanner.unscan if tok
      first
    end
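
    # A minimal end-to-end sketch; the terminal names and patterns below are
    # illustrative assumptions, not defaults shipped with the library:
    #
    #   terminals = [
    #     [:INTEGER, /\d+/],
    #     [nil,      /[+*]/]   # a nil type matches string terminals, e.g. operators
    #   ]
    #   EBNF::LL1::Lexer.tokenize("1 + 2", terminals).each_token do |token|
    #     puts token.inspect   # "1"(INTEGER), then "+", then "2"(INTEGER)
    #   end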
    protected

    # @return [StringScanner]
    attr_reader :scanner

    ##
    # Skip whitespace or comments, as defined through input options or defaults
    def skip_whitespace
      # skip all white space, but keep track of the current line number
      while !scanner.eos?
        if matched = scanner.scan(@whitespace)
          @lineno += matched.count("\n")
        elsif scanner.scan(@comment)
          # comments are consumed and otherwise ignored
        else
          return
        end
      end
    end

    ##
    # Return the matched token.
    #
    # If the token was matched with a case-insensitive regexp,
    # track this with the resulting {Token}, so that comparisons
    # with that token are also case insensitive
    #
    # @return [Token]
    def match_token
      @terminals.each do |term|
        #STDERR.puts "match[#{term.type}] #{scanner.rest[0..100].inspect} against #{term.regexp.inspect}"
        #if term.type == :STRING_LITERAL_SINGLE_QUOTE
        if matched = scanner.scan(term.regexp)
          #STDERR.puts "  matched #{term.type.inspect}: #{matched.inspect}"
          return token(term.type, term.canonicalize(matched))
        end
      end
      nil
    end

    # Terminal class, representing the terminal identifier and
    # matching regular expression. Optionally, a Terminal may include
    # a map to turn case-insensitively matched terminals into their
    # canonical form
    class Terminal
      attr_reader :type
      attr_reader :regexp

      # @param [Symbol, nil] type
      # @param [Regexp] regexp
      # @param [Hash{Symbol => Object}] options
      # @option options [Hash{String => String}] :map ({})
      #   A mapping from terminals, in lower-case form, to
      #   their canonical value
      # @option options [Boolean] :unescape
      #   Cause strings and codepoints to be unescaped.
      def initialize(type, regexp, options = {})
        @type, @regexp, @options = type, regexp, options
        @map = options.fetch(:map, {})
      end

      # Map a terminal to its canonical form. If there is no
      # map, `value` is returned. `value` is unescaped if there
      # is no canonical mapping and the `:unescape` option is set.
      #
      # @param [String] value
      #   value to canonicalize
      # @return [String]
      def canonicalize(value)
        @map.fetch(value.downcase, unescape(value))
      end

      def ==(other)
        case other
        when Array
          @type == other.first && @regexp == other.last
        when Terminal
          @type == other.type && @regexp == other.regexp
        end
      end

      protected

      # Perform string and codepoint unescaping if defined for this terminal
      # @param [String] string
      # @return [String]
      def unescape(string)
        if @options[:unescape]
          Lexer.unescape_string(Lexer.unescape_codepoints(string))
        else
          string
        end
      end
    end
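
    # For illustration, a hypothetical case-insensitive keyword terminal whose
    # matches are canonicalized through the :map option (the terminal name and
    # mapping here are assumptions, not library defaults):
    #
    #   term = Terminal.new(:PREFIX, /prefix/i, :map => {"prefix" => "PREFIX"})
    #   term.canonicalize("Prefix")  #=> "PREFIX"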
    protected

    ##
    # Constructs a new token object annotated with the current line number.
    #
    # The parser relies on the type being a symbolized URI and the value being
    # a string, if there is no type. If there is a type, then the value takes
    # on the native representation appropriate for that type.
    #
    # @param [Symbol] type
    # @param [String] value
    #   Matched string value
    # @param [Hash{Symbol => Object}] options
    # @return [Token]
    def token(type, value, options = {})
      Token.new(type, value, options.merge(:lineno => lineno))
    end

    ##
    # Represents a lexer token.
    #
    # @example Creating a new token
    #   token = EBNF::LL1::Lexer::Token.new(:LANGTAG, "en")
    #   token.type   #=> :LANGTAG
    #   token.value  #=> "en"
    #
    # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
    class Token
      ##
      # The token's symbol type.
      #
      # @return [Symbol]
      attr_reader :type

      ##
      # The token's value.
      #
      # @return [String]
      attr_reader :value

      ##
      # The line number where the token was encountered.
      #
      # @return [Integer]
      attr_reader :lineno

      ##
      # Any additional options for the token.
      #
      # @return [Hash]
      attr_reader :options

      ##
      # Initializes a new token instance.
      #
      # @param [Symbol] type
      # @param [String] value
      # @param [Hash{Symbol => Object}] options
      # @option options [Integer] :lineno (nil)
      def initialize(type, value, options = {})
        @type    = type.to_s.to_sym if type
        @value   = value.to_s
        @options = options.dup
        @lineno  = @options.delete(:lineno)
      end

      ##
      # Returns the attribute named by `key`.
      #
      # @param [Symbol] key
      # @return [Object]
      def [](key)
        key = key.to_s.to_sym unless key.is_a?(Integer) || key.is_a?(Symbol)
        case key
        when 0, :type  then @type
        when 1, :value then @value
        else nil
        end
      end

      ##
      # Returns `true` if the given `value` matches either the type or value
      # of this token.
      #
      # @example Matching using the symbolic type
      #   EBNF::LL1::Lexer::Token.new(:NIL) === :NIL  #=> true
      #
      # @example Matching using the string value
      #   EBNF::LL1::Lexer::Token.new(nil, "{") === "{"  #=> true
      #
      # @param [Symbol, String] value
      # @return [Boolean]
      def ===(value)
        case value
        when Symbol
          value == @type
        when ::String
          @value == (@options[:case_insensitive] ? value.to_s.downcase : value.to_s)
        else
          value == @value
        end
      end

      ##
      # Returns a hash table representation of this token.
      #
      # @return [Hash]
      def to_hash
        {:type => @type, :value => @value}
      end

      ##
      # Readable version of token
      def to_s
        @type ? @type.inspect : @value
      end

      ##
      # Returns type, if not nil, otherwise value
      def representation
        @type ? @type : @value
      end

      ##
      # Returns an array representation of this token.
      #
      # @return [Array]
      def to_a
        [@type, @value]
      end

      ##
      # Returns a developer-friendly representation of this token.
      #
      # @return [String]
      def inspect
        "#{@value.inspect}#{'(' + @type.to_s + ')' if @type}"
      end
    end # class Token

    ##
    # Raised for errors during lexical analysis.
    #
    # @example Raising a lexer error
    #   raise EBNF::LL1::Lexer::Error.new(
    #     "invalid token '%' on line 10",
    #     :input => query, :token => '%', :lineno => 10)
    #
    # @see http://ruby-doc.org/core/classes/StandardError.html
    class Error < StandardError
      ##
      # The input string associated with the error.
      #
      # @return [String]
      attr_reader :input

      ##
      # The invalid token which triggered the error.
      #
      # @return [String]
      attr_reader :token

      ##
      # The line number where the error occurred.
      #
      # @return [Integer]
      attr_reader :lineno

      ##
      # Initializes a new lexer error instance.
      #
      # @param [String, #to_s] message
      # @param [Hash{Symbol => Object}] options
      # @option options [String]  :input  (nil)
      # @option options [String]  :token  (nil)
      # @option options [Integer] :lineno (nil)
      def initialize(message, options = {})
        @input  = options[:input]
        @token  = options[:token]
        @lineno = options[:lineno]
        super(message.to_s)
      end
    end # class Error

    unless "".respond_to?(:force_encoding)
      # Compatibility shim for Ruby 1.8, which lacks the 1.9 Encoding module
      module Encoding
        class CompatibilityError < StandardError; end
      end
    end
  end # class Lexer
end # module EBNF::LL1