module RDF::LL1
  require 'rdf/ll1/scanner' unless defined?(Scanner)

  ##
  # A lexical analyzer
  #
  # @example Tokenizing a Turtle string
  #   terminals = [
  #     [:BLANK_NODE_LABEL, %r(_:(#{PN_LOCAL}))],
  #     ...
  #   ]
  #   ttl = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> ."
  #   lexer = RDF::LL1::Lexer.tokenize(ttl, terminals)
  #   lexer.each_token do |token|
  #     puts token.inspect
  #   end
  #
  # @example Tokenizing and returning a token stream
  #   lexer = RDF::LL1::Lexer.tokenize(...)
  #   while :some-condition
  #     token = lexer.first # Get the current token
  #     token = lexer.shift # Get the current token and shift to the next
  #   end
  #
  # @example Handling error conditions
  #   begin
  #     RDF::LL1::Lexer.tokenize(query, terminals).each_token { |token| ... }
  #   rescue RDF::LL1::Lexer::Error => error
  #     warn error.inspect
  #   end
  #
  # @see http://en.wikipedia.org/wiki/Lexical_analysis
  class Lexer
    include Enumerable

    ESCAPE_CHARS = {
      '\\t'   => "\t",  # \u0009 (tab)
      '\\n'   => "\n",  # \u000A (line feed)
      '\\r'   => "\r",  # \u000D (carriage return)
      '\\b'   => "\b",  # \u0008 (backspace)
      '\\f'   => "\f",  # \u000C (form feed)
      '\\"'   => '"',   # \u0022 (quotation mark, double quote mark)
      "\\'"   => '\'',  # \u0027 (apostrophe-quote, single quote mark)
      '\\\\'  => '\\'   # \u005C (backslash)
    }.freeze
    ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/ # \uXXXX
    ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/ # \UXXXXXXXX
    ECHAR        = /\\./                     # More liberal unescaping
    UCHAR        = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/
    COMMENT      = /#.*/
    WS           = / |\t|\r|\n/m

    ML_START     = /\'\'\'|\"\"\"/ # Beginning of terminals that may span lines

    ##
    # @!attribute whitespace
    # @return [Regexp] defines whitespace, defaults to WS
    attr_reader :whitespace

    ##
    # @!attribute comment
    # @return [Regexp] defines single-line comment, defaults to COMMENT
    attr_reader :comment

    ##
    # Returns a copy of the given `string` with all `\uXXXX` and
    # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
    # unescaped UTF-8 character counterparts.
    #
    # The substitution is carried out on a binary duplicate of the input and
    # the result is tagged as UTF-8 afterwards. Substituting binary
    # replacement bytes directly into a string that already contains
    # non-ASCII characters raises `Encoding::CompatibilityError`.
    #
    # @param  [String] string
    # @return [String] a new string; the argument is not modified
    # @see    http://www.w3.org/TR/rdf-sparql-query/#codepointEscape
    def self.unescape_codepoints(string)
      binary = string.dup
      binary.force_encoding(Encoding::ASCII_8BIT) if binary.respond_to?(:force_encoding) # Ruby 1.9+

      # Decode \uXXXX and \UXXXXXXXX code points:
      decoded = binary.gsub(UCHAR) do |escape|
        # Drop the leading "\u" / "\U"; what remains is the hexadecimal code point
        char = [escape[2..-1].hex].pack('U*')
        char.respond_to?(:force_encoding) ? char.force_encoding(Encoding::ASCII_8BIT) : char
      end

      decoded.force_encoding(Encoding::UTF_8) if decoded.respond_to?(:force_encoding) # Ruby 1.9+
      decoded
    end

    ##
    # Returns a copy of the given `input` string with all string escape
    # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
    # character counterparts.
    #
    # A backslash followed by a character that is not listed in
    # {ESCAPE_CHARS} is replaced by that character alone.
    #
    # @param  [String] input
    # @return [String]
    # @see    http://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
    def self.unescape_string(input)
      input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1] }
    end

    ##
    # Tokenizes the given `input` string or stream.
    #
    # @param  [String, #to_s] input
    # @param  [Array<Array<Symbol, Regexp>>] terminals
    #   Array of symbol, regexp pairs used to match terminals.
    #   If the symbol is nil, it defines a Regexp to match string terminals.
    # @param  [Hash{Symbol => Object}] options
    # @yield  [lexer]
    # @yieldparam [Lexer] lexer
    # @return [Lexer] the new lexer, or the block's result when a block is given
    # @raise  [Lexer::Error] if no terminal patterns are given
    def self.tokenize(input, terminals, options = {}, &block)
      lexer = self.new(input, terminals, options)
      block_given? ? block.call(lexer) : lexer
    end

    ##
    # Initializes a new lexer instance.
    #
    # @param  [String, #to_s] input
    # @param  [Array<Array<Symbol, Regexp>>] terminals
    #   Array of symbol, regexp pairs used to match terminals.
    #   If the symbol is nil, it defines a Regexp to match string terminals.
    # @param  [Hash{Symbol => Object}] options
    # @option options [Regexp] :whitespace (WS)
    # @option options [Regexp] :comment (COMMENT)
    # @option options [Array<Symbol>] :unescape_terms ([])
    #   Terminal symbols whose matched text has string and codepoint
    #   escapes decoded before the token is created
    # @raise  [Lexer::Error] if `terminals` is nil or empty
    def initialize(input = nil, terminals = nil, options = {})
      @options        = options.dup
      @whitespace     = @options[:whitespace]     || WS
      @comment        = @options[:comment]        || COMMENT
      @unescape_terms = @options[:unescape_terms] || []
      @terminals      = terminals

      raise Error, "Terminal patterns not defined" unless @terminals && @terminals.length > 0

      @lineno = 1
      @scanner = Scanner.new(input) do |string|
        string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) # Ruby 1.9+
        string
      end
    end

    ##
    # Any additional options for the lexer.
    #
    # @return [Hash]
    attr_reader :options

    ##
    # The current input string being processed.
    #
    # NOTE: nothing in this class assigns it; it is nil unless set through
    # the writer.
    #
    # @return [String]
    attr_accessor :input

    ##
    # The current line number (one-based).
    #
    # Starts at 1 and is advanced for each newline consumed while skipping
    # whitespace.
    #
    # @return [Integer]
    attr_reader :lineno

    ##
    # Returns `true` if the input string is lexically valid.
    #
    # To be considered valid, the input string must contain more than zero
    # terminals, and must not contain any invalid terminals.
    #
    # Counting the tokens consumes the input.
    #
    # @return [Boolean]
    def valid?
      !count.zero?
    rescue Error
      false
    end

    ##
    # Enumerates each token in the input string.
    #
    # @yield  [token]
    # @yieldparam [Token] token
    # @return [Enumerator]
    def each_token(&block)
      if block_given?
        while token = shift
          yield token
        end
      end
      enum_for(:each_token)
    end
    alias_method :each, :each_token

    ##
    # Returns first token in input stream
    #
    # The token is cached until {#shift} is called. Once the end of input is
    # reached the scanner is discarded and nil is returned from then on.
    #
    # @return [Token] the current token, or nil at end of input
    # @raise  [Lexer::Error] if no terminal matches, or if scanning fails
    #   with an argument or encoding error
    def first
      return nil unless scanner

      @first ||= begin
        {} while !scanner.eos? && skip_whitespace
        return @scanner = nil if scanner.eos?

        token = match_token

        if token.nil?
          # Report the text up to the next whitespace or comment as the offending lexeme
          lexme = (scanner.rest.split(/#{@whitespace}|#{@comment}/).first rescue nil) || scanner.rest
          raise Error.new("Invalid token #{lexme[0..100].inspect}",
            :input => scanner.rest[0..100], :token => lexme, :lineno => lineno)
        end

        token
      end
    rescue ArgumentError, Encoding::CompatibilityError => e
      # lineno is already one-based, so it is reported as is
      raise Error.new("#{e.message} on line #{lineno}",
        :input => (scanner.rest[0..100] rescue '??'), :token => lexme, :lineno => lineno)
    rescue Error
      raise
    rescue
      STDERR.puts "Expected ArgumentError, got #{$!.class}"
      raise
    end

    ##
    # Returns first token and shifts to next
    #
    # @return [Token]
    def shift
      cur = first
      @first = nil
      cur
    end

    ##
    # Skip input until a token is matched
    #
    # @return [Token] the next token that scans, or nil if the end of input
    #   is reached first
    def recover
      # The scanner is discarded once the end of input has been reached
      return nil if scanner.nil?

      until scanner.eos? do
        begin
          shift
          return first
        rescue Error, ArgumentError
          # Ignore errors until something scans, or EOS.
          scanner.pos = scanner.pos + 1
        end
      end
    end

  protected

    # @return [StringScanner]
    attr_reader :scanner

    # Perform string and codepoint unescaping
    # @param [String] string
    # @return [String]
    def unescape(string)
      self.class.unescape_string(self.class.unescape_codepoints(string))
    end

    ##
    # Skip whitespace or comments, as defined through input options or defaults
    #
    # @return [nil]
    def skip_whitespace
      # skip all white space, but keep track of the current line number
      while !scanner.eos?
        if matched = scanner.scan(@whitespace)
          @lineno += matched.count("\n")
        elsif scanner.scan(@comment)
          # comment consumed; nothing to record
        else
          return
        end
      end
    end

    ##
    # Return the matched token
    #
    # Terminals are tried in the order given; the first pattern that scans
    # at the current position wins.
    #
    # @return [Token] the matched token, or nil if no terminal matches
    def match_token
      @terminals.each do |(term, regexp)|
        if matched = scanner.scan(regexp)
          matched = unescape(matched) if @unescape_terms.include?(term)
          return token(term, matched)
        end
      end
      nil
    end

    ##
    # Constructs a new token object annotated with the current line number.
    #
    # The parser relies on the type being a symbolized URI and the value being
    # a string, if there is no type. If there is a type, then the value takes
    # on the native representation appropriate for that type.
    #
    # @param  [Symbol] type
    # @param  [String] value
    #   The matched (and possibly unescaped) text
    # @return [Token]
    def token(type, value)
      Token.new(type, value, :lineno => lineno)
    end

    ##
    # Represents a lexer token.
    #
    # @example Creating a new token
    #   token = RDF::LL1::Lexer::Token.new(:LANGTAG, "en")
    #   token.type   #=> :LANGTAG
    #   token.value  #=> "en"
    #
    # @see http://en.wikipedia.org/wiki/Lexical_analysis#Token
    class Token
      ##
      # Initializes a new token instance.
      #
      # @param  [Symbol]                 type
      # @param  [String]                 value
      # @param  [Hash{Symbol => Object}] options
      # @option options [Integer]        :lineno (nil)
      def initialize(type, value, options = {})
        @type, @value = (type ? type.to_s.to_sym : nil), value
        @options = options.dup
        # :lineno is held separately and removed from the remaining options
        @lineno  = @options.delete(:lineno)
      end

      ##
      # The token's symbol type.
      #
      # @return [Symbol]
      attr_reader :type

      ##
      # The token's value.
      #
      # @return [String]
      attr_reader :value

      ##
      # The line number where the token was encountered.
      #
      # @return [Integer]
      attr_reader :lineno

      ##
      # Any additional options for the token.
      #
      # @return [Hash]
      attr_reader :options

      ##
      # Returns the attribute named by `key`.
      #
      # Accepts `0` or `:type` for the type and `1` or `:value` for the
      # value; any other key yields nil.
      #
      # @param  [Symbol] key
      # @return [Object]
      def [](key)
        key = key.to_s.to_sym unless key.is_a?(Integer) || key.is_a?(Symbol)
        case key
          when 0, :type  then @type
          when 1, :value then @value
          else nil
        end
      end

      ##
      # Returns `true` if the given `value` matches either the type or value
      # of this token.
      #
      # @example Matching using the symbolic type
      #   RDF::LL1::Lexer::Token.new(:NIL) === :NIL     #=> true
      #
      # @example Matching using the string value
      #   RDF::LL1::Lexer::Token.new(nil, "{") === "{"  #=> true
      #
      # @param  [Symbol, String] value
      # @return [Boolean]
      def ===(value)
        case value
          when Symbol   then value == @type
          when ::String then value.to_s == @value.to_s
          else value == @value
        end
      end

      ##
      # Returns a hash table representation of this token.
      #
      # @return [Hash]
      def to_hash
        {:type => @type, :value => @value}
      end

      ##
      # Readable version of token
      def to_s
        @type ? @type.inspect : @value
      end

      ##
      # Returns type, if not nil, otherwise value
      def representation
        @type ? @type : @value
      end

      ##
      # Returns an array representation of this token.
      #
      # @return [Array]
      def to_a
        [@type, @value]
      end

      ##
      # Returns a developer-friendly representation of this token.
      #
      # @return [String]
      def inspect
        to_hash.inspect
      end
    end # class Token

    ##
    # Raised for errors during lexical analysis.
    #
    # @example Raising a lexer error
    #   raise RDF::LL1::Lexer::Error.new(
    #     "invalid token '%' on line 10",
    #     :input => query, :token => '%', :lineno => 10)
    #
    # @see http://ruby-doc.org/core/classes/StandardError.html
    class Error < StandardError
      ##
      # The input string associated with the error.
      #
      # @return [String]
      attr_reader :input

      ##
      # The invalid token which triggered the error.
      #
      # @return [String]
      attr_reader :token

      ##
      # The line number where the error occurred.
      #
      # @return [Integer]
      attr_reader :lineno

      ##
      # Initializes a new lexer error instance.
      #
      # @param  [String, #to_s]          message
      # @param  [Hash{Symbol => Object}] options
      # @option options [String]         :input  (nil)
      # @option options [String]         :token  (nil)
      # @option options [Integer]        :lineno (nil)
      def initialize(message, options = {})
        @input  = options[:input]
        @token  = options[:token]
        @lineno = options[:lineno]
        super(message.to_s)
      end
    end # class Error

    unless "".respond_to?(:force_encoding)
      # Compatibility with 1.9 Encoding
      module Encoding
        class CompatibilityError < StandardError; end
      end
    end
  end # class Lexer
end # module RDF::LL1