#!/usr/bin/env ruby -w
# encoding: UTF-8

#--
# Simple Declarative Language (SDL) for Ruby
# Copyright 2005 Ikayzo, inc.
#
# This program is free software. You can distribute or modify it under the
# terms of the GNU Lesser General Public License version 2.1 as published by
# the Free Software Foundation.
#
# This program is distributed AS IS and WITHOUT WARRANTY OF ANY KIND,
# INCLUDING MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, contact the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#++

module SDL4R

  require 'strscan'

  require 'sdl4r/sdl_parse_error'
  require 'sdl4r/token'

  # Tokenizer for SDL.
  #
  # As Ruby's standard IO libraries do not provide much low-level access, this class works on
  # lines, which means that some token types reflect this line-oriented tokenizing.
  #
  # The other solution would be to implement a proper tokenizer natively, which I don't feel like
  # doing right now.
  #
  #--
  # FIXME: implement a way of stacking the errors without raising an error immediately
  #++
  #
  class Tokenizer # :nodoc: all

    class Matcher # :nodoc: all

      def initialize(token_type, regex, options = {}, &block)
        options = {
          :next_mode => nil,
          :push_back_eol => false,
          :error => nil,
        }.merge(options)

        @token_type = token_type
        @regex = regex
        @next_mode = options[:next_mode]
        @push_back_eol = options[:push_back_eol]
        @error = options[:error]

        if block_given?
          instance_eval(&block)
        end
      end

      attr_reader :token_type, :regex, :next_mode

      # Indicates whether the matched token tends to include the end-of-line character and
      # whether that character should be pushed back in such cases.
      attr_reader :push_back_eol

      # If +nil+, this Matcher is a normal one; otherwise, it is meant to detect errors and this
      # returns the corresponding error message.
      attr_reader :error

      # Called when a token is found in order to remove meaningless characters, etc.
      def process_token(token)
        token
      end

      self.freeze
    end
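
    # The matchers declared below all follow one pattern: the optional block passed to
    # Matcher.new is +instance_eval+'ed on the new instance, so it can override #process_token
    # to strip delimiters from the raw match. A simplified, illustrative sketch (not one of the
    # actual matchers registered below):
    #
    #   Matcher.new(:ONE_LINE_COMMENT, /\A#.*\Z/, :push_back_eol => true) do
    #     def process_token(token)
    #       token.gsub!(/\A#/, "") # drop the leading comment marker
    #     end
    #   end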

    # A string used at the end of each line in order to trigger the EOL token.
    # @private
    @@EOL_STRING = "\n"

    # @private
    @@matcher_sets = {
      :top => [
        Matcher.new(:EOL, /\A\n/),
        Matcher.new(:WHITESPACE, /\A\s+/, :push_back_eol => true),
        Matcher.new(:SEMICOLON, /\A;/),
        Matcher.new(:COLON, /\A:/),
        Matcher.new(:EQUAL, /\A=/),
        Matcher.new(:BLOCK_START, /\A\{/),
        Matcher.new(:BLOCK_END, /\A\}/),
        Matcher.new(:BOOLEAN, /\Atrue|false|on|off/),
        Matcher.new(:NULL, /\Anull/),

        Matcher.new(:ONE_LINE_COMMENT, /\A(?:#|--|\/\/).*\Z/, :push_back_eol => true) do
          def process_token(token)
            token.gsub!(/\A(?:#|--|\/\/)/, "")
          end
        end,

        Matcher.new(:INLINE_COMMENT, /\A\/\*[\s\S]*?\*\//) do
          def process_token(token)
            token.gsub!(/\A\/\*|\*\/\Z/, "")
          end
        end,
        Matcher.new(
            :MULTILINE_COMMENT_START,
            /\A\/\*.*\Z/,
            :next_mode => :multiline_comment,
            :push_back_eol => true) do
          def process_token(token)
            token.gsub!(/\A\/\*/, "")
          end
        end,

        Matcher.new(:CHARACTER, /\A'(?:[^\\']|\\.)'/) do
          def process_token(token)
            token.gsub!(/\A'|'\Z/, "")
          end
        end,

        Matcher.new(:INLINE_BACKQUOTE_STRING, /\A`[^`]*`/, :is_node => true) do
          def process_token(token)
            token.gsub!(/\A`|`\Z/, "")
          end
        end,
        Matcher.new(:INLINE_DOUBLE_QUOTE_STRING, /\A"(?:[^\\"]|\\.)*"/) do
          def process_token(token)
            token.gsub!(/\A"|"\Z/, "")
          end
        end,
        Matcher.new(
            :MULTILINE_BACKQUOTE_STRING_START,
            /\A`[^`]*\Z/,
            :next_mode => :multiline_backquote_string,
            :is_node => true) do
          def process_token(token)
            token.gsub!(/\A`/, "")
          end
        end,
        Matcher.new(
            :MULTILINE_DOUBLE_QUOTE_STRING_START,
            /\A"(?:[^\\"]|\\\S)*\\\s*\Z/,
            :next_mode => :multiline_double_quote_string,
            :push_back_eol => true) do
          def process_token(token)
            token.gsub!(/\A"|\\\s*\Z/, "")
          end
        end,

        Matcher.new(:INLINE_BINARY, /\A\[[\sA-Za-z0-9\/=\+]*\]/) do
          def process_token(token)
            token.gsub!(/\A\[|\s+|\]\Z/, "")
          end
        end,
        Matcher.new(
            :MULTILINE_BINARY_START,
            /\A\[[\sA-Za-z0-9\/=\+]*\Z/,
            :next_mode => :multiline_binary,
            :push_back_eol => true) do
          def process_token(token)
            token.gsub!(/\A\[|\s+/, "")
          end
        end,

        Matcher.new(
          :IDENTIFIER,
          /\A#{SDL4R::IDENTIFIER_START_CLASS}#{SDL4R::IDENTIFIER_PART_CLASS}*/),

        Matcher.new(:DATE, /\A-?\d+\/\d+\/\d+/, :is_node => true),
        Matcher.new(
          :TIME_OR_TIMESPAN,
          /\A(?:-?\d+d:)?-?\d+:\d+(?::\d+(?:\.\d+)?)?
            (?:-[a-zA-Z\/]+(?:[+-]\d+(?::\d+)?)?)?/ix),

        Matcher.new(:INTEGER, /\A[\+\-]?\d+L/i), # takes precedence over floats

        # the float regex is meant to also catch badly formed floats like "1.2.2" (otherwise, we
        # would not detect this kind of error easily)
        Matcher.new(
          :FLOAT,
          /\A[\+\-]?(?:\d+(?:F|D|BD)|\d*\.[\d\.]+(?:F|D|BD)?)/i),

        Matcher.new(:INTEGER, /\A[\+\-]?\d+L?/i),

        Matcher.new(:LINE_CONTINUATION, /\A\\\s*\Z/), # outside of comments, strings, etc.

        Matcher.new(
          :UNCLOSED_DOUBLE_QUOTE_STRING,
          /\A"(?:[^\\"]|\\\S)*/,
          :error => "unclosed string"),
      ],
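
      # The following matcher sets are only active while the Tokenizer is in the corresponding
      # mode (see #set_mode); their *_END matchers switch back to the :top mode via
      # :next_mode => :top.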

      :multiline_comment => [
        Matcher.new(:EOL, /\A\n/),
        Matcher.new(:MULTILINE_COMMENT_END, /\A[\s\S]*?\*\//, :next_mode => :top) do
          def process_token(token)
            token.gsub!(/\*\/\Z/, "")
          end
        end,
        Matcher.new(:MULTILINE_COMMENT_PART, /\A.+\Z/, :push_back_eol => true)
      ],

      :multiline_backquote_string => [
        Matcher.new(:EOL, /\A\n/),
        Matcher.new(:MULTILINE_BACKQUOTE_STRING_END, /\A[^`]*`/, :next_mode => :top) do
          def process_token(token)
            token.gsub!(/`\Z/, "")
          end
        end,
        Matcher.new(:MULTILINE_BACKQUOTE_STRING_PART, /\A[^`]*\Z/)
      ],

      :multiline_double_quote_string => [
        Matcher.new(:EOL, /\A\n/),
        Matcher.new(
            :MULTILINE_DOUBLE_QUOTE_STRING_END,
            /\A(?:[^\\"]|\\\S)*"/,
            :next_mode => :top) do
          def process_token(token)
            token.gsub!(/\A\s+|"\Z/, "")
          end
        end,
        Matcher.new(
            :MULTILINE_DOUBLE_QUOTE_STRING_PART,
            /\A(?:[^\\"]|\\\S)*\\\s*\Z/,
            :push_back_eol => true) do
          def process_token(token)
            token.gsub!(/\A\s+|\\\s*\Z/, "")
          end
        end,
        Matcher.new(
          :UNCLOSED_DOUBLE_QUOTE_STRING,
          /\A(?:[^\\"]|\\\S)*\Z/,
          :error => "unclosed multiline string")
      ],

      :multiline_binary => [
        Matcher.new(:EOL, /\A\n/),
        Matcher.new(:MULTILINE_BINARY_END, /\A[\sA-Za-z0-9\/=\+]*\]/, :next_mode => :top) do
          def process_token(token)
            token.gsub!(/\s+|\]\Z/, "")
          end
        end,
        Matcher.new(:MULTILINE_BINARY_PART, /\A[\sA-Za-z0-9\/=\+]*\Z/, :push_back_eol => true) do
          def process_token(token)
            token.gsub!(/\s+/, "")
          end
        end
      ]
    }

    # @param [IO] io the IO to read from
    # @raise [ArgumentError] if +io+ is +nil+.
    def initialize(io)
      raise ArgumentError, 'io' unless io

      @io = io
      @scanner = nil
      @line_no = -1
      set_mode(:top)
      @token = nil
      @pushed_back_token = nil
      @previous_token = nil
      @token_pool = [] # a pool of reusable Tokens
    end

    # @return [String] text of the current token.
    def token
      @token.text
    end

    # @return [Symbol] type of the current token (e.g. +:WHITESPACE+)
    def token_type
      @token.type
    end

    # @return [Integer] line number of the current token (only meant for error tracking for the
    #   time being)
    def token_line_no
      @token.line_no
    end

    # @return [Integer] position (column) of the current token (only meant for error tracking
    #   for the time being)
    def token_pos
      @token.pos
    end

    # Sets the current working mode of this Tokenizer.
    #
    # @param [Symbol] mode the new mode:
    #   * +:top+ (normal default mode)
    #   * +:multiline_comment+
    #   * +:multiline_backquote_string+
    #   * +:multiline_double_quote_string+
    #   * +:multiline_binary+
    #
    # @return [self]
    # @raise [ArgumentError] if the given mode is unknown.
    #
    def set_mode(mode)
      ms = @@matcher_sets[mode]
      raise ArgumentError, "unknown tokenizer mode #{mode.to_s}" unless ms
      @matcher_set = ms
      self
    end

    # Reads a token from the pushed back ones.
    def read_pushed_back
      record_previous_token

      # Set the current state
      @token = @pushed_back_token
      @pushed_back_token = nil

      if @token.matcher
        next_mode = @token.matcher.next_mode
        set_mode(next_mode) if next_mode
      end
    end
    private :read_pushed_back
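
    # Typical read loop (illustrative sketch; the file name and the way the IO is opened are
    # assumptions, not part of this class):
    #
    #   tokenizer = Tokenizer.new(File.open("example.sdl"))
    #   while tokenizer.read
    #     next if tokenizer.token_type == :WHITESPACE
    #     puts "#{tokenizer.token_type} (line #{tokenizer.token_line_no}): #{tokenizer.token.inspect}"
    #   end
    #
    # #read returns the current token type, then an :EOF token once the input is exhausted, and
    # +nil+ on the following call, which ends the loop above.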

    # Goes to the next token.
    #
    # @return [Symbol] +nil+ if eof has been reached, the current token type otherwise.
    #
    def read
      if @pushed_back_token
        read_pushed_back
        return @token.type
      end

      record_previous_token
      @token = nil

      if @line_no < 0 or @scanner.eos?
        # fetch a line if beginning or at end of line
        unless read_line
          if previous_token_type == :EOF
            return nil
          else
            @token = Token.new(nil, :EOF, nil, @line_no, @scanner ? @scanner.pos : 0)
            return @token.type
          end
        end
      end

      pos = @scanner.pos

      @matcher_set.each do |matcher|
        if token_text = @scanner.scan(matcher.regex)
          error = matcher.error

          if error
            raise_parse_error(error)
          else
            set_matcher_token(matcher, token_text, pos)

            if matcher.push_back_eol and @scanner.eos?
              # push the end-of-line character back so that a separate EOL token is still emitted
              @scanner.pos = @scanner.pos - @@EOL_STRING.size
            end
          end

          break
        end
      end

      raise_unexpected_char unless @token

      return @token.type
    end

    def record_previous_token
      @previous_token = @token
    end
    private :record_previous_token

    # Sets the current Token using the Matcher that detected it
    def set_matcher_token(matcher, token_text, pos)
      @token = Token.new(
        matcher.process_token(token_text), matcher.token_type, matcher, @line_no, pos)

      next_mode = matcher.next_mode
      set_mode(next_mode) if next_mode
    end
    private :set_matcher_token

    # @return [Symbol] the type of the previous Token.
    def previous_token_type
      @previous_token ? @previous_token.type : nil
    end

    # Unreads the current token.
    # The previous token becomes the current one.
    #
    # @raise if #unread has been called twice in a row (i.e. without a call to #read in between)
    def unread
      if @pushed_back_token
        raise "only one token can be pushed back"
      else
        @pushed_back_token = @token
        @token = @previous_token
        # We have no memory of what happened before
        @previous_token = nil

        if @token.matcher
          next_mode = @token.matcher.next_mode
          set_mode(next_mode) if next_mode
        end
      end
    end

    # Raises a standard "unexpected character" error.
    def raise_unexpected_char(msg = "unexpected char")
      raise_parse_error "#{msg}: <#{@scanner.peek(1)}>"
    end

    def raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos)
      line = (line_no == @line_no) ? @scanner.string : nil
      raise SdlParseError.new(msg, line_no + 1, pos + 1, line)
    end

    private

    # Reads the next line of the IO.
    # All lines are normalized to end with a single '\n'.
    #
    # @return [String] the newly read line (+nil+ once the end of the IO is reached).
    #
    def read_line
      line = @io.gets

      if line
        # Clean the line of its end characters
        line.gsub!(/(?:\n|\r\n|\r)\Z/, @@EOL_STRING)
        @line_no += 1

        if @scanner
          @scanner.string = line
        else
          @scanner = StringScanner.new(line)
        end
      end

      line
    end

  end

end
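
# Example of one-token lookahead with Tokenizer#unread (illustrative sketch; the handler methods
# are hypothetical and not part of this library):
#
#   tokenizer.read                      # current token, e.g. an :IDENTIFIER
#   name = tokenizer.token
#   tokenizer.read                      # peek at the following token
#   if tokenizer.token_type == :EQUAL
#     read_attribute_value(tokenizer, name)   # "name=value" is an SDL attribute
#   else
#     tokenizer.unread                  # push the token back; the next #read returns it again
#     read_plain_value(tokenizer, name)
#   end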