# The SQLTree::Tokenizer class transforms a string or stream of
# characters into an enumeration of tokens that are more appropriate for
# the SQL parser to work with.
#
# An example:
#
# >> SQLTree::Tokenizer.new.tokenize('SELECT * FROM table')
# => [:select, :all, :from, Variable('table')]
#
# The tokenize method will return an array of tokens, while
# the each_token (aliased to each) will yield every
# token one by one.
class SQLTree::Tokenizer
include Enumerable
# The keyword queue, on which keywords are placed before they are yielded
# to the parser, to enable keyword combining (e.g. NOT LIKE).
attr_reader :keyword_queue
# Creates a tokenizer with an empty keyword queue. The string to tokenize
# is not supplied here; it is handed to #tokenize later on.
def initialize # :nodoc:
@keyword_queue = []
end
# Tokenizes a complete string in one go.
#
# The string is stored on the tokenizer, the character pointer is rewound
# to just before the first character, and every token produced by the
# Enumerable machinery (which drives #each) is collected into an array.
#
# string:: the string to tokenize
#
# Returns an array holding all tokens that were yielded.
def tokenize(string)
  @string = string
  @current_char_pos = -1
  to_a
end
# Returns the character that is currently being tokenized, i.e. the value
# stored by the most recent call to next_char. It is nil until next_char
# has been called for the first time.
def current_char
@current_char
end
# Looks ahead in the input without consuming anything: neither the
# character pointer nor the current character is changed.
#
# lookahead:: how many positions past the current character to peek
#             (defaults to 1, the character directly after it).
#
# Returns whatever a one-character slice of the input at that position
# yields. For a String that is an empty string directly after the last
# character and nil further out.
def peek_char(lookahead = 1)
  target_pos = @current_char_pos + lookahead
  @string[target_pos, 1]
end
# Consumes one character: moves the character pointer one position
# forward and remembers the character found there as the current one.
#
# Returns the new current character. For a String input this is an empty
# string directly after the last character and nil once the pointer has
# moved beyond that, which is what ends the loop in each_token.
def next_char
  advanced_pos = @current_char_pos + 1
  @current_char_pos = advanced_pos
  @current_char = @string[advanced_pos, 1]
end
# Routes a freshly tokenized token either onto the keyword queue or to
# the given block.
#
# Keyword tokens are not yielded immediately: they are pushed onto the
# keyword queue so that consecutive keywords are held back together.
# Any other token first flushes the queued keywords (in the order they
# were queued) and is then yielded itself. This method only queues and
# flushes; it does not merge queued keywords into a combined token.
#
# token:: the token to queue or yield
# block:: the block to yield tokens to.
def handle_token(token, &block) # :yields: SQLTree::Token
  return keyword_queue.push(token) if token.is_a?(SQLTree::Token::Keyword)

  empty_keyword_queue!(&block)
  block.call(token)
end
# Flushes the keyword queue: every queued keyword is removed from the
# front of the queue and passed to the block, oldest first, until the
# queue is empty. handle_token calls this before yielding a non-keyword
# token, and each_token calls it once more at the end of the input.
#
# block:: the block to yield the queued tokens to.
def empty_keyword_queue!(&block) # :yields: SQLTree::Token
  until @keyword_queue.empty?
    queued = @keyword_queue.shift
    block.call(queued)
  end
end
# Walks over the input one character at a time and yields every token
# that is recognized. The first character of a token decides which
# tokenize_* helper (or fixed punctuation token) handles it; the helpers
# consume any further characters that belong to the same token.
#
# Whitespace is skipped. A character that matches none of the branches
# is skipped silently as well.
#
# This method is aliased to :each so that the Enumerable methods work
# on the tokenizer.
def each_token(&block) # :yields: SQLTree::Token
  while next_char
    case current_char
    when /^\s?$/
      next # whitespace (or the empty string at the end): nothing to emit
    when '('
      handle_token(SQLTree::Token::LPAREN, &block)
    when ')'
      handle_token(SQLTree::Token::RPAREN, &block)
    when '.'
      handle_token(SQLTree::Token::DOT, &block)
    when ','
      handle_token(SQLTree::Token::COMMA, &block)
    when /\d/
      tokenize_number(&block)
    when "'"
      tokenize_quoted_string(&block)
    when OPERATOR_CHARS
      tokenize_operator(&block)
    when /\w/
      tokenize_keyword(&block)
    when '"'
      tokenize_quoted_variable(&block) # TODO: allow MySQL quoting mode
    end
  end

  # Keywords at the very end of the input are still waiting on the queue.
  empty_keyword_queue!(&block)
end
alias each each_token
# Tokenizes a keyword in the code. This can either be a reserved SQL
# keyword or a variable name.
#
# Starting at the current character, all directly following word
# characters are consumed. If the upcased word is listed in
# SQLTree::Token::KEYWORDS, the constant of that name on SQLTree::Token
# is used as the token; otherwise the word (in its original case) is
# wrapped in a SQLTree::Token::Variable. Either way the token is passed
# to handle_token, so keywords may be yielded with a delay.
def tokenize_keyword(&block) # :yields: SQLTree::Token
  word = current_char
  while /[\w]/ =~ peek_char
    word << next_char
  end

  constant_name = word.upcase
  token =
    if SQLTree::Token::KEYWORDS.include?(constant_name)
      SQLTree::Token.const_get(constant_name)
    else
      SQLTree::Token::Variable.new(word)
    end
  handle_token(token, &block)
end
# Tokenizes a number (either an integer or a float) in the SQL stream.
#
# The literal starts with the current character (a digit, or a minus
# sign when called from tokenize_operator) and is extended with every
# following digit and with at most one decimal point. A second decimal
# point ends the number. A literal that contains a decimal point is
# converted with to_f, any other literal with to_i, and the resulting
# SQLTree::Token::Number is passed to handle_token.
def tokenize_number(&block) # :yields: SQLTree::Token::Number
  digits = current_char
  seen_dot = false

  loop do
    upcoming = peek_char
    if upcoming == '.'
      break if seen_dot
      seen_dot = true
    elsif !(/\d/ =~ upcoming)
      break
    end
    digits << next_char
  end

  value = seen_dot ? digits.to_f : digits.to_i
  handle_token(SQLTree::Token::Number.new(value), &block)
end
# Reads a quoted string token from the SQL stream. The current character
# is expected to be the opening single quote; characters are consumed up
# to and including the closing quote, and the collected contents are
# passed to handle_token as an SQLTree::Token::String.
#
# Two escape mechanisms are recognized inside the string:
# * a doubled single quote ('') stands for one literal single quote, as
#   defined by the SQL standard;
# * a backslash makes the next character part of the string verbatim.
#
# If the input ends before a closing quote is found, everything read so
# far is used as the contents of the string.
def tokenize_quoted_string(&block) # :yields: SQLTree::Token::String
  # Start from a copy so the buffer is mutable even when string literals
  # are frozen.
  string = ''.dup
  until next_char.nil?
    case current_char
    when "'"
      # A lone quote closes the string; a doubled quote is an escaped quote.
      break unless peek_char == "'"
      string << next_char
    when "\\"
      string << next_char
    else
      string << current_char
    end
  end
  handle_token(SQLTree::Token::String.new(string), &block)
end
# Tokenizes a quoted variable from the SQL stream. The current character
# is expected to be the opening double quote; characters are consumed up
# to and including the closing quote, and the collected name is passed to
# handle_token as an SQLTree::Token::Variable.
#
# Two escape mechanisms are recognized inside the quotes:
# * a doubled double quote ("") stands for one literal double quote, as
#   defined by the SQL standard for delimited identifiers;
# * a backslash makes the next character part of the name verbatim.
#
# If the input ends before a closing quote is found, everything read so
# far is used as the name.
#
# The actual quote character that is used depends on the DBMS. For now,
# only the more standard double quote is accepted.
def tokenize_quoted_variable(&block) # :yields: SQLTree::Token::Variable
  # Start from a copy so the buffer is mutable even when string literals
  # are frozen.
  variable = ''.dup
  until next_char.nil?
    case current_char
    when '"' # TODO: allow MySQL quoting mode
      # A lone quote closes the name; a doubled quote is an escaped quote.
      break unless peek_char == '"'
      variable << next_char
    when "\\"
      variable << next_char
    else
      variable << current_char
    end
  end
  handle_token(SQLTree::Token::Variable.new(variable), &block)
end
# A regular expression that matches every character an operator can
# start with.
OPERATOR_CHARS = /\=|<|>|!|\-|\+|\/|\*|\%/

# Tokenizes an operator in the SQL stream, starting at the current
# character.
#
# A minus sign that is directly followed by a digit or a decimal point is
# always treated as the start of a negative number and handed over to
# tokenize_number, regardless of what preceded it. Otherwise the operator
# is extended by one character when the two-character combination is a
# key of SQLTree::Token::OPERATORS_HASH. The value found in that hash is
# upcased to obtain the name of the SQLTree::Token constant that is
# passed to handle_token.
def tokenize_operator(&block) # :yields: SQLTree::Token
  symbol = current_char
  return tokenize_number(&block) if symbol == '-' && /[\d\.]/ =~ peek_char

  symbol << next_char if SQLTree::Token::OPERATORS_HASH.has_key?(symbol + peek_char)
  constant_name = SQLTree::Token::OPERATORS_HASH[symbol].to_s.upcase
  handle_token(SQLTree::Token.const_get(constant_name), &block)
end
end