# The Tokenizer module adds methods to the query language compiler that transforms a query string
# into a stream of tokens, which are more appropriate for parsing a query string.
module ScopedSearch::QueryLanguage::Tokenizer

  # All keywords that the language supports
  KEYWORDS = { 'and' => :and, 'or' => :or, 'not' => :not, 'set?' => :notnull, 'has' => :notnull, 'null?' => :null,  'before' => :lt, 'after' => :gt, 'at' => :eq }

  # Every operator the language supports.
  OPERATORS = { '&' => :and, '|' => :or, '&&' => :and, '||' => :or, '-'=> :not, '!' => :not, '~' => :like, '!~' => :unlike,
      '=' => :eq, '==' => :eq, '!=' => :ne, '<>' => :ne, '>' => :gt, '<' => :lt, '>=' => :gte, '<=' => :lte, '^' => :in, '!^' => :notin }

  # Tokenizes the string and returns the result as an array of tokens.
  def tokenize
    @current_char_pos = -1
    to_a
  end

  # Returns the current character of the string
  def current_char
    @current_char
  end

  # Returns a following character of the string (by default, the next
  # character), without updating the position pointer.
  def peek_char(amount = 1)
    @str[@current_char_pos + amount, 1]
  end

  # Returns the next character of the string, and moves the position
  # pointer one step forward
  def next_char
    @current_char_pos += 1
    @current_char = @str[@current_char_pos, 1]
  end

  # Tokenizes the string by iterating over the characters.
  def each_token(&block)
    while next_char
      case current_char
      when /^\s?$/; # ignore
      when '(';  yield(:lparen)
      when ')';  yield(:rparen)
      when ',';  yield(:comma)
      when /\&|\||=|<|>|\^|!|~|-/;  tokenize_operator(&block)
      when '"';                  tokenize_quoted_keyword(&block)
      else;                      tokenize_keyword(&block)
      end
    end
  end

  # Tokenizes an operator that occurs in the OPERATORS hash
  def tokenize_operator(&block)
    operator = current_char
    operator << next_char if OPERATORS.has_key?(operator + peek_char)
    yield(OPERATORS[operator])
  end

  # Tokenizes a keyword, and converts it to a Symbol if it is recognized as a
  # reserved language keyword (the KEYWORDS array).
  def tokenize_keyword(&block)
    keyword = current_char
    keyword << next_char while /[^=~<>\s\&\|\)\(,]/ =~ peek_char
    KEYWORDS.has_key?(keyword.downcase) ? yield(KEYWORDS[keyword.downcase]) : yield(keyword)
  end

  # Tokenizes a keyword that is quoted using double quotes. Allows escaping
  # of double quote characters by backslashes.
  def tokenize_quoted_keyword(&block)
    keyword = ""
    until next_char.nil? || current_char == '"'
      keyword << (current_char == "\\" ? next_char : current_char)
    end
    yield(keyword)
  end

  alias :each :each_token

end