module Ultrasphinx
  class Search
    module Parser    
      # We could rewrite this in Treetop, but since it works well, what's the point? It's
      # probably faster this way. Ragel would speed it up, but be too much work.
          
      class Error < RuntimeError
      end

      OPERATORS = {
        'OR' => '|',
        'AND' => '',
        'NOT' => '-',
        'or' => '|',
        'and' => '',
        'not' => '-'
      }
            
      private
      
      def parse query
        # Alters a Google query string into Sphinx 0.97 style
        return "" if query.blank?
        # Parse
        token_hash = token_stream_to_hash(query_to_token_stream(query))  
        # Join everything up and remove some spaces
        token_hash_to_array(token_hash).join(" ").squeeze(" ").strip
      end
      

      def token_hash_to_array(token_hash)              
        query = []
        
        token_hash.sort_by do |key, value| 
          key or ""
        end.each do |field, contents|
          # First operator always goes outside
          query << contents.first.first 
          
          query << "@#{field}" if field
          query << "(" if field and contents.size > 1
          
          contents.each_with_index do |op_and_content, index|
            op, content = op_and_content
            query << op unless index == 0
            query << content
          end
          
          query << ")" if field and contents.size > 1        
        end
        
        # XXX swap the first pair if the order is reversed
        if [OPERATORS['NOT'], OPERATORS['OR']].include? query.first.upcase
          query[0], query[1] = query[1], query[0]
        end
        
        query
      end
      

      def query_to_token_stream(query)      
        # First, split query on spaces that are not inside sets of quotes or parens
        
        query = query.scan(/[^"() ]*["(][^")]*[")]|[^"() ]+/) 
      
        token_stream = []
        has_operator = false
        
        query.each_with_index do |subtoken, index|
      
          # Recurse for parens, if necessary
          if subtoken =~ /^(.*?)\((.*)\)(.*?$)/
            subtoken = query[index] = "#{$1}(#{parse $2})#{$3}"
          end 
          
          # Reappend missing closing quotes
          if subtoken =~ /(^|\:)\"/
            subtoken = subtoken.chomp('"') + '"'
          end
          
          # Strip parentheses within quoted strings
          if subtoken =~ /\"(.*)\"/
            subtoken.sub!($1, $1.gsub(/[()]/, ''))
          end
          
          # add to the stream, converting the operator
          if !has_operator
            if OPERATORS.to_a.flatten.include? subtoken and index != (query.size - 1) # Operators at the end of the string are not parsed
              token_stream << OPERATORS[subtoken] || subtoken
              has_operator = true # flip
            else
              token_stream << ""
              token_stream << subtoken
            end
          else
            if OPERATORS.to_a.flatten.include? subtoken
              # Drop extra operator
            else
              token_stream << subtoken
              has_operator = false # flop
            end
          end        
        end
        
        raise Error, "#{token_stream.inspect} is not a valid token stream" unless token_stream.size % 2 == 0        
        token_stream.in_groups_of(2) 
      end
      
      
      def token_stream_to_hash(token_stream)
        token_hash = Hash.new([])        
        token_stream.map do |operator, content|
          # Remove some spaces
          content.gsub!(/^"\s+|\s+"$/, '"')
          # Convert fields into sphinx style, reformat the stream object
          if content =~ /(.*?):(.*)/
            token_hash[$1] += [[operator, $2]]
          else
            token_hash[nil] += [[operator, content]]
          end        
        end
        token_hash
      end


    end
  end
end