require 'httpclient'
require 'http_client_patch/include_client'

require 'json'

module BentoSearch
  # DOAJ Articles search. 
  # https://doaj.org/api/v1/docs
  #
  # Phrase searches with double quotes are respected. 
  #
  # Supports #get by unique_id feature
  #
  class DoajArticlesEngine
    include BentoSearch::SearchEngine
    include ActionView::Helpers::SanitizeHelper


    class_attribute :http_timeout
    self.http_timeout = 10

    extend HTTPClientPatch::IncludeClient
    include_http_client do |client|
      client.connect_timeout = client.send_timeout = client.receive_timeout = self.http_timeout
    end

    class_attribute :base_url
    self.base_url = "https://doaj.org/api/v1/search/articles/"

    def search_implementation(arguments)
      query_url = args_to_search_url(arguments)

      results = Results.new

      begin
        Rails.logger.debug("DoajEngine: requesting #{query_url}")
        response = http_client.get( query_url )
        json = JSON.parse(response.body)
      rescue TimeoutError, HTTPClient::TimeoutError,
             HTTPClient::ConfigurationError, HTTPClient::BadResponseError,
             JSON::ParserError  => e
        results.error ||= {}
        results.error[:exception] = e
      end

      if ( response.nil? || json.nil? ||
          (! HTTP::Status.successful? response.status) ||
          (json && json["error"]))

        results.error ||= {}
        results.error[:status] = response.status if response
        results.error[:message] = json["error"] if json["error"]

        return results
      end

      results.total_items = json["total"]

      (json["results"] || []).each do |item_response|
        results <<  hash_to_item(item_response)
      end

      return results
    end

    def get(unique_id)
      results = search(unique_id, :search_field => "id")

      raise (results.error[:exception] || StandardError.new(results.error[:message] || results.error[:status])) if results.failed?
      raise BentoSearch::NotFound.new("For id: #{unique_id}") if results.length == 0
      raise BentoSearch::TooManyFound.new("For id: #{unique_id}") if results.length > 1

      results.first
    end


    def args_to_search_url(arguments)
      query = if arguments[:query].kind_of?(Hash)
        # multi-field query
        arguments[:query].collect {|field, query| fielded_query(query, field)}.join(" ")
      else
        fielded_query(arguments[:query], arguments[:search_field])
      end

      # We need to escape this for going in a PATH component,
      # not a query. So space can't be "+", it needs to be "%20",
      # and indeed DOAJ API does not like "+".
      # 
      # But neither CGI.escape nor URI.escape does quite
      # the right kind of escaping, seems to work out
      # if we do CGI.escape but then replace '+'
      # with '%20'
      escaped_query = CGI.escape(query).gsub('+', '%20')
      url = self.base_url + escaped_query

      query_args = {}

      if arguments[:per_page]
        query_args["pageSize"]  = arguments[:per_page]
      end
      
      if arguments[:page]
        query_args["page"]      = arguments[:page]
      end

      if arguments[:sort] &&
          (defn = sort_definitions[arguments[:sort]]) &&
          (value = defn[:implementation])
        query_args["sort"] = value
      end

      query = query_args.to_query
      url = url + "?" + query if query.present?

      return url
    end

    # Prepares a DOAJ API (elastic search) query component for 
    # given textual query in a given field (or default non-fielded search)
    #
    # Separates query string into tokens (bare words and phrases),
    # so they can each be made mandatory for ElasticSearch. Default
    # DOAJ API makes them all optional, with a very low mm, which
    # leads to low-precision odd looking results for standard use
    # cases. 
    #
    # Escapes all remaining special characters as literals (not including
    # double quotes which can be used for phrases, which are respected. )
    #
    # Eg:
    #     fielded_query('apple orange "strawberry banana"', field_name)
    #     # => '+field_name(+apple +orange +"strawberry banana")'
    #
    # The "+" prefixed before field-name is to make sure all separate
    # fields are also mandatory when doing multi-field searches. It should
    # make no difference for a single-field search. 
    def fielded_query(query, field = nil)
      if field.present?
        "+#{field}:(#{prepare_mandatory_terms(query)})"
      else
        prepare_mandatory_terms(query)
      end
    end

    # Takes a query string, prepares an ElasticSearch query
    # doing what we want: 
    #   * tokenizes into bare words and double-quoted phrases
    #   * Escapes other punctuation to be literal not ElasticSearch operator.
    #     (Does NOT do URI escaping)
    #   * Makes each token mandatory with an ElasticSearch "+" operator prefixed. 
    def prepare_mandatory_terms(query)      
      # use string split with regex to too-cleverly split into space
      # seperated terms and phrases, keeping phrases as unit.
      terms = query.split %r{[[:space:]]+|("[^"]+")}
      # Wound up with some empty strings, get rid of em
      terms.delete_if {|t| t.blank?}

      terms.collect {|token| "+" + escape_query(token)}.join(" ")
    end

    # Converts from item found in DOAJ results to BentoSearch::ResultItem
    def hash_to_item(hash)
      item = ResultItem.new

      bibjson = hash["bibjson"] || {}

      item.unique_id  = hash["id"]

      # Hard-code to Article, we don't get any format information
      item.format     = "Article"

      item.title      = bibjson["title"]


      item.start_page = bibjson["start_page"]
      item.end_page   = bibjson["end_page"]
      
      item.year       = bibjson["year"]
      if (year = bibjson["year"].to_i) && (month = bibjson["month"].to_i)
        if year != 0 && month != 0
          item.publication_date = Date.new(bibjson["year"].to_i, bibjson["month"].to_i)
        end
      end      

      item.abstract   = sanitize(bibjson["abstract"]) if bibjson.has_key?("abstract")

      journal           = bibjson["journal"] || {}
      item.volume       = journal["volume"]
      item.issue        = journal["number"]
      item.source_title = journal["title"]
      item.publisher    = journal["publisher"]
      item.language_str = journal["language"].try(:first)

      (bibjson["identifier"] || []).each do |id_hash|
        case id_hash["type"]
        when "doi"
          item.doi = id_hash["id"]
        when "pissn"
          item.issn = id_hash["id"]
        end
      end

      (bibjson["author"] || []).each do |author_hash|
        if author_hash.has_key?("name")
          author = Author.new(:display => author_hash["name"])
          item.authors << author
        end
      end

      # I _think_ DOAJ articles results always only have one link,
      # and it may always be of type 'fulltext'
      link_hash             = (bibjson["link"] || []).first
      if link_hash && link_hash["url"]
        item.link             = link_hash["url"]
        item.link_is_fulltext = true if link_hash["type"] == "fulltext"
      end

      return item
    end

    # Escape special chars in query, Doaj says it's elastic search,
    # punctuation that needs to be escaped and how to escape (backslash)
    # for ES documented here: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
    #
    # We do not escape double quotes, want to allow them for phrases. 
    #
    # This method does NOT return URI-escaped, it returns literal, escaped for ES. 
    def escape_query(q)
      q.gsub(/([\+\-\=\&\|\>\<\!\(\)\{\}\[\]\^\~\*\?\:\\\/])/) {|m| "\\#{$1}"}
    end


    ###########
    # BentoBox::SearchEngine API
    ###########

    def max_per_page
      100
    end

    def search_field_definitions
      { nil                     => {:semantic => :general},
        "bibjson.title"         => {:semantic => :title},
        # Using 'exact' seems to produce much better results for
        # author, don't entirely understand what's up. 
        "bibjson.author.name"   => {:semantic => :author},
        "publisher"             => {:semantic => :publisher},
        "bibjson.subject.term"  => {:semantic => :subject},
        "bibjson.journal.title" => {:semantic => :source_title},
        "issn"                  => {:semantic => :issn},
        "doi"                   => {:semantic => :doi},
        "bibjson.journal.volume"   => {:semantic => :volume},
        "bibjson.journal.number"   => {:semantic => :issue},
        "bibjson.start_page"   => {:semantic => :start_page},
        "license" => {},
        "id"      => {}
      }
    end

    def multi_field_search?
      true
    end

    def sort_definitions
      # Don't believe DOAJ supports sorting by author
      {        
        "relevance" => {:implementation => nil}, # default
        "title" => {:implementation => "title:asc"},
        # We don't quite have publication date sorting, but we'll use
        # created_date from DOAJ
        "date_desc" => {:implementation => "article.created_date:desc"},
        "date_asc"  => {:implementation => "article.created_date:asc"},
        # custom one not previously standardized
        "publication_name" => {:implementation => "bibjson.journal.title:asc"}
      }
    end

  end
end