require 'nokogiri'

require 'http_client_patch/include_client'
require 'httpclient'

# Attempt to search using the WorldCat Search SRU variant, asking API for
# results in DC format. We'll see how far this takes us. 
#
# Does require an API key, and requires OCLC membership/FirstSearch subscription
# for access. 
#
# link is set to worldcat.org link. Change config link_base_url to, say,
# link to a worldcat local instance. 
#
# == Limitations
# Worldcat SRU API provides _very little_ usable data on format/type. We provide
# some limited heuristics to try and clean up what IS there, but user-displayable
# format_str may be weird sometimes (and is frequently 'Text'), and machine
# readable semantic #format is often defaulted to "Book", which may not
# always be right. 
#
# WorldCat doesn't let you paginate past start_record 9999. If client asks,
# this engine will silently reset to 9999.
#
# == API Docs
# * http://oclc.org/developer/documentation/worldcat-search-api/using-api
# * http://oclc.org/developer/documentation/worldcat-search-api/sru
# * http://oclc.org/developer/documentation/worldcat-search-api/parameters
# * http://oclc.org/developer/documentation/worldcat-search-api/service-levels
# * http://oclc.org/developer/documentation/worldcat-search-api/complete-list-indexes
#
# == Required configuration keys
# * api_key
#
# == Optional configuration keys
# [frbrGrouping]   default nil, use worldcat default (which is 'on'). 
#                  See http://oclc.org/developer/documentation/worldcat-search-api/parameters
#                  for meaning of frbrGrouping. set to true or false. 
# [auth]           default false. Set to true to assume all users are authenticated
#                  and servicelevel=full for OCLC. 
#
# == Extra search args
#
# [auth]           default false. Set to true to specify current user is authenticated
#                  and servicelevel=full for OCLC. Overrides config 'auth' value.  
#
class BentoSearch::WorldcatSruDcEngine
  include BentoSearch::SearchEngine

  extend HTTPClientPatch::IncludeClient
  include_http_client

  # At least as of Sep 2012, worldcat errors if you ask for pagination
  # beyond this start record.
  MaxStartRecord = 9999

  # Runs the search described by `args` against the configured SRU URL and
  # returns a BentoSearch::Results.
  #
  # * On a non-200 HTTP response, the returned results carry an `error` hash
  #   with :status, :info (the response body) and :url, and no items.
  # * If the response reports zero records AND contains an SRU diagnostic,
  #   the diagnostic's XML is stored in error[:info].
  # * Otherwise each oclcdcs record in the response becomes one
  #   BentoSearch::ResultItem appended to the results.
  def search_implementation(args)
    url = construct_query_url(args)

    results = BentoSearch::Results.new

    response = http_client.get(url)

    # check for http errors
    if response.status != 200
      results.error ||= {}
      results.error[:status] = response.status
      results.error[:info] = response.body
      results.error[:url] = url

      return results
    end

    xml = Nokogiri::XML(response.body)
    # namespaces only get in the way
    xml.remove_namespaces!

    # nil if the response has no numberOfRecords element at all.
    results.total_items = xml.at_xpath("//numberOfRecords").try { |n| n.text.to_i }

    # check for SRU fatal errors, no results AND a diagnostic message
    # is a fatal error always, I think.
    error_xml = xml.at_xpath("./searchRetrieveResponse/diagnostics/diagnostic")
    if results.total_items == 0 && error_xml
      results.error ||= {}
      results.error[:info] = error_xml.children.to_xml
    end

    xml.xpath("/searchRetrieveResponse/records/record/recordData/oclcdcs").each do |record|
      item = BentoSearch::ResultItem.new

      item.title = first_text_if_present record, "title"

      # May have one (or more?) 'creator' and one or more 'contributor'.
      # We'll use just creators if we got em, else contributors.
      authors = record.xpath("./creator")
      authors = record.xpath("./contributor") if authors.empty?
      authors.each do |auth_node|
        item.authors << BentoSearch::Author.new(:display => auth_node.text)
      end

      # date may have garbage in it, just take the first four digits
      # (nil when there is no date element or no run of four digits).
      item.year = record.at_xpath("date").try { |date_node| date_node.text[/\d{4}/] }

      # weird garbled from MARC format, best we have
      item.format, item.format_str = format_heuristics(record)

      item.publisher = first_text_if_present record, "publisher"

      # OCLC DC format gives us a bunch of jumbled 'description' elements
      # with any Marc 5xx. Sigh. We'll just concat em all and call it an
      # abstract, best we can do.
      item.abstract = record.xpath("description").collect { |n| n.text }.join("... \n")

      # dc.identifier is a terrible smorgasbord of different identifiers,
      # with no way to tell for sure what's what other than pattern matching
      # of literals. sigh. Only the FIRST identifier element is examined.
      if (id = first_text_if_present(record, "identifier"))
        possible_isxn = id.scan(/\d|X/).join('')
        # we could test check digit validity, but we ain't
        if possible_isxn.length == 10 || possible_isxn.length == 13
          item.isbn = possible_isxn
        elsif possible_isxn.length == 8
          item.issn = possible_isxn
        end
      end

      # The recordIdentifier with no "xsi:type" attrib is an oclcnum. sigh.
      # lccn may also be in there if we wanted to keep it.
      item.oclcnum = first_text_if_present(record, "./recordIdentifier[not(@type)]")
      # oclcnum is our engine-specific unique id too.
      item.unique_id = item.oclcnum

      item.link = "#{configuration.linking_base_url}#{item.oclcnum}"

      item.language_code = first_text_if_present record, "./language[@type='http://purl.org/dc/terms/ISO639-2']"

      results << item
    end

    return results
  end

  # get a single record, by its #unique_id (which is also an oclcnum),
  # returns record, or raises BentoSearch::NotFound, BentoSearch::TooManyFound,
  # or possibly something weird (whatever exception the failed search
  # recorded in error[:exception], else a generic Exception wrapping the
  # error hash).
  def get(id)
    results = search(id, :semantic_search_field => :oclcnum)

    raise(results.error[:exception] || Exception.new(results.error)) if results.failed?
    raise BentoSearch::NotFound.new("ID: #{id}") if results.total_items == 0
    # Interpolate the local `id`; the constant `ID` does not exist and would
    # turn this into a NameError instead of TooManyFound.
    raise BentoSearch::TooManyFound.new("ID: #{id}") if results.total_items > 1

    return results.first
  end

  # Builds the full SRU request URL (a String) from search args and
  # engine configuration.
  #
  # Reads args[:per_page], args[:start], args[:sort], args[:auth], plus
  # whatever construct_cql_query reads.
  #
  # Note, if pagination start record is beyond what we think is worldcat's
  # max, it will silently reset to max, and mutate the args passed in
  # (args[:start] and args[:page]) so pagination appears to be at max too!
  def construct_query_url(args)
    url = configuration.base_url
    url += "&wskey=#{CGI.escape configuration.api_key}"
    url += "&recordSchema=#{CGI.escape 'info:srw/schema/1/dc'}"

    url += "&maximumRecords=#{args[:per_page]}" if args[:per_page]

    # pagination, WorldCat 'start' is 1-based, ours is 0-based. Catch max.
    if args[:start] && args[:start] > (MaxStartRecord - 1)
      args[:start] = MaxStartRecord - 1
      args[:page] = (args[:start] / (args[:per_page] || 10)) + 1
    end
    url += "&startRecord=#{args[:start] + 1}" if args[:start]

    url += "&query=#{CGI.escape construct_cql_query(args)}"

    # Unknown sort keys (not in sort_definitions) are silently ignored.
    if (args[:sort]) && (value = sort_definitions[args[:sort]].try { |h| h[:implementation] })
      url += "&sortKeys=#{CGI.escape value}"
    end

    # nil means "don't send the param at all", leaving the API's own default.
    unless configuration.frbrGrouping.nil?
      value = configuration.frbrGrouping ? "on" : "off"
      url += "&frbrGrouping=#{value}"
    end

    # service level? search arg over-rides config
    auth = args[:auth]
    auth = configuration.auth if auth.nil?
    if auth
      url += "&servicelevel=full"
    end

    return url
  end

  # input is a nokogiri node for a recordData/oclcdcs representing a hit. (with
  # namespaces stripped).
  #
  # output is [format, format_str], based on rough guess heuristics of what
  # we can do, OCLC does not provide particularly useful data here for either
  # user display passthrough OR semantics, this is inherently flawed but better
  # than nothing.
  def format_heuristics(record_xml)
    # default semantic format to "Book", it'll sometimes be wrong,
    # but right more often than it's wrong when we lack sufficient
    # info to know otherwise.
    format = "Book"
    # user display string, default to none, unless we come up with something.
    format_str = nil

    if xpath_contains(record_xml, "./subject", "--Periodicals")
      # if a subject includes "--Periodicals", we're going to guess it's
      # a serial/journal.
      format = :serial
      format_str = "Journal or Serial"
    elsif record_xml.xpath("./type[text()='Image']").length > 0
      # "Image" can mean video OR actual images, only thing we
      # can do really for user-presentable format is use the terrible "./format",
      # which will often tell the user more (along with a bunch of weird stuff).
      format_str = first_text_if_present(record_xml, "./format")
    elsif record_xml.xpath("./type[text()='Sound']").length > 0
      # No great thing to display to user to say what this really is,
      # but at least we know it's Sound.
      format_str = first_text_if_present(record_xml, "./format") || "Sound"
      format = "AudioObject"
    elsif record_xml.xpath("./description").find { |node| node.text =~ /^Thesis \([^)]+\)--/ }
      # yeah, to tag it as a dissertation we've got to heuristically regex
      # a description value for looking like a thesis label.
      format = :dissertation
      format_str = "Dissertation/Thesis"
    elsif (type = first_text_if_present(record_xml, "./type"))
      # defaults,
      # If we have a type, titleize it to change things like MovingImage to
      # 'Moving Image'.
      format_str = type.titleize
    else
      # if we don't even have a 'type', use the 'format' if it's there,
      # even though it's gonna be weird.
      format_str = first_text_if_present(record_xml, "format")
    end

    return [format, format_str]
  end

  # Text of the first node matching `xpath` under `node`, or nil if
  # nothing matches.
  def first_text_if_present(node, xpath)
    node.at_xpath(xpath).try { |n| n.text }
  end

  # if `node` has an `xpath` whose text() contains `text`.
  # uses some tricky xpath, may not work with unusual xpath passed in.
  # `text` is interpolated into a single-quoted xpath literal, so it
  # must not itself contain a single quote.
  def xpath_contains(node, xpath, text)
    node.xpath(xpath).xpath("./text()[contains(.,'#{text}')]").length > 0
  end

  # construct valid CQL for the API's "query" param, from search
  # args. Tricky because we need to split terms/phrases ourselves
  #
  # Reads args[:query] (required) and args[:search_field] (optional).
  #
  # returns CQL that is NOT uri escaped yet.
  def construct_cql_query(args)
    # default is srw.kw, Keyword anywhere.
    field = args[:search_field] || "srw.kw"

    # We need to split terms and phrases, so we can formulate
    # CQL with separate clauses for each, bah. The capture group keeps
    # double-quoted phrases as single tokens (quotes included).
    tokens = args[:query].split(%r{\s|("[^"]+")}).delete_if { |a| a.blank? }

    clauses = tokens.collect do |token|
      quoted_token = nil
      if token =~ /^".*"$/
        # phrase
        quoted_token = token
      else
        # escape internal double quotes with single backslash. sorry ruby escaping
        # makes this crazy.
        token = token.gsub('"', %Q{\\"})
        quoted_token = %Q{"#{token}"}
      end

      "#{field} = #{quoted_token}"
    end

    clauses.join(" AND ")
  end

  # date sort seems to work pretty terribly on worldcat.
  # Author, Title, and "Score" (don't know what that is) also
  # avail on worldcat, asc and desc, but we aren't advertising here,
  # cause, who needs em.
  def sort_definitions
    {
      "relevance" => {:implementation => "relevance"},
      "date_desc" => {:implementation => "Date,,0"},
      "library_count_desc" => {:implementation => "Library Count,,0"}
    }
  end

  # WorldCat offers more search fields than this, this is what we
  # think is useful right now. Some WorldCat search fields are only
  # available at 'full' service level, but we think all the ones
  # we're listing now are available even at 'default' service level.
  def search_field_definitions
    {
      nil           => {:semantic => :general},
      "srw.ti"      => {:semantic => :title},
      "srw.au"      => {:semantic => :author},
      "srw.su"      => {:semantic => :subject},
      "srw.bn"      => {:semantic => :isbn},
      # Oddly no ISSN index, all we get is 'number'
      "srw.sn"      => {:semantic => :number},
      "srw.no"      => {:semantic => :oclcnum}
    }
  end

  # Largest per-page value this engine advertises.
  def max_per_page
    100
  end

  # Configuration keys that must be supplied by the application.
  def self.required_configuration
    [:api_key]
  end

  # Defaults for optional configuration keys.
  def self.default_configuration
    {
      :base_url => "http://www.worldcat.org/webservices/catalog/search/sru?",
      :linking_base_url => "http://worldcat.org/oclc/",
      :auth => false
    }
  end

end