# -*- ruby -*-
#
# Copyright (C) 2009 Cathal Mc Ginley
# Copyright (C) 2011, 2014 Matijs van Zuijlen
#
# Alexandria is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Alexandria is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with Alexandria; see the file COPYING.  If not,
# write to the Free Software Foundation, Inc., 51 Franklin Street,
# Fifth Floor, Boston, MA 02110-1301 USA.

# http://en.wikipedia.org/wiki/WorldCat
# See http://www.oclc.org/worldcat/policies/terms/

# New WorldCat provider, taken from the Palatina MetaDataSource and
# modified to fit the structure of Alexandria book providers.
# (25 Feb 2009)
#
# Updated from Palatina, to reflect changes in the worldcat website.
# (1 Sep 2009)

require 'cgi'
require 'alexandria/net'
require 'alexandria/book_providers/web'

module Alexandria
  class BookProviders
    class WorldCatProvider < WebsiteBasedProvider
      # Logging mixin; presumably the source of the +log+ helper that the
      # methods of this class call.
      include Alexandria::Logging

      # Root URL of the WorldCat website. It is also prepended to the hrefs
      # scraped from search result pages to build absolute links.
      SITE = 'http://www.worldcat.org'
      # Format string for a search request: the first %s receives the
      # search-type prefix and the second %s the search term.
      BASE_SEARCH_URL = "#{SITE}/search?q=%s%s&qt=advanced" # type, term

      # Creates the provider. 'WorldCat' is passed for both arguments of the
      # superclass constructor (presumably the internal name and the display
      # name -- confirm against WebsiteBasedProvider), after which the
      # provider's preferences are loaded with +prefs.read+.
      def initialize
        super('WorldCat', 'WorldCat')
        # Left disabled: registration of an "enabled" preference.
        # prefs.add("enabled", _("Enabled"), true, [true,false])
        prefs.read
      end

      # Runs a search against WorldCat.
      #
      # criterion:: the search term (an ISBN, title, author or keyword)
      # type::      one of the SEARCH_BY_* constants
      #
      # For an ISBN search the response is handed straight to
      # +parse_result_data+ and its result is returned. For every other type
      # the result list is parsed and each listed book is fetched in turn, so
      # an array with one entry per hit is returned.
      #
      # Raises NoResultsError when a non-ISBN search lists no books.
      def search(criterion, type)
        request_url = create_search_uri(type, criterion)
        puts request_url if $DEBUG

        # Note: Alexandria::WWWAgent was tried for this request, but it
        #       caused failures here (empty pages...); it is still unknown
        #       how the two kinds of request differ.
        response = transport.get_response(URI.parse(request_url))

        return parse_result_data(response.body, criterion) if type == SEARCH_BY_ISBN

        hits = parse_search_result_data(response.body)
        raise NoResultsError if hits.empty?

        hits.map { |hit| get_book_from_search_result(hit) }
      end

      # Returns the WorldCat ISBN-search URL for +book+ (built from
      # +book.isbn+), or nil when the URL cannot be built. A failure is
      # logged as a warning rather than raised.
      def url(book)
        begin
          create_search_uri(SEARCH_BY_ISBN, book.isbn)
        rescue StandardError => error
          log.warn { "Cannot create url for book #{book}; #{error.message}" }
          nil
        end
      end

      private

      # Builds the WorldCat search URL for the given search type and term.
      #
      # search_type:: one of the SEARCH_BY_* constants; any other value is
      #               treated like a keyword search (no prefix)
      # search_term:: the raw term; ISBNs are converted with
      #               Library.canonicalise_ean, everything else is
      #               CGI-escaped
      #
      # Returns the URL as a String.
      def create_search_uri(search_type, search_term)
        # Hash#fetch with a default, so an unknown search type degrades to an
        # unprefixed search. (`value = hash[key] or ''` would not do this:
        # `or` binds looser than `=`, leaving nil to reach CGI.escape.)
        type_prefix = {
          SEARCH_BY_ISBN => 'isbn:',
          SEARCH_BY_AUTHORS => 'au:',
          SEARCH_BY_TITLE => 'ti:',
          SEARCH_BY_KEYWORD => ''
        }.fetch(search_type, '')

        encoded_term = if search_type == SEARCH_BY_ISBN
                         Library.canonicalise_ean(search_term) # isbn-13
                       else
                         CGI.escape(search_term)
                       end

        format(BASE_SEARCH_URL, CGI.escape(type_prefix), encoded_term)
      end

      # Downloads the page a search hit points at and parses it.
      #
      # result:: a hash with a :url entry, as produced by
      #          +parse_search_result_data+
      #
      # Returns whatever +parse_result_data+ returns for that page.
      def get_book_from_search_result(result)
        book_url = result[:url]
        log.debug { "Fetching book from #{book_url}" }
        response = transport.get_response(URI.parse(book_url))
        parse_result_data(response.body)
      end

      # Extracts the book hits from a WorldCat search result page.
      #
      # html:: the HTML of the result page
      #
      # Returns an array of hashes, each with :title (the text of the
      # result's name cell) and :url (the absolute link to the book's page).
      # Rows whose type icon is not the book icon are ignored, and so are
      # rows without a link. A parsing error is logged as a warning and
      # whatever was collected up to that point is returned.
      def parse_search_result_data(html)
        doc = html_to_doc(html, 'UTF-8')
        book_search_results = []
        begin
          (doc / 'td.result/div.name/..').each do |td|
            type_icon = td % 'div.type/img.icn'
            # Only rows carrying the books icon are of interest.
            next unless type_icon && type_icon['src'] =~ /icon-bks/

            name_div = td % 'div.name'
            anchor = name_div % :a
            href = anchor && anchor['href']
            # A row without a link would otherwise produce the bare site URL,
            # which is not a book page and makes the follow-up fetch fail.
            next unless href

            book_search_results << { title: name_div.inner_text,
                                     url: "#{SITE}#{href}" }
          end
        rescue => ex
          trace = ex.backtrace.join("\n> ")
          log.warn {'Failed parsing search results for WorldCat ' \
                    "#{ex.message} #{trace}" }
        end
        book_search_results
      end

      # Parses a WorldCat page into a book.
      #
      # html::        the HTML of the page
      # search_isbn:: the ISBN that was searched for, if any. When given it
      #               is used as the book's ISBN instead of the one printed
      #               on the page.
      # recursing::   true when called for one entry of a multi-result page;
      #               a further multi-result page then raises instead of
      #               being followed again.
      #
      # Returns a two-element array [book, cover_url]; cover_url is always
      # nil because the page only exposes the cover through JavaScript.
      #
      # Raises NoResultsError when WorldCat reports no results, when the page
      # has no title, when a multi-result page lists no usable entries, or
      # when parsing fails for any other reason (the cause is logged).
      def parse_result_data(html, search_isbn = nil, recursing = false)
        doc = html_to_doc(html, 'UTF-8')

        begin
          if doc % 'div#div-results-none'
            log.debug { 'WorldCat reports no results' }
            raise NoResultsError
          end

          if doc % 'table.table-results'
            if recursing
              log.warn { 'Infinite loop prevented redirecting through WorldCat' }
              raise NoResultsError
            end
            log.info { 'Found multiple results for lookup: checking each' }
            return select_from_multiple_results(html, search_isbn)
          end

          title_header = doc % 'h1.title'
          title = title_header.inner_text if title_header
          unless title
            log.warn { 'Unexpected lack of title from WorldCat lookup' }
            raise NoResultsError
          end
          log.info { "Found book #{title} at WorldCat" }

          authors_tr = doc % 'tr#details-allauthors'
          authors = authors_tr ? (authors_tr / :a).map { |a| a.inner_text } : []

          # can we do better? get the City name?? or multiple publishers?
          # Every step is nil-guarded: a page without publication details
          # still yields a book, just without publisher and year.
          bibdata = doc % 'div#bibdata'
          bibdata_table = bibdata && (bibdata % :table)
          publisher_row = bibdata_table && (bibdata_table % 'th[text()*=Publisher]/..')
          info_cell = publisher_row && (publisher_row / 'td').last
          publisher, year =
            info_cell ? extract_publisher_and_year(info_cell.inner_text) : [nil, nil]

          isbn = search_isbn
          unless isbn
            isbn_row = doc % 'tr#details-standardno' # #bibdata_table % 'th[text()*=ISBN]/..'
            if isbn_row
              isbns = (isbn_row / 'td').last.inner_text.split
              isbn = Library.canonicalise_isbn(isbns.first)
            else
              log.warn { 'No ISBN found on page' }
            end
          end

          book_binding = '' # not given on WorldCat website (as far as I can tell)

          book = Book.new(title, authors, isbn, publisher, year, book_binding)

          image_url = nil # hm, it's on the website, but uses JavaScript...

          return [book, image_url]
        rescue NoResultsError
          # Already the error callers expect; pass it on untouched.
          raise
        rescue => ex
          trace = ex.backtrace.join("\n> ")
          log.warn {'Failed parsing search results for WorldCat ' \
                    "#{ex.message} #{trace}" }
          raise NoResultsError
        end
      end

      # Fetches each entry of a multi-result page and returns the
      # [book, cover_url] pair of the first one whose ISBN equals
      # +search_isbn+ (or simply the first entry when +search_isbn+ is nil).
      # If nothing matches, the first entry is returned as an approximation.
      # Raises NoResultsError when the page lists no usable entries.
      def select_from_multiple_results(html, search_isbn)
        first_result = nil
        parse_search_result_data(html).each do |rslt|
          log.debug { "checking #{rslt[:url]}" }
          response = transport.get_response(URI.parse(rslt[:url]))

          book, cover_url = parse_result_data(response.body, search_isbn, true)
          first_result ||= [book, cover_url]

          log.debug { "got book #{book}" }

          # no constraint to match isbn, just return first result
          return [book, cover_url] unless search_isbn

          # NOTE(review): the nested parse_result_data call receives
          # search_isbn and uses it as the book's ISBN, so this comparison
          # looks like it always succeeds for the first entry -- confirm
          # whether the ISBN printed on the page was meant to be compared.
          if Library.canonicalise_ean(search_isbn) == Library.canonicalise_ean(book.isbn)
            log.info { "book #{book} is a match" }
            return [book, cover_url]
          end
          log.debug { 'not a match, checking next' }
        end

        # Nothing usable was listed: signal that the documented way rather
        # than handing nil back to the caller.
        raise NoResultsError unless first_result

        # gone through all and no ISBN match, so just return first result
        log.info { 'no more results to check. Returning first result, just an approximation' }
        first_result
      end

      # Splits WorldCat's free-form publication text into [publisher, year].
      # Either element is nil when it cannot be found in the text.
      def extract_publisher_and_year(publication_info)
        publisher_pattern = if publication_info.index(';')
                              /;[\s]*([^\d]+)[\s]*[\d]*/
                            elsif publication_info.index(':')
                              /:[\s]*([^;:,]+)/
                            else
                              /([^;,]+)/
                            end
        # String#[] with a capture index returns nil on a failed match, so a
        # missing year or publisher no longer aborts the whole parse.
        publisher = publication_info[publisher_pattern, 1]
        year_text = publication_info[/([12][0-9]{3})/, 1]
        [publisher, (year_text.to_i if year_text)]
      end
    end # class WorldCatProvider
  end # class BookProviders
end # module Alexandria