# Copyright (C) 2009 Cathal Mc Ginley
# Copyright (C) 2014 Matijs van Zuijlen
#
# Alexandria is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Alexandria is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with Alexandria; see the file COPYING. If not,
# write to the Free Software Foundation, Inc., 51 Franklin Street,
# Fifth Floor, Boston, MA 02110-1301 USA.

# http://de.wikipedia.org/wiki/Thalia_%28Buchhandel%29
# Thalia.de bought the Austrian book trade chain Amadeus

# New Thalia provider, taken from Palatina MetaDataSource and modified
# for Alexandria. (21 Dec 2009)

require 'net/http'
require 'cgi'
require 'alexandria/book_providers/web'

module Alexandria
  class BookProviders
    # Book provider that looks books up by scraping the Thalia (Germany)
    # web shop search and article pages.
    #
    # NOTE(review): +transport+, +html_to_doc+, +prefs+ and the SEARCH_BY_*
    # constants are not defined in this file; presumably they come from
    # WebsiteBasedProvider or its ancestors -- confirm there.
    class ThaliaProvider < WebsiteBasedProvider
      include Alexandria::Logging

      SITE = 'http://www.thalia.de'
      BASE_SEARCH_URL = "#{SITE}/shop/bde_bu_hg_startseite/suche/?%s=%s" # type,term

      # Matches the ISBN segment of an article URL, e.g.
      # .../dave_thomas/ISBN0-9745140-5-5/ID6017044.html
      # The trailing check character of an ISBN-10 may be 'X', so it is
      # accepted alongside digits and dashes.
      ISBN_IN_URL = %r{/ISBN(\d[\dXx-]*)/}

      def initialize
        super('Thalia', 'Thalia (Germany)')
        # no preferences for the moment
        prefs.read
      end

      # Returns the search URL that looks up +book+ by its ISBN.
      def url(book)
        create_search_uri(SEARCH_BY_ISBN, book.isbn)
      end

      # Runs a search on the Thalia site.
      #
      # For an ISBN search the response is handed straight to
      # #parse_result_data. For every other search type the result list is
      # parsed and each hit is fetched individually.
      #
      # Raises NoResultsError when a non-ISBN search yields no hits.
      def search(criterion, type)
        req = create_search_uri(type, criterion)
        puts req if $DEBUG
        html_data = transport.get_response(URI.parse(req))
        if type == SEARCH_BY_ISBN
          parse_result_data(html_data.body, criterion)
        else
          results = parse_search_result_data(html_data.body)
          raise NoResultsError if results.empty?

          results.map { |result| get_book_from_search_result(result) }
        end
      end

      # Builds the search URL for the given search type and term.
      #
      # An unknown +search_type+ maps to an empty type code. The previous
      # `... or ''` form never applied that default, because `or` binds
      # more loosely than `=`, leaving nil to be passed to CGI.escape.
      def create_search_uri(search_type, search_term)
        search_type_code = {
          SEARCH_BY_ISBN => 'sq',
          SEARCH_BY_AUTHORS => 'sa', # Autor
          SEARCH_BY_TITLE => 'st', # Titel
          SEARCH_BY_KEYWORD => 'ssw' # Schlagwort
        }.fetch(search_type, '')
        search_type_code = CGI.escape(search_type_code)
        search_term_encoded =
          if search_type == SEARCH_BY_ISBN
            # search_term_encoded = search_term.as_isbn_13
            Library.canonicalise_isbn(search_term) # check this!
          else
            CGI.escape(search_term)
          end
        BASE_SEARCH_URL % [search_type_code, search_term_encoded]
      end

      # Extracts the hits from a search result page.
      #
      # Returns an array of hashes with the keys :title and :lookup_url,
      # one per 'div.articlePresentationSearchCH' element.
      def parse_search_result_data(html)
        doc = html_to_doc(html)
        (doc / 'div.articlePresentationSearchCH').map do |div|
          title_link = div % 'div.articleText/h2/a'
          { title: title_link.inner_html, lookup_url: title_link['href'] }
        end
      end

      # Returns the stripped text that sits next to the <strong> label
      # containing +label_text+ (the text children of the label's parent),
      # or '' when no such label exists under +node+.
      def data_from_label(node, label_text)
        label_node = node % "strong[text()*='#{label_text}']"
        # A missing label must not abort the whole parse; treat it as
        # "no data" just like a label without a parent.
        item_node = label_node && label_node.parent
        return '' unless item_node

        item_node.children.select(&:text?).map(&:to_html).join.strip
      end

      # Fetches the page behind a search hit and parses it as a book page.
      def get_book_from_search_result(result)
        log.debug { "Fetching book from #{result[:lookup_url]}" }
        html_data = transport.get_response(URI.parse(result[:lookup_url]))
        parse_result_data(html_data.body, 'noisbn', true)
      end

      # Parses a page that is either a single book page or a result list.
      #
      # When the page is a result list, the hit whose URL carries +isbn+
      # (or the first hit as a fallback) is fetched and parsed once more
      # with +recursing+ set, so that the lookup cannot loop endlessly.
      #
      # Returns a two-element array [book, image_url]; returns nil when
      # a result list shows up while already recursing, or when the page
      # has no 'div#contentFull' element.
      #
      # Raises NoResultsError when extracting the book details fails.
      def parse_result_data(html, isbn, recursing = false)
        doc = html_to_doc(html)
        results_divs = doc / 'div.articlePresentationSearchCH'
        unless results_divs.empty?
          # already recursing, avoid doing so endlessly second time
          # around *should* lead to a book description, not a result
          # list
          return if recursing

          # ISBN-lookup results in multiple results (trying to be
          # useful, such as for new editions e.g. 9780974514055
          # "Programming Ruby" )
          chosen = choose_search_result(parse_search_result_data(html), isbn)
          html_data = transport.get_response(URI.parse(chosen[:lookup_url]))
          return parse_result_data(html_data.body, isbn, true)
        end

        begin
          extract_book(doc)
        rescue => ex
          trace = ex.backtrace.join("\n> ")
          log.warn {
            'Failed parsing search results for Thalia ' \
            "#{ex.message} #{trace}"
          }
          raise NoResultsError
        end
      end

      private

      # Picks the hit whose lookup URL contains +isbn+; falls back to the
      # first hit when none matches.
      def choose_search_result(results, isbn)
        isbn10 = Library.canonicalise_isbn(isbn).to_s.upcase
        matching = results.find do |rslt|
          rslt[:lookup_url] =~ ISBN_IN_URL &&
            Regexp.last_match[1].delete('-').upcase == isbn10
        end
        matching || results.first # fallback!
      end

      # Builds [book, image_url] from a book page, or returns nil when the
      # page has no 'div#contentFull' element.
      def extract_book(doc)
        div = doc % 'div#contentFull'
        return unless div

        title_img = ((div % :h2) / :img).first
        title = title_img['alt']
        # note, the following img also has alt="von Author, Author..."
        authors = extract_authors(doc)

        item_details = doc % 'ul.itemDataList'
        isbns = %w[EAN ISBN].map { |label| data_from_label(item_details, label) }
        # Prefer the EAN, but do not settle for an empty value when the
        # ISBN label carries data.
        isbn = isbns.reject(&:empty?).first || ''

        date = data_from_label(item_details, 'Erschienen:')
        year = Regexp.last_match[1].to_i if date =~ /(\d{4})/

        binding = data_from_label(item_details, 'Einband')
        publisher = data_from_label(item_details, 'Erschienen bei:')

        book = Book.new(title, authors, isbn, publisher, year, binding)

        image_link = doc % 'a[@id=itemPicStart]'
        image_url = image_link && image_link['href']

        [book, image_url]
      end

      # Collects author names from the links below the "Mehr von"
      # ("More from...") heading; returns [] when the heading is absent.
      def extract_authors(doc)
        author_h = doc % 'h3[text()*="Mehr von"]'
        return [] unless author_h

        # 'sa' means search author, there may also be 'ssw' (search
        # keyword) links
        author_links = (author_h.parent / :a).select { |a| a['href'] =~ %r{BUCH/sa} }
        # NOTE stripping the little >> character here...
        author_links.map { |a| a.inner_text[0..-2].strip }
      end
    end
  end
end