# Copyright (C) 2004 Laurent Sansonetti
# Copyright (C) 2007 Laurent Sansonetti and Marco Costantini
# Copyright (C) 2009 Cathal Mc Ginley
# Copyright (C) 2011, 2014, 2016 Matijs van Zuijlen
#
# Alexandria is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Alexandria is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with Alexandria; see the file COPYING.  If not,
# write to the Free Software Foundation, Inc., 51 Franklin Street,
# Fifth Floor, Boston, MA 02110-1301 USA.

# Adapted code from 'bn.rb' (I hope that it works!)

# Almost completely rewritten by Cathal Mc Ginley (21 Feb 2009)
# based on the new code for Palatina

require 'net/http'
require 'cgi'
require 'alexandria/book_providers/web'

module Alexandria
  class BookProviders
    # Book provider that scrapes the Livraria Siciliano website: it runs a
    # search, collects the listed results and then parses each result's
    # product page into a Book plus an optional cover image link.
    class SicilianoProvider < WebsiteBasedProvider
      include Logging

      SITE = 'http://www.siciliano.com.br'.freeze

      # The string interpolations in this URL are the search term and search
      # type, respectively.
      BASE_SEARCH_URL = "#{SITE}/pesquisaweb/pesquisaweb.dll/pesquisa?" \
                        '&FIL_ID=102' \
                        '&PALAVRASN1=%s' \
                        '&FILTRON1=%s' \
                        '&ESTRUTN1=0301&ORDEMN2=E'.freeze

      # Registers the provider under its short and full names and loads its
      # preferences.
      def initialize
        super('Siciliano', 'Livraria Siciliano (Brasil)')
        # no preferences for the moment
        prefs.read
      end

      # Downloads the product page referenced by a search result and parses it.
      #
      # @param result [Hash] a search result as built by
      #   parse_search_result_data (needs at least :url; :publisher is used
      #   when present)
      # @return [Array, nil] whatever parse_result_data returns: a
      #   [book, cover_link] pair, or nil when the page could not be parsed
      def get_book_from_search_result(result)
        log.info { "Fetching book from #{result[:url]}" }
        html_data = transport.get(URI.parse(result[:url]))
        parse_result_data(html_data, result)
      end

      # Searches the site and fetches the matching book(s).
      #
      # An ISBN search is attempted twice: first with the ISBN-13 form and,
      # if that yields no results, once more with the ISBN-10 form.
      #
      # @param criterion [String] the search term
      # @param type one of the SEARCH_BY_* constants
      # @return for SEARCH_BY_ISBN, the outcome of fetching the first result
      #   (a [book, cover_link] pair, or nil if its page failed to parse);
      #   for every other type, an Array of such pairs with unparseable
      #   pages left out
      # @raise [NoResultsError] when the search page lists no results
      def search(criterion, type)
        criterion = criterion.encode('ISO-8859-1') # still needed??
        trying_again = false
        begin
          req = create_search_uri(type, criterion, trying_again)
          log.debug { "#{name} #{trying_again ? 'retrying ' : ''}request = #{req}" }
          data = transport.get(URI.parse(req))
          results = parse_search_result_data(data)
          raise NoResultsError if results.empty?

          if type == SEARCH_BY_ISBN
            get_book_from_search_result(results.first)
          else
            # parse_result_data yields nil for pages it cannot parse; do not
            # hand those placeholders on to the caller.
            results.map { |result| get_book_from_search_result(result) }.compact
          end
        rescue NoResultsError
          # Only an ISBN search gets a second chance, and only one.
          raise if type != SEARCH_BY_ISBN || trying_again

          trying_again = true
          retry
        end
      end

      # the new Siciliano website no longer has direct links to books by their ISBN
      # (the permalink now seems to be based on the product id)
      def url(_book)
        nil
      end

      private

      # Builds the search URL for the given search type and term.
      #
      # @param search_type one of the SEARCH_BY_* constants; anything else is
      #   treated like a keyword search (code 'X')
      # @param search_term [String] the raw search term
      # @param trying_again [Boolean] for ISBN searches, selects the ISBN-10
      #   form instead of the ISBN-13 form
      # @return [String] BASE_SEARCH_URL with term and type code filled in
      def create_search_uri(search_type, search_term, trying_again = false)
        # fetch (rather than `[] || 'X'` around the assignment) makes sure the
        # fallback code really ends up in the variable.
        search_type_code = {
          SEARCH_BY_ISBN => 'G',
          SEARCH_BY_TITLE => 'A',
          SEARCH_BY_AUTHORS => 'B',
          SEARCH_BY_KEYWORD => 'X'
        }.fetch(search_type, 'X')

        search_term_encoded =
          if search_type != SEARCH_BY_ISBN
            CGI.escape(search_term)
          elsif trying_again
            # on second attempt, try ISBN-10...
            Library.canonicalise_isbn(search_term) # isbn-10
          else
            # search by ISBN-13 first
            Library.canonicalise_ean(search_term) # isbn-13
          end

        format(BASE_SEARCH_URL, search_term_encoded, search_type_code)
      end

      # Extracts the list of results from a search page.
      #
      # Items that cannot be parsed are logged and skipped.
      #
      # @param html [String] the search page markup
      # @return [Array<Hash>] one Hash per result with the keys :title and
      #   :url, plus :author and :publisher when they could be read
      def parse_search_result_data(html)
        # The layout...
        # td[@class="normal"]
        #   span[@class="vitrine_nome_produto"]
        #     a (title and link to 'product page')
        #   br
        #   TEXT --> author / publisher
        #   br
        #   div[@class="vitrine_preco_por"] (price info)
        # NOTE(review): the selector used below is
        # 'div.pesquisa-item-lista-conteudo', so the sketch above may describe
        # an older page layout -- confirm against the live site.

        doc = html_to_doc(html)
        book_search_results = []
        # each result will be a dict with keys :title, :author, :publisher, :url

        doc.search('div.pesquisa-item-lista-conteudo').each do |item|
          begin
            result = {}

            # author & publisher: the first non-blank text node directly
            # under the item, formatted as "author / publisher"
            author, publisher = first_non_empty_text_node(item).split('/')
            result[:author] = author.strip if author
            result[:publisher] = publisher.strip if publisher

            # title & url
            link = item % 'a'
            result[:title] = link.inner_text.strip
            link_to_description = link['href'].to_s
            slash = link_to_description.start_with?('/') ? '' : '/'
            result[:url] = "#{SITE}#{slash}#{link_to_description}"

            book_search_results << result
          rescue StandardError => ex
            trace = ex.backtrace.join("\n> ")
            log.error { "Failed parsing Siciliano search page #{ex.message}\n#{trace}" }
          end
        end

        book_search_results
      end

      # Parses a product page into a book and its cover image link.
      #
      # Any error raised while parsing (including the NoResultsError raised
      # here when the title block is missing) is logged and turned into nil.
      #
      # @param html [String] the product page markup
      # @param search_result [Hash] the search result that led here; its
      #   :publisher entry supplies the publisher
      # @return [Array, nil] [book, cover_link] where cover_link is the first
      #   ImgSrc value found in the page's scripts (nil if none), or nil when
      #   parsing failed
      def parse_result_data(html, search_result)
        # checked against Siciliano website 21 Feb 2009
        doc = html_to_doc(html)

        # title
        title_div = doc % 'div#conteudo//div.titulo'
        raise NoResultsError unless title_div

        title_h = title_div % 'h2'
        title = title_h.inner_text if title_h
        # title = first_non_empty_text_node(title_div)

        # author_spans = doc/'span.rotulo'
        authors = (title_div / 'h3.autor').map { |h| h.inner_text.strip }

        ## synopsis_div = doc % 'div#sinopse'

        details_div = doc % 'div#tab-caracteristica'
        details = string_array_to_map(lines_of_text_as_array(details_div))

        # ISBN
        isbn = details['ISBN']
        ## ean = details["CdBarras"]

        translator = details['Tradutor']
        authors << translator if translator

        cover_binding = details['Acabamento']
        publisher = search_result[:publisher]

        # publish year
        # ('Edio' is what string_array_to_map leaves of the label once it has
        # dropped every character outside a-z/A-Z)
        publish_year = nil
        edition = details['Edio']
        if edition && edition =~ /([12][0-9]{3})/
          # publication date
          publish_year = Regexp.last_match(1).to_i
        end

        # cover
        # ImgSrc[1]="/imagem/imagem.dll?pro_id=1386929&PIM_Id=658849";
        image_urls = []
        (doc / 'script').each do |script|
          next if script.children.nil?

          script.children.each do |ch|
            # [^"]+ stops at the closing quote, so two assignments on one
            # line are not swallowed into a single match.
            image_urls << Regexp.last_match(1) if ch.to_s =~ /ImgSrc\[\d\]="([^"]+)";/
          end
        end

        book = Book.new(title, authors, isbn, publisher, publish_year, cover_binding)
        [book, image_urls.first]
      rescue StandardError => ex
        trace = ex.backtrace.join("\n> ")
        log.error { "Failed parsing Siciliano product page #{ex.message}\n#{trace}" }
        nil
      end

      # Returns the stripped content of the first direct child text node of
      # elem that is not blank, or '' when there is none.
      def first_non_empty_text_node(elem)
        text = ''
        elem.children.each do |node|
          next unless node.text?

          text = node.to_s.strip
          break unless text.empty?
        end
        text
      end

      # Splits the content of elem into lines at its <br> children.
      #
      # Text nodes are taken verbatim, other child elements contribute their
      # inner text; each line is stripped and blank lines are dropped.
      #
      # @return [Array<String>]
      def lines_of_text_as_array(elem)
        lines = []
        current_text = ''
        elem.children.each do |e|
          if e.text?
            current_text += e.to_s
          elsif e.name == 'br'
            lines << current_text.strip
            current_text = ''
          else
            current_text += e.inner_text
          end
        end
        lines << current_text.strip
        lines.reject(&:empty?)
      end

      # Turns "label: value" strings into a Hash.
      #
      # Each string is split at its first colon only, so values that contain
      # colons themselves stay intact; strings without a colon are ignored.
      # Keys are reduced to their a-z/A-Z characters.
      #
      # @param arr [Array<String>]
      # @return [Hash{String => String}]
      def string_array_to_map(arr)
        arr.each_with_object({}) do |str, map|
          key, val = str.split(':', 2)
          next unless val

          # a real hack for not handling encoding properly :^)
          map[key.gsub(/[^a-zA-Z]/, '')] = val.strip
        end
      end

      # def binding_type(binding) # portuguese string
      #  {"brochura" => :paperback,
      #   "encadernado" => :hardback}[binding.downcase] or :unknown
      # end
    end
  end
end