#!/usr/bin/env ruby
# encoding: utf-8

require_relative 'base'
require_relative 'bibliographical_book_extractor'
require 'sanitize'

module Biblionet
  module Extractors
    class BookExtractor < Base
      attr_reader :book

      def initialize(uri=nil)
        super(uri)
        extract_book unless uri.nil? or @page.nil?
      end

      def load_and_extract_book(uri=nil)
        load_page(uri)
        extract_book unless uri.nil? or @page.nil?
      end
      # Converts the parsed contributors array to a hash.
      # The array must have been processed into the following form:
      #   job1: contributor1, contributor2 job2: contributor3
      # The returned hash has the form:
      #   { job1 => ["contributor1", "contributor2"], job2 => ["contributor3"] }
      def process_contributors(raw_contributors)
        contributors = Hash.new
        partners = Array.new
        job = :author
        raw_contributors.each do |cb|
          if cb.is_a?(String) and cb.end_with? ":"
            job = cb[0..-2]
            partners.clear
          else
            partners << cb
            contributors[job] = partners.clone
          end
        end unless raw_contributors.nil? or raw_contributors.empty?
        return contributors
      end
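      # Usage sketch (hypothetical entries, shaped like the output of
      # BookDataExtractor#contributors; the names and ids are made up):
      #
      #   process_contributors([{ name: 'A', b_id: '1' },
      #                         'Μετάφραση:',
      #                         { name: 'B', b_id: '2' }])
      #   #=> { :author => [{ name: 'A', b_id: '1' }],
      #   #     "Μετάφραση" => [{ name: 'B', b_id: '2' }] }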
      def process_details(details)
        details_hash = Hash.new
        details.each do |detail|
          date_regex = /(^\d{4}$)/
          status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
          detail = decode_text(detail)
          begin
            if detail =~ date_regex
              details_hash[:publication_year] = detail
            elsif detail.end_with? "σελ."
              pages = detail.gsub(/[^\d]/, '')
              details_hash[:pages] = pages
            elsif detail.start_with? "ISBN-13"
              isbn_13 = detail.gsub(/ISBN-13 /, "")
              details_hash[:isbn_13] = isbn_13
            elsif detail.start_with? "ISBN"
              isbn = detail.gsub(/ISBN /, "")
              details_hash[:isbn] = isbn
            elsif detail.start_with? "ISMN" # Special typo case: some records misspell ISBN as ISMN.
              isbn = detail.gsub(/ISMN /, "")
              details_hash[:isbn] = isbn
            elsif detail =~ status_regex
              status = detail.gsub(/\[|\]/, '')
              details_hash[:status] = status
            elsif detail.start_with? "Τιμή"
              price = detail.gsub(/[^\d,]/, '')
              details_hash[:price] = price
            elsif detail.include? '<a'
              # Heuristic: award entries are rendered as links, so treat any
              # detail that still contains an <a> tag as an award.
              award = Sanitize.clean(detail).strip
              details_hash[:awards] = [] if details_hash[:awards].nil?
              details_hash[:awards] << award
            else
              raise NoIdeaWhatThisIsError.new(@biblionet_id, detail)
            end
          rescue NoIdeaWhatThisIsError => e
            pp e
          end
        end
        return details_hash
      end
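      # Usage sketch (hypothetical detail strings, as split out of the
      # .book_details block; the ISBN is a made-up placeholder):
      #
      #   process_details(["2010", "256 σελ.", "ISBN 960-00-0000-0"])
      #   #=> { publication_year: "2010", pages: "256", isbn: "960-00-0000-0" }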
      def process_ddc(ddc, extract_parents = false)
        # Matches the [DDC: digits] part in text like: [DDC: 889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
        id_re = /(\[DDC\:\s\d*(?:[.\s]\d*)*\])/
        # Matches the [digits] and (digits) parts of the same text.
        non_text_re = /\s*(\[.*\]|\(.*\))\s*/
        # Gets the [DDC: digits] part from the text and strips everything but the digits.
        ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '')
        # Extracts the parent tree of the current DDC.
        # ddcparser.parse(ddc_id)
        # Gets the text by removing everything that is not text.
        ddc_text = ddc.gsub(non_text_re, '').strip
        ddc_hash = { ddc: ddc_id, name: ddc_text }
        return ddc_hash
      end
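      # Usage sketch (subject string shaped like the example in the comments above):
      #
      #   process_ddc("[DDC: 889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)")
      #   #=> { ddc: "889.09300", name: "Νεοελληνική λογοτεχνία - Ιστορία και κριτική" }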
      def extract_book(biblionet_id=@biblionet_id, book_page=@page)
        # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
        log = Logger.new(STDOUT)
        page = BookDataExtractor.new(book_page)

        # End the extraction if BookDataExtractor couldn't create a nodeset.
        return nil if page.nodeset.nil?

        book_hash = Hash.new

        img = nil
        begin
          img = page.image
          raise NoImageError.new(biblionet_id) if img.nil?
        rescue NoImageError => e
          pp e
          log.warn(e.message)
        rescue StandardError => e
          pp err_msg = "Error #{e} at book: #{biblionet_id}"
          log.error(err_msg)
        end
        book_hash[:title] = page.title
        book_hash[:subtitle] = page.subtitle
        book_hash[:image] = img

        contributors = process_contributors(page.contributors)
        author = contributors[:author]
        contributors.delete(:author)

        # If author is empty, maybe it's a collective work.
        if author.nil? or author.empty?
          if page.collective_work?
            # author = 'Συλλογικό έργο'
            author = ['Συλλογικό έργο']
          else
            pp err_msg = "No author has been found at book: #{biblionet_id}"
            log.warn(err_msg)
            author = []
          end
        end

        book_hash[:author] = author
        book_hash[:contributors] = contributors
        book_hash[:publisher] = page.publisher

        details = page.details
        if details.nil?
          pp err_msg = "No details at book: #{biblionet_id}"
          log.error(err_msg)
        end
        details_hash = details.nil? ? {} : process_details(details)
        # book_hash[:publication_year] = details_hash[:publication_year]
        # book_hash[:pages] = details_hash[:pages]
        book_hash[:isbn] = details_hash[:isbn]

        # Some records have no explicit ISBN-13; if the plain ISBN already has
        # 13 digits, reuse it.
        if details_hash[:isbn_13].nil?
          if present?(details_hash[:isbn]) and (details_hash[:isbn].strip.gsub('-','').length == 13)
            book_hash[:isbn_13] = book_hash[:isbn]
          else
            book_hash[:isbn_13] = nil
          end
        else
          book_hash[:isbn_13] = details_hash[:isbn_13]
        end

        # book_hash[:status] = details_hash[:status]
        # book_hash[:price] = details_hash[:price]
        book_hash[:award] = page.awards
        book_hash[:description] = page.description

        ddcs = page.ddcs.map do |ddc|
          # Extract from the href the DDC id used by biblionet. --- DDC url: http://biblionet.gr/index/id ---
          ddc_biblionet_id = ddc[:href].split(/\//).last
          # Extract the DDC id and DDC text.
          ddc = process_ddc(ddc.text)
          ddc.merge!(b_id: ddc_biblionet_id)
        end
        book_hash[:category] = ddcs
        book_hash[:b_id] = biblionet_id

        # Fetch the bibliographical details from the dedicated results page.
        uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
        bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
        bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)

        book_hash[:publisher] = bibliographical_details[:publisher]
        book_hash[:publication] = bibliographical_details[:publication]
        book_hash[:format] = bibliographical_details[:format]
        book_hash[:original_language] = bibliographical_details[:original_language]
        book_hash[:original_title] = bibliographical_details[:original_title]
        book_hash[:price] = bibliographical_details[:price]
        book_hash[:availability] = bibliographical_details[:availability]
        book_hash[:last_update] = bibliographical_details[:last_update]
        book_hash[:series] = bibliographical_details[:series]

        physical_description_hash = {}
        physical_description_hash[:pages] = details_hash[:pages]
        physical_description_hash[:size] = bibliographical_details[:physical_size]
        physical_description_hash[:cover_type] = bibliographical_details[:cover_type]
        book_hash[:physical_description] = physical_description_hash

        return @book = book_hash
      end
    end
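    # Usage sketch (hypothetical book URI and id; the exact URL scheme is
    # whatever Base#load_page expects, and network access is required):
    #
    #   extractor = Biblionet::Extractors::BookExtractor.new
    #   book = extractor.load_and_extract_book('http://www.biblionet.gr/book/103788/')
    #   puts book[:title] unless book.nil?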
    class BookDataExtractor
      attr_reader :nodeset

      def initialize(document)
        # No need to operate on the whole page, just on the part containing the book.
        content_re = /.*/m
        match = content_re.match(document)
        puts document if match.nil?
        content = match[0] unless match.nil?

        # If content is nil, there is something wrong with the HTML, so return nil.
        @nodeset = content.nil? ? nil : Nokogiri::HTML(content)
      end
      def image
        # Cover images live under /covers/; take the first such <img>.
        img_node = @nodeset.xpath("//img[@src[contains(.,'/covers/')]][1]").first
        img_node.nil? ? nil : BASE_URL + img_node[:src]
      end
      def title
        @nodeset.css('h1.book_title').text
      end

      def subtitle
        # The subtitle, when present, is the text node right after the <br>
        # that follows the title.
        subtitle = nil
        @nodeset.xpath("//h1[@class='book_title']").each do |item|
          if item.next_element.name == 'br' and item.next_element.next.name != 'br'
            subtitle = item.next_element.next.text.strip
          end
        end
        subtitle
      end
      def publisher
        publisher_hash = {}
        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/')]]").each do |item|
          publisher_hash[:name] = item.text
          publisher_hash[:b_id] = (item[:href].split("/"))[2]
        end
        publisher_hash
      end
      def contributors
        contributors = []
        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/')]]").each do |item|
          # Keep the role label (e.g. "Μετάφραση:") that precedes a contributor link.
          pre_text = item.previous.text.strip
          contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'

          contributor = {}
          contributor[:name] = item.text
          contributor[:b_id] = (item[:href].split("/"))[2]
          contributors << contributor
        end

        # Alternative approach based on intersecting node sets:
        # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
        # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"
        # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
        #   text = other.inner_text.strip
        #   other = text == "," ? nil : text
        # end.compact

        contributors
      end
      def details
        # Splits the inner HTML of a .book_details block into one detail per line.
        split_details = lambda do |node|
          next nil if node.nil?
          node.inner_html
              .gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "\n")
              .split("\n")
              .map(&:strip)
              .reject(&:empty?)
        end

        details = split_details.call(@nodeset.css('.book_details')[0])
        # Some pages keep the details in a second .book_details block.
        details = split_details.call(@nodeset.css('.book_details')[1]) if details.nil? or details.empty?
        details
      end
      def description
        desc = @nodeset.css('p').last.inner_html # .to_s.gsub(/\n/, '\\n')
        desc = Sanitize.clean(desc, elements: ['br'])
        # Reject descriptions that contain no real words.
        if (desc =~ /\p{Word}{3,}/).nil?
          return nil
        else
          return desc
        end
      end
      def ddcs
        @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/')]]")
      end

      def collective_work?
        @nodeset.at_css('h1.book_title').parent.text.include?('Συλλογικό έργο')
      end
      # Special case in which there is no author but there are contributors.
      def has_contributors_but_no_authors?
        node_start = "//h1[@class='book_title']/following::text()"
        node_end = "//a[@class='booklink' and @href[contains(.,'/author/')]][1]/preceding::text()"
        # The text between the title and the first author link; a trailing
        # colon means it is a role label, i.e. contributors without an author.
        between = (@nodeset.xpath(node_start) & @nodeset.xpath(node_end)).text.strip
        !between.empty? && between.end_with?(':')
      end
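      # Each award is returned as a hash with a name and a digits-only year,
      # e.g. (hypothetical): [{ name: "Βραβείο Αναγνωστών", year: "2009" }].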
      def awards
        awards = []
        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward')]]").each do |item|
          award = { name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '') }
          awards << award
        end
        return awards
      end
    end
    # Raised when a book has no image.
    class NoImageError < StandardError
      attr_reader :biblionet_id

      def initialize(biblionet_id)
        @biblionet_id = biblionet_id
        msg = "This book has no image. At book #{biblionet_id}"
        super(msg)
      end
    end
  end
end