lib/bookshark/extractors/bibliographical_book_extractor.rb in bookshark-1.0.0.pre.2 vs lib/bookshark/extractors/bibliographical_book_extractor.rb in bookshark-1.0.1
- old
+ new
@@ -3,35 +3,35 @@
require_relative 'base'
module Biblionet
- module Extractors
-
- class BibliographicalBookExtractor < Base
+ module Extractors
+
+ class BibliographicalBookExtractor < Base
attr_reader :bibliographical_book
def initialize(uri=nil)
- super(uri)
- extract_bibliographical_book unless uri.nil? or @page.nil?
+ super(uri)
+ extract_bibliographical_book unless uri.nil? or @page.nil?
end
def load_and_extract_book(uri=nil)
load_page(uri)
extract_bibliographical_book unless uri.nil? or @page.nil?
- end
+ end
- def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page)
+ def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page)
# log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
log = Logger.new(STDOUT)
-
+
page = BibliographicalBookDataExtractor.new(book_page)
# End extraction if BookDataExtractor couldnt create a nodeset
return nil if page.nodeset.nil?
- bibliographical_book_hash = Hash.new
+ bibliographical_book_hash = Hash.new
extracted_details = page.details
bibliographical_book_hash[:original_language] = extracted_details[:original_language]
bibliographical_book_hash[:original_title] = extracted_details[:original_title]
@@ -45,11 +45,11 @@
bibliographical_book_hash[:format] = extracted_details[:format]
bibliographical_book_hash[:publisher] = extracted_details[:publisher]
bibliographical_book_hash[:publication] = extracted_details[:publication]
-
+
return @bibliographical_book = bibliographical_book_hash
end
end
@@ -61,18 +61,18 @@
content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
if (content_re.match(document)).nil?
puts document
end
content = content_re.match(document)[0] unless (content_re.match(document)).nil?
-
+
# If content is nil, there is something wrong with the html, so return nil
if content.nil?
@nodeset = nil
else
- @nodeset = Nokogiri::HTML(content)
- end
- end
+ @nodeset = Nokogiri::HTML(content)
+ end
+ end
def size
size_regex = /\d+x\d+/
end
@@ -99,74 +99,96 @@
details_hash[:original_language] = original_language
elsif detail.start_with? "Τίτλος πρωτοτύπου:"
original_title = detail.gsub(/Τίτλος πρωτοτύπου:/, "").strip
details_hash[:original_title] = original_title
end
-
- details_hash[:isbn] = detail[isbn_regex] if detail =~ isbn_regex
+ details_hash[:isbn] = detail[isbn_regex] if detail =~ isbn_regex
+
details_hash[:isbn_13] = detail[isbn_13_regex] if detail =~ isbn_13_regex
- details_hash[:last_update] = detail[last_update_regex] if detail =~ last_update_regex
+ details_hash[:last_update] = detail[last_update_regex] if detail =~ last_update_regex
- details_hash[:cover_type] = detail[cover_type_regex] if detail =~ cover_type_regex
+ details_hash[:cover_type] = detail[cover_type_regex] if detail =~ cover_type_regex
details_hash[:availability] = detail[availability_regex] if detail =~ availability_regex
- details_hash[:price] = detail[price_regex] if detail =~ price_regex
-
+ details_hash[:price] = detail[price_regex] if detail =~ price_regex
+
end
pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text
pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text)
series_regex = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)* · \d+(?=\))/
series_regex_no_vol = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)*(?=\))/
series_name_regex = /\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= ·)/
series_volume_regex = /(?<=· )\d+/
- physical_size_regex = /\d+x\d+/
+ physical_size_regex = /\d+x\d+/
series_hash = {}
if pre_details_text =~ series_regex
series = pre_details_text[series_regex]
series_hash[:name] = series[series_name_regex] if series =~ series_name_regex
- series_hash[:volume] = series[series_volume_regex] if series =~ series_volume_regex
+ series_hash[:volume] = series[series_volume_regex] if series =~ series_volume_regex
elsif pre_details_text =~ series_regex_no_vol
series = pre_details_text[series_regex_no_vol]
series_hash[:name] = series
series_hash[:volume] = nil
end
details_hash[:series] = series_hash
-
+
details_hash[:physical_size] = (pre_details_text =~ physical_size_regex) ? pre_details_text[physical_size_regex] : nil
format_regex = /(?<=\[).+(?=\])/
after_title_text = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][1]").first.next_sibling.text.strip
format = after_title_text[format_regex] if after_title_text =~ format_regex
details_hash[:format] = format.nil? ? 'Βιβλίο' : format
publisher_node = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").first
- publisher_hash = {}
- publisher_hash[:text] = publisher_node.text
- publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2]
+ if !publisher_node.nil?
+ publisher_hash = {}
+ publisher_hash[:text] = publisher_node.text
+ publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2]
- pre_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.previous_sibling.text)
- after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text)
+ pre_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.previous_sibling.text)
+ after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text)
- publication_hash = {}
- publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/]
- publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/]
- publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]
+ publication_hash = {}
+ publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/]
+ publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/]
+ publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]
- details_hash[:publisher] = publisher_hash
- details_hash[:publication] = publication_hash
+ details_hash[:publisher] = publisher_hash
+ details_hash[:publication] = publication_hash
+ else
+ publisher_node = @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/com/') ]][1]").first
+ if !publisher_node.nil?
+ details_hash[:publisher] = {
+ text: publisher_node.text,
+ b_id: (publisher_node[:href].split("/"))[2]
+ }
+ after_last_author_text = @nodeset
+ .xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][last()]").last
+ .next_sibling.text.strip
+ puts after_last_author_text
+ details_hash[:publication] = {
+ year: after_last_author_text[/(?<=: )\d+(?=\.)/],
+ version: after_last_author_text[/(?<=- )\d+(?=η)/],
+ place: after_last_author_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]
+ }
+ else
+ details_hash[:publisher] = {text: nil, b_id: nil}
+ details_hash[:publication] = {year: nil, version: nil, place: nil}
+ end
+ end
details_hash
end
end
end
-end
\ No newline at end of file
+end