lib/bookshark/extractors/bibliographical_book_extractor.rb in bookshark-1.0.0.pre.2 vs lib/bookshark/extractors/bibliographical_book_extractor.rb in bookshark-1.0.1

- old
+ new

@@ -3,35 +3,35 @@ require_relative 'base' module Biblionet - module Extractors - - class BibliographicalBookExtractor < Base + module Extractors + + class BibliographicalBookExtractor < Base attr_reader :bibliographical_book def initialize(uri=nil) - super(uri) - extract_bibliographical_book unless uri.nil? or @page.nil? + super(uri) + extract_bibliographical_book unless uri.nil? or @page.nil? end def load_and_extract_book(uri=nil) load_page(uri) extract_bibliographical_book unless uri.nil? or @page.nil? - end + end - def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page) + def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page) # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+')) log = Logger.new(STDOUT) - + page = BibliographicalBookDataExtractor.new(book_page) # End extraction if BookDataExtractor couldnt create a nodeset return nil if page.nodeset.nil? - bibliographical_book_hash = Hash.new + bibliographical_book_hash = Hash.new extracted_details = page.details bibliographical_book_hash[:original_language] = extracted_details[:original_language] bibliographical_book_hash[:original_title] = extracted_details[:original_title] @@ -45,11 +45,11 @@ bibliographical_book_hash[:format] = extracted_details[:format] bibliographical_book_hash[:publisher] = extracted_details[:publisher] bibliographical_book_hash[:publication] = extracted_details[:publication] - + return @bibliographical_book = bibliographical_book_hash end end @@ -61,18 +61,18 @@ content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m if (content_re.match(document)).nil? puts document end content = content_re.match(document)[0] unless (content_re.match(document)).nil? - + # If content is nil, there is something wrong with the html, so return nil if content.nil? @nodeset = nil else - @nodeset = Nokogiri::HTML(content) - end - end + @nodeset = Nokogiri::HTML(content) + end + end def size size_regex = /\d+x\d+/ end @@ -99,74 +99,96 @@ details_hash[:original_language] = original_language elsif detail.start_with? "Τίτλος πρωτοτύπου:" original_title = detail.gsub(/Τίτλος πρωτοτύπου:/, "").strip details_hash[:original_title] = original_title end - - details_hash[:isbn] = detail[isbn_regex] if detail =~ isbn_regex + details_hash[:isbn] = detail[isbn_regex] if detail =~ isbn_regex + details_hash[:isbn_13] = detail[isbn_13_regex] if detail =~ isbn_13_regex - details_hash[:last_update] = detail[last_update_regex] if detail =~ last_update_regex + details_hash[:last_update] = detail[last_update_regex] if detail =~ last_update_regex - details_hash[:cover_type] = detail[cover_type_regex] if detail =~ cover_type_regex + details_hash[:cover_type] = detail[cover_type_regex] if detail =~ cover_type_regex details_hash[:availability] = detail[availability_regex] if detail =~ availability_regex - details_hash[:price] = detail[price_regex] if detail =~ price_regex - + details_hash[:price] = detail[price_regex] if detail =~ price_regex + end pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text) series_regex = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)* · \d+(?=\))/ series_regex_no_vol = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)*(?=\))/ series_name_regex = /\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= ·)/ series_volume_regex = /(?<=· )\d+/ - physical_size_regex = /\d+x\d+/ + physical_size_regex = /\d+x\d+/ series_hash = {} if pre_details_text =~ series_regex series = pre_details_text[series_regex] series_hash[:name] = series[series_name_regex] if series =~ series_name_regex - series_hash[:volume] = series[series_volume_regex] if series =~ series_volume_regex + series_hash[:volume] = series[series_volume_regex] if series =~ series_volume_regex elsif pre_details_text =~ series_regex_no_vol series = pre_details_text[series_regex_no_vol] series_hash[:name] = series series_hash[:volume] = nil end details_hash[:series] = series_hash - + details_hash[:physical_size] = (pre_details_text =~ physical_size_regex) ? pre_details_text[physical_size_regex] : nil format_regex = /(?<=\[).+(?=\])/ after_title_text = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][1]").first.next_sibling.text.strip format = after_title_text[format_regex] if after_title_text =~ format_regex details_hash[:format] = format.nil? ? 'Βιβλίο' : format publisher_node = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").first - publisher_hash = {} - publisher_hash[:text] = publisher_node.text - publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2] + if !publisher_node.nil? + publisher_hash = {} + publisher_hash[:text] = publisher_node.text + publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2] - pre_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.previous_sibling.text) - after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text) + pre_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.previous_sibling.text) + after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text) - publication_hash = {} - publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/] - publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/] - publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/] + publication_hash = {} + publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/] + publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/] + publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/] - details_hash[:publisher] = publisher_hash - details_hash[:publication] = publication_hash + details_hash[:publisher] = publisher_hash + details_hash[:publication] = publication_hash + else + publisher_node = @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/com/') ]][1]").first + if !publisher_node.nil? + details_hash[:publisher] = { + text: publisher_node.text, + b_id: (publisher_node[:href].split("/"))[2] + } + after_last_author_text = @nodeset + .xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][last()]").last + .next_sibling.text.strip + puts after_last_author_text + details_hash[:publication] = { + year: after_last_author_text[/(?<=: )\d+(?=\.)/], + version: after_last_author_text[/(?<=- )\d+(?=η)/], + place: after_last_author_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/] + } + else + details_hash[:publisher] = {text: nil, b_id: nil} + details_hash[:publication] = {year: nil, version: nil, place: nil} + end + end details_hash end end end -end \ No newline at end of file +end