#!/bin/env ruby # encoding: utf-8 require_relative 'base' module Biblionet module Extractors class PublisherExtractor < Base attr_reader :publisher def initialize(uri=nil) super(uri) extract_publisher unless uri.nil? or @page.nil? end def load_and_extract_publisher(uri=nil) load_page(uri) extract_publisher unless uri.nil? or @page.nil? end def extract_publisher(biblionet_id=@biblionet_id, publisher_page=@page) puts "Extracting publisher: #{biblionet_id}" page = PublisherDataExtractor.new(publisher_page) return nil if page.nodeset.nil? headquarters = page.headquarters bookstores = page.bookstores bookstores['Έδρα'] = headquarters publisher_hash = {} publisher_hash[:name] = page.name publisher_hash[:owner] = page.owner publisher_hash[:bookstores] = bookstores publisher_hash[:b_id] = biblionet_id return @publisher = publisher_hash end end class PublisherDataExtractor attr_reader :nodeset def initialize(document) # No need to operate on whole page. Just on part containing the content. content_re = /.*/m if (content_re.match(document)).nil? puts document end content = content_re.match(document)[0] unless (content_re.match(document)).nil? # If content is nil, there is something wrong with the html, so return nil if content.nil? @nodeset = nil else @nodeset = Nokogiri::HTML(content) end end def name @nodeset.css('h1.page_title').text.strip end def owner return (@nodeset.xpath("//h1[@class='page_title'][1]/following::text()") & @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")).text.strip end def headquarters headquarters_hash = {} temp_array = [] current_key = nil last_key = nil @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |item| key = item.children[0].text.strip current_key = key.end_with?(":") ? key[0..-2] : last_key value = item.children[1].text.strip unless key.empty? and value.empty? if current_key == last_key temp_array << headquarters_hash[current_key] unless headquarters_hash[current_key].is_a?(Array) temp_array << value.gsub(/,$/, '').strip unless value.empty? headquarters_hash[current_key] = temp_array else temp_array = [] headquarters_hash[current_key] = value.gsub(/,$/, '').strip end end last_key = current_key end # Change keys. Use the same as in bookstores. mappings = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website} headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }] headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array) headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if (headquarters_hash[:website] and headquarters_hash[:website].include? ',') return headquarters_hash end def bookstores bookstores_hash = Hash.new { |h,k| h[k] = {} } address_array = [] tel_array = [] # Defaunt key in case there is none. key = 'Βιβλιοπωλείο' @nodeset.css('//p[align="justify"]').inner_html.split('

').map(&:strip).reject(&:empty?).each do |item_group| if item_group.end_with?(":") key = item_group[0..-2] address_array = [] tel_array = [] else if bookstores_hash[key].any? key[-1].to_i key += ((key[-1].to_i > 0) ? (' '+(key[-1].to_i+1).to_s) : ' 2') address_array = [] tel_array = [] end item_group.split('
').each do |item| regex_tel = /\d{3,5} \d{5,7}/ regex_tk = /\d{3} \d{2}/ regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i regex_url = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix if item.end_with?(":") key = item[0..-2] address_array = [] tel_array = [] elsif (item.start_with?("Fax") or item.start_with?("fax")) and item =~ regex_tel bookstores_hash[key][:fax] = item.gsub(/[^\d{3} \d{2}]/, '').strip elsif item =~ regex_tel tel_array << item.gsub(/[^\d{3} \d{2}]/, '').strip bookstores_hash[key][:telephone] = tel_array elsif item =~ regex_tk address_array << item.gsub(/,$/, '').strip bookstores_hash[key][:address] = address_array elsif item =~ regex_email bookstores_hash[key][:email] = (regex_email.match(item))[0] elsif item =~ regex_url bookstores_hash[key][:website] = item[regex_url,1] else address_array << item.gsub(/,$/, '').strip bookstores_hash[key][:address] = address_array end end end end bookstores_hash.delete_if { |k, v| v.empty? } return bookstores_hash end end end end