Sha256: b110efc3c04c920a343c9cfec3f60b544cb63fba3de3fa87e521105da1d6db28

Contents?: true

Size: 1.45 KB

Versions: 5

Compression:

Stored size: 1.45 KB

Contents

require_relative 'base'

module Biblionet
  module Crawlers
    # Crawler for book pages. By default it targets
    # http://www.biblionet.gr/book/ and stores each downloaded page as an
    # .html file under lib/bookshark/storage/html_book_pages.
    #
    # URL generation and iteration come from the inherited +spider+ method
    # (presumably defined in Base - confirm there).
    class BookCrawler < Base
      # Pages shorter than this many characters are not saved.
      MIN_PAGE_LENGTH = 1024

      # Matches the part of the page between the content markers (inclusive).
      CONTENT_RE = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

      # @param options [Hash] crawler settings; any key that is missing or nil
      #   falls back to the default listed below. The hash passed by the
      #   caller is not modified.
      # @option options [String]  :folder   target folder for saved pages
      # @option options [String]  :base_url url prefix of the pages to crawl
      # @option options [String]  :page_type
      # @option options [String]  :extension file extension of saved pages
      # @option options [Boolean] :save_only_content save only the content
      #   section instead of the whole page (default: true)
      # @option options [Integer] :start
      # @option options [Integer] :finish
      # @option options [Integer] :step
      def initialize(options = {})
        # Built per call so every instance gets its own default strings.
        defaults = {
          folder: 'lib/bookshark/storage/html_book_pages',
          base_url: 'http://www.biblionet.gr/book/',
          page_type: 'book',
          extension: '.html',
          save_only_content: true,
          start: 1,
          finish: 10000,
          step: 1000
        }

        # A nil value still falls back to the default (as `||=` did), but an
        # explicit `false` (e.g. save_only_content: false) is now respected.
        merged = defaults.merge(options) do |_key, default, given|
          given.nil? ? default : given
        end

        super(merged)
      end

      # Downloads every url yielded by +spider+ and saves it to the paired
      # file path. Pages that are missing or shorter than MIN_PAGE_LENGTH
      # characters are skipped.
      #
      # When @save_only_content is truthy (presumably set by Base from the
      # :save_only_content option - confirm there) only the section between
      # the content markers is written; pages without the markers are skipped
      # instead of saving nil content.
      def crawl_and_save
        downloader = Extractors::Base.new

        spider do |url_to_download, file_to_save|
          downloader.load_page(url_to_download)

          # Create the target directory (mkdir_p does nothing if it exists).
          FileUtils.mkdir_p(File.dirname(file_to_save))

          page = downloader.page
          next if page.nil? || page.length < MIN_PAGE_LENGTH

          if @save_only_content
            # No need to save the whole page. Just the part containing the book.
            match = CONTENT_RE.match(page)
            downloader.save_to(file_to_save, match[0]) if match
          else
            downloader.save_page(file_to_save)
          end
        end
      end
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
bookshark-1.0.7 lib/bookshark/crawlers/book_crawler.rb
bookshark-1.0.6 lib/bookshark/crawlers/book_crawler.rb
bookshark-1.0.4 lib/bookshark/crawlers/book_crawler.rb
bookshark-1.0.3 lib/bookshark/crawlers/book_crawler.rb
bookshark-1.0.1 lib/bookshark/crawlers/book_crawler.rb