Sha256: 7a1c4d1be383f8a8d9ca587d4cd5daa6cf4843e42ea61becbc64000a5cf24dc2

Contents?: true

Size: 1.96 KB

Versions: 1

Compression:

Stored size: 1.96 KB

Contents

require 'page_by_page/version'
require 'page_by_page/enum'
require 'page_by_page/mutex_enum'
require 'nokogiri'
require 'open-uri'
require 'erb'

# Fetches a numbered sequence of pages built from an ERB URL template and
# collects the nodes that match a CSS selector on each page.
#
# Configuration is done through a small DSL that is instance_eval'ed against
# the new object:
#
#   nodes = PageByPage.fetch do
#     url 'https://example.com/list?page=<%= n %>'
#     selector '.item'
#     from 1
#     to 10
#   end
#
# Inside the URL template the current page number is available as +n+.
class PageByPage

  class << self
    # Builds an instance configured by +block+ and immediately runs #fetch.
    #
    # @return [Array] the flattened list of selected nodes
    def fetch(&block)
      new(&block).fetch
    end
  end

  # @yield optional DSL block evaluated in the context of the new instance.
  #   Without a block the instance keeps its defaults and can be configured
  #   by calling the DSL methods directly.
  def initialize(&block)
    @from, @step, @to = 1, 1, Float::INFINITY
    @progress = {}
    # instance_eval raises ArgumentError when handed a nil block.
    instance_eval(&block) if block
  end

  # Sets the ERB template used to build each page URL.
  def url(tmpl)
    @tmpl = ERB.new(tmpl)
  end

  # Sets the CSS selector applied to every fetched document.
  def selector(sl)
    @selector = sl
  end

  # Sets the first page number (default 1).
  def from(n)
    @from = n
  end

  # Sets the increment between page numbers (default 1).
  def step(n)
    @step = n
  end

  # Sets the last page number, inclusive (default: unbounded).
  def to(n)
    @to = n
  end

  # Enables parallel fetching with +n+ worker threads.
  def threads(n)
    @threads = n
  end

  # Disables the progress line printed while fetching.
  def no_progress
    @progress = nil
  end

  # Runs the crawl, serially or with worker threads when #threads was called.
  #
  # A worker stops when a page yields no matching nodes, when the server
  # answers 404, or when the page number exceeds the configured limit.
  #
  # @return [Array] selected nodes of all fetched pages, flattened, ordered
  #   by page number
  # @raise [OpenURI::HTTPError] for any HTTP error other than 404
  def fetch
    # `**options` works whether the enumerators take keyword arguments or a
    # single options hash (their definitions live in other files).
    nodes_2d =
      if defined?(@threads)
        @enum = MutexEnum.new(**options)
        parallel_fetch
      else
        @enum = Enum.new(**options)
        _fetch
      end
    puts if @progress # terminate the "\r"-rewritten progress line
    nodes_2d.compact.flatten
  end

  private

  # Pulls page numbers from @enum until a stop condition is hit.
  #
  # NOTE: the URL template is evaluated against this method's binding, so the
  # local variable names here (in particular +n+) are visible to templates
  # and must not be renamed.
  #
  # @return [Array] sparse array mapping page number => selected nodes
  def _fetch
    items, pages = [nil], []
    catch :no_more do
      until items.empty?
        n = @enum.next
        break if n > limit

        url = @tmpl.result(binding)
        doc = parse(url)
        items = doc.css(@selector)
        pages[n] = items
        update_progress(Thread.current, n) if @progress
      end
    end
    pages
  end

  # Runs #_fetch in @threads worker threads sharing @enum and merges the
  # per-thread sparse arrays into a single one indexed by page number.
  def parallel_fetch
    workers = Array.new(@threads) { Thread.new { _fetch } }
    workers.each_with_object([]) do |worker, pages|
      # Thread#value joins the thread and re-raises anything it raised.
      worker.value.each_with_index do |items, i|
        pages[i] = items if items
      end
    end
  end

  # Downloads +url+ and parses it as HTML.
  #
  # Uses URI.open rather than Kernel#open: the latter no longer opens URLs on
  # Ruby 3.0+ and would spawn a subprocess for strings starting with "|".
  # The block form closes the underlying IO once the body has been read.
  #
  # Throws :no_more on a 404 response; the status code prefix is checked so
  # that servers sending a non-standard reason phrase are still recognised.
  def parse(url)
    URI.open(url) { |page| Nokogiri::HTML(page.read) }
  rescue OpenURI::HTTPError => e
    raise unless e.message.start_with?('404')

    throw :no_more
  end

  # Options handed to the page-number enumerator.
  def options
    { from: @from, step: @step }
  end

  # Upper bound for page numbers; falls back to infinity when `to nil` was set.
  def limit
    @to ||= Float::INFINITY
  end

  # Records the page +thread+ just finished and redraws the progress line.
  def update_progress(thread, page_num)
    @progress[thread] = page_num
    printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
  end

end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
page_by_page-0.1.8 lib/page_by_page.rb