Sha256: 3cdec49463f16b0d1f9cb06155d320f562cd303c06ae0d255ef3d12df01ac1be

Contents?: true

Size: 1.42 KB

Versions: 1

Compression:

Stored size: 1.42 KB

Contents

require 'page_by_page/enum'
require 'page_by_page/mutex_enum'
require 'erb'

class PageByPage
  # Page-walking DSL and fetch loop.
  #
  # This module relies on collaborators that are NOT defined here and must be
  # provided by whatever includes it: the methods +parse+, +limit+ and
  # +update_progress+, and the instance variables +@selector+, +@interval+
  # and +@progress+.
  module Fetch

    # Sets the URL template.
    #
    # @param tmpl [String] ERB source; it is evaluated once per page inside
    #   {#_fetch}, where the current page number is available as +n+.
    # @return [ERB] the compiled template
    def url(tmpl)
      @tmpl = ERB.new(tmpl)
    end

    # Sets the +:from+ option handed to the page-number enumerator.
    def from(n)
      @from = n
    end

    # Sets the +:step+ option handed to the page-number enumerator.
    def step(n)
      @step = n
    end

    # Sets how many worker threads {#fetch} spawns. When set to a truthy
    # value, {#fetch} switches to the MutexEnum / parallel path.
    def threads(n)
      @threads = n
    end

    # Turns progress output off. Any arguments are accepted and ignored.
    def no_progress(*_args)
      @progress = nil
    end

    # Walks the pages and collects every selected item.
    #
    # Uses a plain Enum and a single loop unless a thread count was
    # configured, in which case a MutexEnum is shared by several workers.
    # A nil/false thread count is treated as "not configured" instead of
    # crashing on +nil.times+.
    #
    # @return [Array] items from all fetched pages, flattened, with the
    #   never-fetched (nil) page slots removed
    def fetch
      nodes_2d =
        if @threads
          @enum = MutexEnum.new(enum_options)
          parallel_fetch
        else
          @enum = Enum.new(enum_options)
          _fetch
        end
      puts if @progress # terminate the progress line
      nodes_2d.compact.flatten
    end

    protected

    # Fetches pages one after another, taking page numbers from +@enum+.
    #
    # Stops when a page yields no items, when the page number exceeds
    # +limit+, or when +:no_more+ is thrown from within the loop (the thrower
    # is not visible in this module -- presumably +parse+ or the enumerator).
    #
    # NOTE: the local variable names below are visible to the user's ERB
    # template through +binding+ (+n+ in particular), so they are kept as is.
    #
    # @return [Array] sparse array indexed by page number; slots for pages
    #   that were not fetched by this call stay nil
    def _fetch
      # Seed with a non-empty array so the loop body runs at least once.
      items, pages = [nil], []
      catch :no_more do
        until items.empty?
          n = @enum.next
          break if n > limit

          url = @tmpl.result(binding)
          doc = parse(url)
          items = doc.css(@selector)
          pages[n] = items

          update_progress(Thread.current, n) if @progress
          sleep(@interval) if @interval
        end
      end
      pages
    end

    # Runs {#_fetch} in +@threads+ worker threads that share +@enum+, then
    # merges the per-thread sparse arrays into one array indexed by page
    # number.
    #
    # +Thread#value+ joins the worker and re-raises any exception it died
    # with, so failures surface in the caller.
    #
    # @return [Array] merged sparse array of per-page items
    def parallel_fetch
      workers = @threads.times.map { Thread.new { _fetch } }
      workers.each_with_object([]) do |worker, pages|
        worker.value.each_with_index do |items, i|
          pages[i] = items if items
        end
      end
    end

    # Options passed to Enum / MutexEnum. Values are nil when the
    # corresponding DSL method was never called.
    def enum_options
      { from: @from, step: @step }
    end

  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
page_by_page-0.1.10 lib/page_by_page/fetch.rb