Sha256: 7a1c4d1be383f8a8d9ca587d4cd5daa6cf4843e42ea61becbc64000a5cf24dc2
Contents?: true
Size: 1.96 KB
Versions: 1
Compression:
Stored size: 1.96 KB
Contents
require 'page_by_page/version'
require 'page_by_page/enum'
require 'page_by_page/mutex_enum'
require 'nokogiri'
require 'open-uri'
require 'erb'

# Fetches a numbered sequence of pages and collects, from each page, the nodes
# matching a CSS selector. Configuration happens in a block that is evaluated
# in the context of the instance, so the DSL methods below are called bare:
#
#   nodes = PageByPage.fetch do
#     url 'https://example.com/list?page=<%= n %>'
#     selector '.item'
#     to 10
#   end
#
# Fetching stops when a page yields no matching nodes, when the page number
# exceeds the configured upper bound, or when the server answers with a 404.
class PageByPage
  class << self
    # Builds an instance configured by +block+ and immediately runs #fetch.
    #
    # @return [Array] matching nodes of all fetched pages, flattened
    def fetch(&block)
      new(&block).fetch
    end
  end

  # @yield configuration block, evaluated with +self+ set to the new instance;
  #   optional, so an instance can also be configured by calling the DSL
  #   methods on it afterwards.
  def initialize(&block)
    @from = 1
    @step = 1
    @to = Float::INFINITY
    @threads = nil
    @progress = {}
    instance_eval(&block) if block
  end

  # Sets the URL template (ERB source). It is rendered with the binding of the
  # fetch loop, where the local variable +n+ holds the current page number.
  def url(tmpl)
    @tmpl = ERB.new(tmpl)
  end

  # Sets the CSS selector applied to every fetched document.
  def selector(sl)
    @selector = sl
  end

  # Sets the +:from+ option handed to the page-number enumerator
  # (presumably the first page number -- see page_by_page/enum).
  def from(n)
    @from = n
  end

  # Sets the +:step+ option handed to the page-number enumerator
  # (presumably the increment between page numbers -- see page_by_page/enum).
  def step(n)
    @step = n
  end

  # Sets the upper bound: fetching stops once the page number exceeds +n+.
  def to(n)
    @to = n
  end

  # Enables threaded fetching with +n+ worker threads.
  def threads(n)
    @threads = n
  end

  # Turns off the progress line printed to standard output.
  def no_progress
    @progress = nil
  end

  # Runs the fetch, sequentially or with worker threads when #threads was set.
  #
  # @return [Array] matching nodes of all fetched pages, flattened
  def fetch
    nodes_2d =
      if @threads
        @enum = MutexEnum.new(options)
        parallel_fetch
      else
        @enum = Enum.new(options)
        _fetch
      end
    puts if @progress # terminate the "\r"-rewritten progress line
    nodes_2d.compact.flatten
  end

  private

  # Pulls page numbers from @enum and fetches pages until one of the stop
  # conditions is met.
  #
  # @return [Array] results indexed by page number; unfetched slots are nil
  def _fetch
    items = [nil] # non-empty placeholder so the loop body runs at least once
    pages = []
    catch :no_more do # #parse throws :no_more on a 404 response
      until items.empty?
        n = @enum.next
        break if n > limit

        # NOTE: the template sees this method's locals through `binding`;
        # `n` must keep its name because URL templates reference it.
        url = @tmpl.result(binding)
        doc = parse(url)
        items = doc.css(@selector)
        pages[n] = items
        update_progress(Thread.current, n) if @progress
      end
    end
    pages
  end

  # Runs #_fetch in @threads threads that all pull from the shared @enum and
  # merges the per-thread results into one array indexed by page number.
  def parallel_fetch
    workers = @threads.times.map { Thread.new { _fetch } }
    workers.each_with_object([]) do |worker, pages|
      # Thread#value joins the thread and re-raises any exception it died with.
      worker.value.each_with_index do |items, i|
        pages[i] = items if items
      end
    end
  end

  # Downloads +url+ and parses the body as HTML.
  #
  # Throws +:no_more+ when the server responds with status 404; any other
  # OpenURI::HTTPError is re-raised unchanged.
  def parse(url)
    Nokogiri::HTML(read_url(url))
  rescue OpenURI::HTTPError => e
    # The message has the form "<code> <reason>"; the reason phrase varies
    # between servers, so only the status code is compared.
    throw :no_more if e.message[/\A\d+/] == '404'
    raise
  end

  # Returns the body found at +url+, closing the underlying handle afterwards.
  # URI.open is used where available because open-uri's Kernel#open override
  # for URLs no longer exists on Ruby 3.0+; older Rubies fall back to it.
  def read_url(url)
    if URI.respond_to?(:open)
      URI.open(url, &:read)
    else
      open(url, &:read)
    end
  end

  # Options passed to the page-number enumerator.
  def options
    { from: @from, step: @step }
  end

  # Upper bound for page numbers; unbounded when none was configured.
  def limit
    @to ||= Float::INFINITY
  end

  # Records the page most recently fetched by +thread+ and redraws the
  # single-line progress display.
  def update_progress(thread, page_num)
    @progress[thread] = page_num
    printf "\r%s => %s", Time.now.strftime('%F %T'), @progress.values.sort
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
page_by_page-0.1.8 | lib/page_by_page.rb |