Sha256: 5ee18061df66a6b2ef129d8493780cad5892c90a6c92d767c669bd45c2b1bd2f

Contents?: true

Size: 932 Bytes

Versions: 3

Compression:

Stored size: 932 Bytes

Contents

require 'uri'

require 'page_by_page/common'

module PageByPage
  # Collects nodes from a chain of pages by repeatedly following a
  # "next page" link.
  #
  # Starting at the URL given to #start, each page is parsed, the nodes
  # matching @selector are collected, and the link matched by the #iterate
  # selector is followed to reach the next page.
  #
  # NOTE(review): +parse+, +limit+ and +update_progress+, as well as the
  # @selector, @progress and @interval instance variables, are not defined in
  # this class; presumably they come from the included Common module --
  # confirm against page_by_page/common.
  class Jump

    include Common

    # Matches strings that begin with a URL scheme followed by "://".
    ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.-]*://}i

    # Sets the URL of the first page to fetch.
    #
    # @param url [String] address of the first page
    def start(url)
      @start = url
    end

    # Sets the CSS selector that locates the link to the following page.
    #
    # @param selector [String] CSS selector passed to +at_css+ on every page
    def iterate(selector)
      @iterate = selector
    end

    # Walks the pages and gathers every node matching @selector.
    #
    # The walk stops when +limit+ pages have been visited, when a page has no
    # element matching the #iterate selector, or when that element carries no
    # usable +href+.
    #
    # @return [Array] the collected nodes, in page order
    def process
      url, items, page_count = @start, [], 0

      loop do
        doc = parse url
        doc.css(@selector).each { |item| items << item }

        page_count += 1
        update_progress Thread.current, page_count if @progress
        break if page_count >= limit

        next_link = doc.at_css(@iterate)
        break unless next_link

        # An element without an href (or with a blank one) cannot be followed;
        # treat it as the end of the chain instead of crashing on nil.
        href = next_link.attr('href')
        break if href.nil? || href.strip.empty?

        # Resolve against the page the link was found on, as a browser would.
        url = concat_host href.strip, url

        sleep @interval if @interval
      end

      # Emits a bare newline once the walk is over when @progress is set.
      puts if @progress
      items
    end

    private

    # Turns the +href+ of a "next" link into a URL that can be fetched.
    #
    # Absolute URLs are returned unchanged; host-relative ("/a/b"),
    # directory-relative ("b") and query-only ("?page=2") references are
    # resolved against +base+ by URI.join.
    #
    # @param path [String] the raw href value
    # @param base [String] URL of the page containing the link
    # @return [String] the resolved URL
    def concat_host(path, base = @start)
      URI.join(base, path).to_s
    rescue URI::Error
      # URI rejects some strings seen in real pages (e.g. unescaped spaces or
      # non-ASCII characters); fall back to plain string manipulation.
      legacy_concat_host path
    end

    # String-based join used when URI cannot parse the inputs.
    #
    # For a host-relative path, everything after the host of @start is
    # dropped; otherwise only the last path segment of @start is dropped. The
    # result is stored in @prefix and joined with +path+.
    #
    # @param path [String] the raw href value
    # @return [String] the joined URL
    def legacy_concat_host(path)
      return path if path =~ ABSOLUTE_URL

      regex = path.start_with?('/') ? /([^:|\/])\/.*/ : /(.*[^:|\/])\/.*/
      @prefix = @start.gsub(regex, '\1')
      File.join @prefix, path
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
page_by_page-0.1.13 lib/page_by_page/jump.rb
page_by_page-0.1.12 lib/page_by_page/jump.rb
page_by_page-0.1.11 lib/page_by_page/jump.rb