Sha256: 79ae0b551f6be150be79949935d163ad55195f1a0df6927c8c246ea8c6326b71

Contents?: true

Size: 1.02 KB

Versions: 7

Compression:

Stored size: 1.02 KB

Contents

module Elsmore
  # Crawls outward from a seed document, handling every linked document whose
  # URL host matches the seed's host.
  #
  # For each document taken off the queue the scraper enqueues its links, then
  # calls `rewrite` and `write!` on it. Links whose URL reports itself as not
  # valid are collected separately instead of being followed.
  class Scraper
    # Optional progress reporter. When assigned, it receives `dot` once per
    # processed document and `unsure` once per link with an invalid URL.
    # Leaving it unset (nil) simply disables progress reporting.
    attr_accessor :emitter

    # @param initial_url value passed to Elsmore::Document.new to build the
    #   seed document; the seed's URL host becomes the only allowed host.
    def initialize(initial_url)
      seed = Elsmore::Document.new(initial_url)

      @valid_domains = [seed.url.host]
      @unprocessed = [seed]
      @processed = []
      # Index of everything in @processed, so the "already handled?" check is
      # a hash lookup instead of a scan over an ever-growing array.
      # NOTE(review): assumes canonical_url values hash consistently with ==
      # (true for Strings) - confirm against Elsmore's URL class.
      @processed_lookup = {}
      @invalid = []
    end

    # Drains the queue of unprocessed documents.
    #
    # @return [Hash] `:processed` => canonical URLs in the order they were
    #   handled, `:invalid` => raw URLs of links that were not valid (one
    #   entry per occurrence).
    def run
      until @unprocessed.empty?
        document = @unprocessed.shift
        # The same URL can be queued several times before its first visit
        # completes; later copies are dropped here.
        next if processed?(document.url.canonical_url)

        emitter.dot if emitter

        enqueue(document.links)
        document.rewrite
        document.write!

        mark_processed(document.url.canonical_url)
      end

      {
        processed: @processed,
        invalid: @invalid
      }
    end

    private

    # Sorts freshly discovered links: invalid URLs are recorded, links on
    # other hosts or already processed are ignored, the rest are queued.
    def enqueue(links)
      links.each do |document|
        unless document.url.valid
          emitter.unsure if emitter
          @invalid << document.url.raw_url
          next
        end

        next unless @valid_domains.include?(document.url.host)
        next if processed?(document.url.canonical_url)

        @unprocessed << document
      end
    end

    # True when a document with this canonical URL has already been handled.
    def processed?(canonical_url)
      @processed_lookup.key?(canonical_url)
    end

    # Records a handled canonical URL in both the ordered list that `run`
    # returns and the lookup index.
    def mark_processed(canonical_url)
      @processed << canonical_url
      @processed_lookup[canonical_url] = true
    end
  end
end

Version data entries

7 entries across 7 versions & 1 rubygems

Version Path
elsmore-0.1.7 lib/elsmore/scraper.rb
elsmore-0.1.6 lib/elsmore/scraper.rb
elsmore-0.1.5 lib/elsmore/scraper.rb
elsmore-0.1.4 lib/elsmore/scraper.rb
elsmore-0.1.3 lib/elsmore/scraper.rb
elsmore-0.1.2 lib/elsmore/scraper.rb
elsmore-0.1.1 lib/elsmore/scraper.rb