lib/elsmore/scraper.rb in elsmore-0.1.7 vs lib/elsmore/scraper.rb in elsmore-0.2.0

- old
+ new

@@ -1,49 +1,65 @@
 module Elsmore
   class Scraper
-    attr_accessor :emitter
+    attr_accessor :emitter, :unprocessed, :processed, :invalid, :unprocessed_urls, :valid_domains

     def initialize initial_url
       seed = Elsmore::Document.new(initial_url)
-      @valid_domains = [seed.url.host]
-      @unprocessed = [seed]
-      @processed = []
-      @invalid = []
-    end
+      self.valid_domains = [seed.url.host]

-    def run
-      while !@unprocessed.empty?
-        document = @unprocessed.shift
-        next if @processed.include?(document.url.canonical_url)
-        emitter.dot
+      self.unprocessed = [seed]
+      self.unprocessed_urls = [seed.url.canonical_url]

-        enqueue(document.links)
-        document.rewrite
-        document.write!
+      self.processed = []
+      self.invalid = []
+    end

-        @processed << document.url.canonical_url
+    def run
+      while !unprocessed.empty?
+        document = unprocessed.shift
+        process document
       end

       {
-        processed: @processed,
-        invalid: @invalid
+        processed: processed,
+        invalid: invalid
       }
     end

     private

+    def process document
+      emitter.log(document.url.canonical_url.colorize(:green))
+
+      document.emitter = emitter
+
+      enqueue(document.links)
+      document.rewrite
+      document.write!
+
+      processed << document.url.canonical_url
+    end
+
     def enqueue links
       links.each_with_index do |document, index|
-        if !document.url.valid
-          emitter.unsure
-          @invalid << document.url.raw_url
-          next
-        end
+        next unless valid?(document)
+        next if !valid_domains.include?(document.url.host)
+        next if processed.include?(document.url.canonical_url)
+        next if unprocessed_urls.include?(document.url.canonical_url)

-        next if !@valid_domains.include?(document.url.host)
-        next if @processed.include?(document.url.canonical_url)
-        @unprocessed << document
+        emitter.log("> Enqueued: #{document.url.canonical_url}")
+
+        unprocessed << document
+        unprocessed_urls << document.url.canonical_url
       end
+    end
+
+    def valid?(document)
+      if !document.url.valid && !self.invalid.include?(document.url.raw_url)
+        emitter.warning("> Invalid URL: #{document.url.raw_url}")
+        invalid << document.url.raw_url
+      end
+      document.url.valid
     end
   end
 end