lib/elsmore/scraper.rb in elsmore-0.1.7 vs lib/elsmore/scraper.rb in elsmore-0.2.0
- old
+ new
@@ -1,49 +1,65 @@
# NOTE(review): this listing is a version-to-version diff, not runnable Ruby.
# Lines starting with "-" belong to elsmore 0.1.7, lines starting with "+"
# belong to elsmore 0.2.0, and unprefixed lines are shared by both. Old and new
# method bodies are interleaved by the diff, so read each version by following
# only its own prefix plus the unprefixed lines.
module Elsmore
# Works through a queue of documents, starting from a seed built from the
# initial URL, and returns the URLs it processed and the URLs it found invalid.
class Scraper
# 0.2.0 adds accessors for the crawl state; its methods below use these
# accessors where 0.1.7 used instance variables directly.
- attr_accessor :emitter
+ attr_accessor :emitter, :unprocessed, :processed, :invalid, :unprocessed_urls, :valid_domains
# Builds the seed Elsmore::Document from initial_url and sets up crawl state:
# valid_domains holds the seed's host, unprocessed holds the seed itself, and
# processed / invalid start empty. 0.2.0 additionally tracks the canonical URLs
# of queued documents in unprocessed_urls.
def initialize initial_url
seed = Elsmore::Document.new(initial_url)
- @valid_domains = [seed.url.host]
- @unprocessed = [seed]
- @processed = []
- @invalid = []
- end
+ self.valid_domains = [seed.url.host]
- def run
- while !@unprocessed.empty?
- document = @unprocessed.shift
- next if @processed.include?(document.url.canonical_url)
- emitter.dot
+ self.unprocessed = [seed]
+ self.unprocessed_urls = [seed.url.canonical_url]
- enqueue(document.links)
- document.rewrite
- document.write!
+ self.processed = []
+ self.invalid = []
+ end
- @processed << document.url.canonical_url
# run: takes documents off the front of the queue until it is empty, then
# returns a hash with :processed and :invalid.
# 0.1.7 did the per-document work inline (skip if already processed, call
# emitter.dot, enqueue links, rewrite, write!, record the canonical URL).
# 0.2.0 delegates the per-document work to the private #process method and no
# longer checks `processed` at dequeue time; duplicates are filtered in
# #enqueue via unprocessed_urls instead.
+ def run
+ while !unprocessed.empty?
+ document = unprocessed.shift
+ process document
end
{
- processed: @processed,
- invalid: @invalid
+ processed: processed,
+ invalid: invalid
}
end
private
# [0.2.0] Handles one dequeued document: logs its canonical URL, hands the
# scraper's emitter to the document, enqueues the document's links, calls
# rewrite and write! on it, then records its canonical URL as processed.
# NOTE(review): String#colorize is not defined in this file; presumably it
# comes from the colorize gem. Confirm against the gemspec.
+ def process document
+ emitter.log(document.url.canonical_url.colorize(:green))
+
+ document.emitter = emitter
+
+ enqueue(document.links)
+ document.rewrite
+ document.write!
+
+ processed << document.url.canonical_url
+ end
+
# Appends linked documents to the queue, skipping any whose URL is invalid,
# whose host is not in valid_domains, or whose canonical URL is already
# processed. 0.2.0 also skips documents whose canonical URL is already queued
# (unprocessed_urls), logs each enqueued URL, and moves the invalid-URL
# bookkeeping into #valid?. The block's `index` argument is not used in the
# lines visible here.
def enqueue links
links.each_with_index do |document, index|
- if !document.url.valid
- emitter.unsure
- @invalid << document.url.raw_url
- next
- end
+ next unless valid?(document)
+ next if !valid_domains.include?(document.url.host)
+ next if processed.include?(document.url.canonical_url)
+ next if unprocessed_urls.include?(document.url.canonical_url)
- next if !@valid_domains.include?(document.url.host)
- next if @processed.include?(document.url.canonical_url)
- @unprocessed << document
+ emitter.log("> Enqueued: #{document.url.canonical_url}")
+
+ unprocessed << document
+ unprocessed_urls << document.url.canonical_url
end
+ end
+
# [0.2.0] Returns document.url.valid. As a side effect, the first time an
# invalid raw URL is seen it is reported through emitter.warning and appended
# to `invalid`; later occurrences of the same raw URL are not reported again.
+ def valid?(document)
+ if !document.url.valid && !self.invalid.include?(document.url.raw_url)
+ emitter.warning("> Invalid URL: #{document.url.raw_url}")
+ invalid << document.url.raw_url
+ end
+ document.url.valid
end
end
end