Sha256: e01c5629be85b122acb83b15533992bba8195fd6cd639e49490bca0bd23d64d8
Contents?: true
Size: 1.54 KB
Versions: 2
Compression:
Stored size: 1.54 KB
Contents
# Author:: Robert Dormer (mailto:rdormer@gmail.com)
# Copyright:: Copyright (c) 2016 Robert Dormer
# License:: MIT

require 'bloom-filter'
require 'exclusion'

module Spider
  # Queue of URLs waiting to be visited.
  #
  # URLs that have already been visited are remembered in a BloomFilter and
  # are not yielded again. When a robots.txt body is supplied, URLs that its
  # ExclusionParser reports as excluded are skipped as well.
  class VisitQueue
    # Raised by #stop and rescued by #visit_each to end iteration early.
    # It inherits from Exception rather than StandardError, so a bare
    # `rescue` inside the visiting block does not catch it.
    class IterationExit < Exception; end

    # Number of URLs that have been yielded by #visit_each so far.
    attr_accessor :visit_count

    # The ExclusionParser built from the robots text, or nil when no robots
    # text was given to the constructor.
    attr_accessor :robot_txt

    # robots:: robots.txt text handed to ExclusionParser; nil disables
    #          exclusion checks.
    # agent::  second argument handed to ExclusionParser (only used when
    #          +robots+ is given).
    # finish:: optional callable invoked with no arguments once #visit_each
    #          has finished iterating.
    def initialize(robots=nil, agent=nil, finish=nil)
      @visited = BloomFilter.new(size: 10_000, error_rate: 0.001)
      @robot_txt = ExclusionParser.new(robots, agent) if robots
      @finalize = finish
      @visit_count = 0
      @pending = []
    end

    # Pops URLs off the queue until it is empty, yielding each one that has
    # not been visited and is not excluded. A URL is recorded as visited and
    # counted only after the block returns, so a URL whose block calls #stop
    # is neither recorded nor counted.
    #
    # The +finish+ callable (if any) runs after the loop ends, whether the
    # queue drained or #stop was called. It does not run if the block raises
    # any other exception.
    def visit_each
      begin
        until @pending.empty?
          url = @pending.pop

          if url_okay(url)
            yield url if block_given?
            @visited.insert(url)
            @visit_count += 1
          end
        end
      rescue IterationExit
        # #stop was called; fall through to the finalizer.
      end

      @finalize.call if @finalize
    end

    # Queues +urls+ (a single URL or an Array of them) at the end of the
    # queue that #visit_each pops from, so they are visited before anything
    # already pending. For an Array, the last element is visited first.
    def push_front(urls)
      add_url(urls) {|u| @pending.push(u)}
    end

    # Queues +urls+ (a single URL or an Array of them) at the opposite end
    # of the queue, so they are visited after everything already pending.
    def push_back(urls)
      add_url(urls) {|u| @pending.unshift(u)}
    end

    # Number of URLs currently pending.
    def size
      @pending.size
    end

    # True when no URLs are pending.
    def empty?
      @pending.empty?
    end

    # Ends the current #visit_each loop. Raises IterationExit, which
    # #visit_each rescues; outside of #visit_each the exception propagates
    # to the caller.
    def stop
      raise IterationExit
    end

    private

    # True when +url+ has not been recorded as visited and is not excluded
    # by the robots rules (when rules were supplied).
    def url_okay(url)
      return false if @visited.include?(url)
      return false if @robot_txt && @robot_txt.excluded?(url)
      true
    end

    # Normalizes +urls+ to an Array, drops nil entries, and yields every URL
    # that has not been recorded as visited.
    def add_url(urls)
      candidates = urls.is_a?(Array) ? urls : [urls]

      # Use the non-mutating #compact: the caller's array must not lose its
      # nil entries as a side effect of queueing it.
      candidates.compact.each do |url|
        yield url unless @visited.include?(url)
      end
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
spiderkit-0.1.1 | lib/queue.rb |
spiderkit-0.1.0 | lib/queue.rb |