require 'spidr/rules' require 'spidr/page' require 'spidr/spidr' require 'net/http' module Spidr class Agent # Proxy to use attr_accessor :proxy # User-Agent to use attr_accessor :user_agent # Referer to use attr_accessor :referer # Delay in between fetching pages attr_accessor :delay # List of acceptable URL schemes to follow attr_reader :schemes # History containing visited URLs attr_reader :history # List of unreachable URLs attr_reader :failures # Queue of URLs to visit attr_reader :queue # # Creates a new Agent object with the given _options_ and _block_. # If a _block_ is given, it will be passed the newly created # Agent object. # # _options_ may contain the following keys: # :proxy:: The proxy to use while spidering. # :user_agent:: The User-Agent string to send. # :referer:: The referer URL to send. # :delay:: Duration in seconds to pause between spidering each # link. Defaults to 0. # :schemes:: The list of acceptable URL schemes to follow. # Defaults to +http+ and +https+. +https+ URL # schemes will be ignored if net/http # cannot be loaded. # :host:: The host-name to visit. # :hosts:: An +Array+ of host patterns to visit. # :ignore_hosts:: An +Array+ of host patterns to not visit. # :ports:: An +Array+ of port patterns to visit. # :ignore_ports:: An +Array+ of port patterns to not visit. # :links:: An +Array+ of link patterns to visit. # :ignore_links:: An +Array+ of link patterns to not visit. # :exts:: An +Array+ of File extension patterns to visit. # :ignore_exts:: An +Array+ of File extension patterns to not # visit. # :queue:: An initial queue of URLs to visit. # :history:: An initial list of visited URLs. # def initialize(options={},&block) @proxy = (options[:proxy] || Spidr.proxy) @user_agent = (options[:user_agent] || Spidr.user_agent) @referer = options[:referer] @schemes = [] if options[:schemes] @schemes += options[:schemes] else @schemes << 'http' begin require 'net/https' @schemes << 'https' rescue Gem::LoadError => e raise(e) rescue ::LoadError STDERR.puts "Warning: cannot load 'net/https', https support disabled" end end @host_rules = Rules.new( :accept => options[:hosts], :reject => options[:ignore_hosts] ) @port_rules = Rules.new( :accept => options[:ports], :reject => options[:ignore_ports] ) @link_rules = Rules.new( :accept => options[:links], :reject => options[:ignore_links] ) @ext_rules = Rules.new( :accept => options[:exts], :reject => options[:ignore_exts] ) @every_url_blocks = [] @every_failed_url_blocks = [] @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] } @every_page_blocks = [] @delay = (options[:delay] || 0) @history = [] @failures = [] @queue = [] @paused = true if options[:host] visit_hosts_like(options[:host]) end if options[:queue] self.queue = options[:queue] end if options[:history] self.history = options[:history] end block.call(self) if block end # # Creates a new Agent object with the given _options_ and will begin # spidering at the specified _url_. If a _block_ is given it will be # passed the newly created Agent object, before the agent begins # spidering. # def self.start_at(url,options={},&block) self.new(options) do |spider| block.call(spider) if block spider.start_at(url) end end # # Creates a new Agent object with the given _options_ and will begin # spidering the specified host _name_. If a _block_ is given it will be # passed the newly created Agent object, before the agent begins # spidering. # def self.host(name,options={},&block) self.new(options.merge(:host => name)) do |spider| block.call(spider) if block spider.start_at("http://#{name}/") end end # # Creates a new Agent object with the given _options_ and will begin # spidering the host of the specified _url_. If a _block_ is given it # will be passed the newly created Agent object, before the agent # begins spidering. # def self.site(url,options={},&block) url = URI(url.to_s) return self.new(options.merge(:host => url.host)) do |spider| block.call(spider) if block spider.start_at(url) end end # # Returns the +Array+ of host patterns to visit. # def visit_hosts @host_rules.accept end # # Adds the given _pattern_ to the visit_hosts. If a _block_ is given, # it will be added to the visit_hosts. # def visit_hosts_like(pattern=nil,&block) if pattern visit_hosts << pattern elsif block visit_hosts << block end return self end # # Returns the +Array+ of URL host patterns to not visit. # def ignore_hosts @host_rules.reject end # # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given, # it will be added to the ignore_hosts. # def ignore_hosts_like(pattern=nil,&block) if pattern ignore_hosts << pattern elsif block ignore_hosts << block end return self end # # Returns the +Array+ of URL port patterns to visit. # def visit_ports @port_rules.accept end # # Adds the given _pattern_ to the visit_ports. If a _block_ is given, # it will be added to the visit_ports. # def visit_ports_like(pattern=nil,&block) if pattern visit_ports << pattern elsif block visit_ports << block end return self end # # Returns the +Array+ of URL port patterns to not visit. # def ignore_ports @port_rules.reject end # # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given, # it will be added to the ignore_hosts. # def ignore_ports_like(pattern=nil,&block) if pattern ignore_ports << pattern elsif block ignore_ports << block end return self end # # Returns the +Array+ of link patterns to visit. # def visit_links @link_rules.accept end # # Adds the given _pattern_ to the visit_links. If a _block_ is given, # it will be added to the visit_links. # def visit_links_like(pattern=nil,&block) if pattern visit_links << pattern elsif block visit_links << block end return self end # # Returns the +Array+ of link patterns to not visit. # def ignore_links @link_rules.reject end # # Adds the given _pattern_ to the ignore_links. If a _block_ is given, # it will be added to the ignore_links. # def ignore_links_like(pattern=nil,&block) if pattern ignore_links << pattern elsif block ignore_links << block end return self end # # Returns the +Array+ of URL extension patterns to visit. # def visit_exts @ext_rules.accept end # # Adds the given _pattern_ to the visit_exts. If a _block_ is given, # it will be added to the visit_exts. # def visit_exts_like(pattern=nil,&block) if pattern visit_exts << pattern elsif block visit_exts << block end return self end # # Returns the +Array+ of URL extension patterns to not visit. # def ignore_exts @ext_rules.reject end # # Adds the given _pattern_ to the ignore_exts. If a _block_ is given, # it will be added to the ignore_exts. # def ignore_exts_like(pattern=nil,&block) if pattern ignore_exts << pattern elsif block ignore_exts << block end return self end # # For every URL that the agent visits it will be passed to the # specified _block_. # def every_url(&block) @every_url_blocks << block return self end # # For every URL that the agent is unable to visit, it will be passed # to the specified _block_. # def every_failed_url(&block) @every_failed_url_blocks << block return self end # # For every URL that the agent visits and matches the specified # _pattern_, it will be passed to the specified _block_. # def urls_like(pattern,&block) @urls_like_blocks[pattern] << block return self end # # For every Page that the agent visits, pass the page to the # specified _block_. # def every_page(&block) @every_page_blocks << block return self end # # For every Page that the agent visits, pass the headers to the given # _block_. # def all_headers(&block) every_page { |page| block.call(page.headers) } end # # Clears the history of the agent. # def clear @queue.clear @history.clear @failures.clear return self end # # Start spidering at the specified _url_. # def start_at(url) enqueue(url) return continue! end # # Start spidering until the queue becomes empty or the agent is # paused. # def run until (@queue.empty? || @paused == true) visit_page(dequeue) end return self end # # Continue spidering. # def continue! @paused = false return run end # # Returns +true+ if the agent is still spidering, returns +false+ # otherwise. # def running? @paused == false end # # Returns +true+ if the agent is paused, returns +false+ otherwise. # def paused? @paused == true end # # Pauses the agent, causing spidering to temporarily stop. # def pause! @paused = true return self end # # Sets the list of acceptable URL schemes to follow to the # _new_schemes_. # # agent.schemes = ['http'] # def schemes=(new_schemes) @schemes = new_schemes.map { |scheme| scheme.to_s } end # # Sets the history of links that were previously visited to the # specified _new_history_. # # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/'] # def history=(new_history) @history = new_history.map do |url| unless url.kind_of?(URI) URI(url.to_s) else url end end end alias visited_urls history # # Returns the +Array+ of visited URLs. # def visited_links @history.map { |uri| uri.to_s } end # # Return the +Array+ of hosts that were visited. # def visited_hosts @history.map { |uri| uri.host }.uniq end # # Returns +true+ if the specified _url_ was visited, returns +false+ # otherwise. # def visited?(url) url = URI(url) unless url.kind_of?(URI) return @history.include?(url) end # # Returns +true+ if the specified _url_ was unable to be visited, # returns +false+ otherwise. # def failed?(url) url = URI(url) unless url.kind_of?(URI) return @failures.include?(url) end alias pending_urls queue # # Creates a new Page object from the specified _url_. If a _block_ is # given, it will be passed the newly created Page object. # def get_page(url,&block) host = url.host port = url.port unless url.path.empty? path = url.path else path = '/' end proxy_host = @proxy[:host] proxy_port = @proxy[:port] proxy_user = @proxy[:user] proxy_password = @proxy[:password] begin Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess| headers = {} headers['User-Agent'] = @user_agent if @user_agent headers['Referer'] = @referer if @referer new_page = Page.new(url,sess.get(path,headers)) block.call(new_page) if block return new_page end rescue SystemCallError, Net::HTTPBadResponse failed(url) return nil end end # # Returns the agent represented as a Hash containing the agents # +history+ and +queue+ information. # def to_hash {:history => @history, :queue => @queue} end # # Sets the queue of links to visit to the specified _new_queue_. # # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/'] # def queue=(new_queue) @queue = new_queue.map do |url| unless url.kind_of?(URI) URI(url.to_s) else url end end end # # Returns +true+ if the specified _url_ is queued for visiting, returns # +false+ otherwise. # def queued?(url) @queue.include?(url) end # # Enqueues the specified _url_ for visiting, only if it passes all the # agent's rules for visiting a given URL. Returns +true+ if the _url_ # was successfully enqueued, returns +false+ otherwise. # def enqueue(url) link = url.to_s url = URI(link) if (!(queued?(url)) && visit?(url)) @every_url_blocks.each { |block| block.call(url) } @urls_like_blocks.each do |pattern,blocks| if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url) blocks.each { |url_block| url_block.call(url) } end end @queue << url return true end return false end protected # # Dequeues a URL that will later be visited. # def dequeue @queue.shift end # # Returns +true+ if the specified _url_ should be visited, based on # it's scheme, returns +false+ otherwise. # def visit_scheme?(url) if url.scheme return @schemes.include?(url.scheme) else return true end end # # Returns +true+ if the specified _url_ should be visited, based on # the host of the _url_, returns +false+ otherwise. # def visit_host?(url) @host_rules.accept?(url.host) end # # Returns +true+ if the specified _url_ should be visited, based on # the port of the _url_, returns +false+ otherwise. # def visit_port?(url) @port_rules.accept?(url.port) end # # Returns +true+ if the specified _url_ should be visited, based on # the pattern of the _url_, returns +false+ otherwise. # def visit_link?(url) @link_rules.accept?(url.to_s) end # # Returns +true+ if the specified _url_ should be visited, based on # the file extension of the _url_, returns +false+ otherwise. # def visit_ext?(url) @ext_rules.accept?(File.extname(url.path)[1..-1]) end # # Returns +true+ if the specified URL should be visited, returns # +false+ otherwise. # def visit?(url) (!(visited?(url)) && visit_scheme?(url) && visit_host?(url) && visit_port?(url) && visit_link?(url) && visit_ext?(url)) end # # Visits the spedified _url_ and enqueus it's links for visiting. If a # _block_ is given, it will be passed a newly created Page object # for the specified _url_. # def visit_page(url,&block) get_page(url) do |page| @history << page.url page.urls.each { |next_url| enqueue(next_url) } @every_page_blocks.each { |page_block| page_block.call(page) } block.call(page) if block end end # # Adds the specified _url_ to the failures list. # def failed(url) url = URI(url.to_s) unless url.kind_of?(URI) @every_failed_url_blocks.each { |block| block.call(url) } @failures << url return true end end end