lib/spidr/agent.rb in spidr-0.1.5 vs lib/spidr/agent.rb in spidr-0.1.6

- old
+ new

@@ -21,12 +21,15 @@ # Delay in between fetching pages attr_accessor :delay # History containing visited URLs - attr_accessor :history + attr_reader :history + # List of unreachable URLs + attr_reader :failures + # # Creates a new Agent object with the given _options_ and _block_. # If a _block_ is given, it will be passed the newly created # Agent object. # @@ -68,16 +71,18 @@ :accept => options[:exts], :reject => options[:ignore_exts] ) @every_url_blocks = [] + @every_failed_url_blocks = [] @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] } @every_page_blocks = [] @delay = (options[:delay] || 0) @history = [] + @failures = [] @queue = [] if options[:host] visit_hosts_like(options[:host]) end @@ -285,11 +290,11 @@ # # Adds the given _pattern_ to the ignore_exts. If a _block_ is given, # it will be added to the ignore_exts. # - def ignore_exts_like(&block) + def ignore_exts_like(pattern=nil,&block) if pattern ignore_exts << pattern elsif block ignore_exts << block end @@ -305,10 +310,19 @@ @every_url_blocks << block return self end # + # For every URL that the agent is unable to visit, it will be passed + # to the specified _block_. + # + def every_failed_url(&block) + @every_failed_url_blocks << block + return self + end + + # # For every URL that the agent visits and matches the specified # _pattern_, it will be passed to the specified _block_. # def urls_like(pattern,&block) @urls_like_blocks[pattern] << block @@ -323,14 +337,24 @@ @every_page_blocks << block return self end # + # Clears the history of the agent. + # + def clear + @queue.clear + @history.clear + @failures.clear + return self + end + + # # Clear the history and start spidering at the specified _url_. # def start_at(url) - @history.clear + clear return run(url) end # # Start spidering at the specified _url_. @@ -364,18 +388,30 @@ # # Returns +true+ if the specified _url_ was visited, returns +false+ # otherwise. # def visited?(url) - if url.kind_of?(URI) - return @history.include?(url) - else - return @history.include?(URI(url).to_s) + unless url.kind_of?(URI) + url = URI(url) end + + return @history.include?(url) end # + # Returns +true+ if the specified _url_ was unable to be visited, + # returns +false+ otherwise. + # + def failed?(url) + unless url.kind_of?(URI) + url = URI(url) + end + + return @failures.include?(url) + end + + # # Creates a new Page object from the specified _url_. If a _block_ is # given, it will be passed the newly created Page object. # def get_page(url,&block) host = url.host @@ -390,20 +426,25 @@ proxy_host = @proxy[:host] proxy_port = @proxy[:port] proxy_user = @proxy[:user] proxy_password = @proxy[:password] - Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess| - headers = {} + begin + Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess| + headers = {} - headers['User-Agent'] = @user_agent if @user_agent - headers['Referer'] = @referer if @referer + headers['User-Agent'] = @user_agent if @user_agent + headers['Referer'] = @referer if @referer - new_page = Page.new(url,sess.get(path,headers)) + new_page = Page.new(url,sess.get(path,headers)) - block.call(new_page) if block - return new_page + block.call(new_page) if block + return new_page + end + rescue SystemCallError, Net::HTTPBadResponse + failed(url) + return nil end end protected @@ -446,10 +487,54 @@ def dequeue @queue.shift end # + # Returns +true+ if the specified _url_ should be visited, based on + # it's scheme, returns +false+ otherwise. + # + def visit_scheme?(url) + if url.scheme + return SCHEMES.include?(url.scheme) + else + return true + end + end + + # + # Returns +true+ if the specified _url_ should be visited, based on + # the host of the _url_, returns +false+ otherwise. + # + def visit_host?(url) + @host_rules.accept?(url.host) + end + + # + # Returns +true+ if the specified _url_ should be visited, based on + # the port of the _url_, returns +false+ otherwise. + # + def visit_port?(url) + @port_rules.accept?(url.port) + end + + # + # Returns +true+ if the specified _url_ should be visited, based on + # the pattern of the _url_, returns +false+ otherwise. + # + def visit_link?(url) + @link_rules.accept?(url.to_s) + end + + # + # Returns +true+ if the specified _url_ should be visited, based on + # the file extension of the _url_, returns +false+ otherwise. + # + def visit_ext?(url) + @ext_rules.accept?(File.extname(url.path)[1..-1]) + end + + # # Returns +true+ if the specified URL should be visited, returns # +false+ otherwise. # def visit?(url) (!(visited?(url)) && @@ -475,31 +560,20 @@ block.call(page) if block end end - def visit_scheme?(url) - if url.scheme - return SCHEMES.include?(url.scheme) - else - return true + # + # Adds the specified _url_ to the failures list. + # + def failed(url) + unless url.kind_of?(URI) + url = URI(url.to_s) end - end - def visit_host?(url) - @host_rules.accept?(url.host) - end - - def visit_port?(url) - @port_rules.accept?(url.port) - end - - def visit_link?(url) - @link_rules.accept?(url.to_s) - end - - def visit_ext?(url) - @ext_rules.accept?(File.extname(url.path)[1..-1]) + @every_failed_url_blocks.each { |block| block.call(url) } + @failures << url + return true end end end