lib/spidr/agent.rb in spidr-0.1.5 vs lib/spidr/agent.rb in spidr-0.1.6
- old
+ new
@@ -21,12 +21,15 @@
# Delay in between fetching pages
attr_accessor :delay
# History containing visited URLs
- attr_accessor :history
+ attr_reader :history
+ # List of unreachable URLs
+ attr_reader :failures
+
#
# Creates a new Agent object with the given _options_ and _block_.
# If a _block_ is given, it will be passed the newly created
# Agent object.
#
@@ -68,16 +71,18 @@
:accept => options[:exts],
:reject => options[:ignore_exts]
)
@every_url_blocks = []
+ @every_failed_url_blocks = []
@urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@every_page_blocks = []
@delay = (options[:delay] || 0)
@history = []
+ @failures = []
@queue = []
if options[:host]
visit_hosts_like(options[:host])
end
@@ -285,11 +290,11 @@
#
# Adds the given _pattern_ to the ignore_exts. If no _pattern_ is given
# but a _block_ is, the _block_ will be added to the ignore_exts instead.
#
- def ignore_exts_like(&block)
+ def ignore_exts_like(pattern=nil,&block)
if pattern
ignore_exts << pattern
elsif block
ignore_exts << block
end
@@ -305,10 +310,19 @@
@every_url_blocks << block
return self
end
#
+ # For every URL that the agent is unable to visit, it will be passed
+ # to the specified _block_.
+ #
+ def every_failed_url(&block)
+ @every_failed_url_blocks << block
+ return self
+ end
+
+ #
# For every URL that the agent visits and matches the specified
# _pattern_, it will be passed to the specified _block_.
#
def urls_like(pattern,&block)
@urls_like_blocks[pattern] << block
@@ -323,14 +337,24 @@
@every_page_blocks << block
return self
end
#
+ # Clears the queue, history and failures of the agent.
+ #
+ def clear
+ @queue.clear
+ @history.clear
+ @failures.clear
+ return self
+ end
+
+ #
# Clear the history and start spidering at the specified _url_.
#
def start_at(url)
- @history.clear
+ clear
return run(url)
end
#
# Start spidering at the specified _url_.
@@ -364,18 +388,30 @@
#
# Returns +true+ if the specified _url_ was visited, returns +false+
# otherwise.
#
def visited?(url)
- if url.kind_of?(URI)
- return @history.include?(url)
- else
- return @history.include?(URI(url).to_s)
+ unless url.kind_of?(URI)
+ url = URI(url)
end
+
+ return @history.include?(url)
end
#
+ # Returns +true+ if the specified _url_ could not be visited,
+ # returns +false+ otherwise.
+ #
+ def failed?(url)
+ unless url.kind_of?(URI)
+ url = URI(url)
+ end
+
+ return @failures.include?(url)
+ end
+
+ #
# Creates a new Page object from the specified _url_. If a _block_ is
# given, it will be passed the newly created Page object.
#
def get_page(url,&block)
host = url.host
@@ -390,20 +426,25 @@
proxy_host = @proxy[:host]
proxy_port = @proxy[:port]
proxy_user = @proxy[:user]
proxy_password = @proxy[:password]
- Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
- headers = {}
+ begin
+ Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+ headers = {}
- headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
+ headers['User-Agent'] = @user_agent if @user_agent
+ headers['Referer'] = @referer if @referer
- new_page = Page.new(url,sess.get(path,headers))
+ new_page = Page.new(url,sess.get(path,headers))
- block.call(new_page) if block
- return new_page
+ block.call(new_page) if block
+ return new_page
+ end
+ rescue SystemCallError, Net::HTTPBadResponse
+ failed(url)
+ return nil
end
end
protected
@@ -446,10 +487,54 @@
def dequeue
@queue.shift
end
#
+ # Returns +true+ if the specified _url_ should be visited, based on
+ # its scheme, returns +false+ otherwise.
+ #
+ def visit_scheme?(url)
+ if url.scheme
+ return SCHEMES.include?(url.scheme)
+ else
+ return true
+ end
+ end
+
+ #
+ # Returns +true+ if the specified _url_ should be visited, based on
+ # the host of the _url_, returns +false+ otherwise.
+ #
+ def visit_host?(url)
+ @host_rules.accept?(url.host)
+ end
+
+ #
+ # Returns +true+ if the specified _url_ should be visited, based on
+ # the port of the _url_, returns +false+ otherwise.
+ #
+ def visit_port?(url)
+ @port_rules.accept?(url.port)
+ end
+
+ #
+ # Returns +true+ if the specified _url_ should be visited, based on
+ # the pattern of the _url_, returns +false+ otherwise.
+ #
+ def visit_link?(url)
+ @link_rules.accept?(url.to_s)
+ end
+
+ #
+ # Returns +true+ if the specified _url_ should be visited, based on
+ # the file extension of the _url_, returns +false+ otherwise.
+ #
+ def visit_ext?(url)
+ @ext_rules.accept?(File.extname(url.path)[1..-1])
+ end
+
+ #
# Returns +true+ if the specified URL should be visited, returns
# +false+ otherwise.
#
def visit?(url)
(!(visited?(url)) &&
@@ -475,31 +560,20 @@
block.call(page) if block
end
end
- def visit_scheme?(url)
- if url.scheme
- return SCHEMES.include?(url.scheme)
- else
- return true
+ #
+ # Adds the specified _url_ to the failures list.
+ #
+ def failed(url)
+ unless url.kind_of?(URI)
+ url = URI(url.to_s)
end
- end
- def visit_host?(url)
- @host_rules.accept?(url.host)
- end
-
- def visit_port?(url)
- @port_rules.accept?(url.port)
- end
-
- def visit_link?(url)
- @link_rules.accept?(url.to_s)
- end
-
- def visit_ext?(url)
- @ext_rules.accept?(File.extname(url.path)[1..-1])
+ @every_failed_url_blocks.each { |block| block.call(url) }
+ @failures << url
+ return true
end
end
end