lib/rawler/base.rb in rawler-0.0.4 vs lib/rawler/base.rb in rawler-0.0.5
- old
+ new
@@ -20,10 +20,12 @@
private
# Validates every link that Rawler::Crawler reports for +current_url+,
# pausing for three seconds after each one.
def validate_links_in_page(current_url)
  crawler = Rawler::Crawler.new(current_url)

  crawler.links.each do |page_url|
    validate_page(page_url)

    # TODO: make the pause between requests a configuration option.
    sleep(3)
  end
end
def validate_page(page_url)
if not_yet_parsed?(page_url)
@@ -37,16 +39,16 @@
write("#{response.code} - #{link}")
responses[link] = { :status => response.code.to_i }
rescue Errno::ECONNREFUSED
write("Connection refused - '#{link}'")
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
write("Connection problems - '#{link}'")
end
# Tells whether +link+ lives on the same host as the crawl root (Rawler.url).
#
# link - String URL, typically taken from a crawled page.
#
# Returns true when both URLs parse and their hosts are equal, false
# otherwise. A URL that cannot be parsed (for example one containing a raw
# space) is treated as "not the same domain" instead of raising, so one
# malformed href cannot abort the whole crawl.
def same_domain?(link)
  URI.parse(Rawler.url).host == URI.parse(link).host
rescue URI::InvalidURIError
  false
end
# True when +responses+ holds nothing (nil) for +link+.
def not_yet_parsed?(link)
  recorded = responses[link]
  recorded.nil?
end
\ No newline at end of file