examples/link_checker.rb in metainspector-4.0.0 vs examples/link_checker.rb in metainspector-4.1.0

- old
+ new

@@ -5,11 +5,10 @@ # ruby link_checker.rb alazan.com require 'metainspector' class BrokenLinkChecker - attr_reader :broken def initialize(url) @url = url @queue = [] @visited = [] @@ -31,36 +30,30 @@ end private def check - # Resolve initial redirections - page = MetaInspector.new(@url) + # Resolves redirections of initial URL before placing it on the queue + @queue.push(MetaInspector.new(@url).url) - # Push this initial URL to the queue - @queue.push(page.url) + process_next_on_queue while @queue.any? + end - while @queue.any? - url = @queue.pop + def process_next_on_queue + page = MetaInspector.new(@queue.pop, :warn_level => :store) - page = MetaInspector.new(url, :warn_level => :store) + page.links.all.select {|l| l =~ /^http(s)?:\/\//i}.each do |link| + check_status(link, page.url) + end if page.ok? - if page.ok? - # Gets all HTTP links - page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link| - check_status(link, page.url) - end - end + @visited.push(page.url) - @visited.push(page.url) - - page.internal_links.each do |link| - @queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link) - end - - puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links" + page.links.internal.each do |link| + @queue.push(link) if should_be_enqueued?(link) end + + show_stats end # Checks the response status of the linked_url and stores it on the ok or broken collections def check_status(linked_url, from_url) if @broken.keys.include?(linked_url) @@ -74,9 +67,17 @@ else @broken[linked_url] = [from_url] end end end + end + + def should_be_enqueued?(url) + !(@visited.include?(url) || @broken.include?(url) || @queue.include?(url)) + end + + def show_stats + puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links" end # A page is reachable if its response status is less than 400 # In the case of exceptions, like timeouts or server connection errors, # we consider it unreachable