Sha256: ca3b6fed21d5dff50db36c479f5aa4a11ae6dfb2445303fe92e98eeb3e5e871a

Contents?: true

Size: 998 Bytes

Versions: 18

Compression:

Stored size: 998 Bytes

Contents

# A basic spider that will follow internal links
#
# Usage example:
#
#   ruby spider.rb example.com

require 'set'
# Load the library relative to this file rather than the current working
# directory, so the example can be launched from anywhere.
require_relative '../lib/metainspector'
puts "Using MetaInspector #{MetaInspector::VERSION}"

# Errors that mean one page could not be fetched or parsed. Any of them should
# make the spider skip that page instead of aborting the whole crawl.
FETCH_ERRORS = [
  MetaInspector::ParserError,
  MetaInspector::RequestError,
  MetaInspector::TimeoutError
].freeze

# queue:   URLs still waiting to be scraped (taken from the end, so the most
#          recently discovered link is scraped next)
# visited: URLs already scraped, in the order they were scraped
# seen:    every URL that has ever been queued; equals visited + queue and
#          gives a constant-time duplicate check
queue   = []
visited = []
seen    = Set.new

# Get the starting URL from the command line, or ask for it
url = ARGV[0]
if url.nil?
  puts "Enter a starting url"
  # $stdin.gets returns nil on EOF; to_s keeps that from raising
  url = $stdin.gets.to_s.strip
end
abort "No starting url given" if url.empty?

# Resolve initial redirections
begin
  page = MetaInspector.new(url)
rescue *FETCH_ERRORS => e
  abort "Couldn't load #{url}: #{e.message}"
end

# Push this initial URL to the queue
start_url = page.url
queue.push(start_url)
seen.add(start_url)

until queue.empty?
  url = queue.pop

  visited.push(url)

  puts "VISITED: #{url}"

  begin
    page = MetaInspector.new(url)

    page.links.internal.each do |link|
      # Set#add? returns nil when the link was already seen, so each URL is
      # queued at most once
      queue.push(link) if seen.add?(link)
    end
  rescue *FETCH_ERRORS
    puts "Couldn't get links from #{url}, skipping"
  end

  puts "#{visited.size} pages visited, #{queue.size} pages on queue\n\n"
end

puts "\nScraping finished, these are the internal links found:\n\n"
puts visited.sort

Version data entries

18 entries across 18 versions & 1 rubygem

Version Path
metainspector-5.15.0 examples/spider.rb
metainspector-5.14.0 examples/spider.rb
metainspector-5.13.1 examples/spider.rb
metainspector-5.13.0 examples/spider.rb
metainspector-5.12.1 examples/spider.rb
metainspector-5.12.0 examples/spider.rb
metainspector-5.11.2 examples/spider.rb
metainspector-5.11.1 examples/spider.rb
metainspector-5.11.0 examples/spider.rb
metainspector-5.10.1 examples/spider.rb
metainspector-5.10.0 examples/spider.rb
metainspector-5.9.0 examples/spider.rb
metainspector-5.8.0 examples/spider.rb
metainspector-5.7.0 examples/spider.rb
metainspector-5.6.0 examples/spider.rb
metainspector-5.5.0 examples/spider.rb
metainspector-5.4.3 examples/spider.rb
metainspector-5.4.2 examples/spider.rb