Sha256: 230ec6b1c7a1ad92e19e7a8bd53ca9355f1cb711b37f172e648f57ace222f0b7
Contents?: true
Size: 1.56 KB
Versions: 3
Compression:
Stored size: 1.56 KB
Contents
module Spieker
  # Crawls a site level by level, starting from a single URL.
  #
  # Every page is handed to LinkScraper (defined elsewhere in the project;
  # its #result is presumably the collection of links found on that page —
  # confirm against LinkScraper). Each URL is fetched at most once; the URLs
  # that have been visited are kept, in visiting order, in @tracked_links.
  class Crawler
    # @param url [String] the page the crawl starts from
    # @param verbose [Boolean] when true, progress and the final summary are
    #   written with `puts`; when false the crawl is silent
    def initialize(url, verbose: false)
      @url = url
      @tracked_links = []
      @verbose = verbose
    end

    # Scrapes the start page, then keeps following newly discovered links
    # until a level yields no untracked link.
    #
    # @return [nil]
    def crawl!
      report "Starting to crawl on #{@url}"
      scraper = LinkScraper.new(@url)
      track_link(@url)
      # Filter the first batch as well: a start page that links to itself or
      # lists the same link twice must not be fetched more than once.
      recursively_crawl(select_untracked_links(scraper.result))
      print_results if @verbose
    end

    # @return [String] the path component of the start URL
    def current_path
      URI.parse(@url).path
    end

    private

    # Scrapes every link of the current level, then recurses into the links
    # those pages exposed that have not been visited yet. The recursion depth
    # equals the number of levels crawled, not the number of pages.
    #
    # @param links [Array] URLs of the current level, expected to be unique
    #   and untracked
    def recursively_crawl(links)
      discovered = links.flat_map do |link|
        report "Crawling page #{link}"
        page_links = LinkScraper.new(link).result
        track_link(link)
        report "Finished page #{link}, #{page_links.length} links found"
        page_links
      end

      new_links = select_untracked_links(discovered)
      report "Recursively crawling #{new_links.length} links ..."
      report "NEW LINKS FOUND: \n#{new_links.join("\n")}"
      report "TRACKED LINKS: \n#{@tracked_links.join("\n")}"
      recursively_crawl(new_links) if new_links.any?
    end

    # @param links [Array] candidate URLs
    # @return [Array] the candidates, de-duplicated, minus every URL that has
    #   already been tracked
    def select_untracked_links(links)
      links.uniq - @tracked_links
    end

    # Records +link+ as visited.
    def track_link(link)
      @tracked_links << link
    end

    # Prints +text+ only when the crawler was created with verbose: true.
    def report(text)
      puts text if @verbose
    end

    # Prints the number of distinct pages visited followed by their URLs.
    def print_results
      pages = @tracked_links.compact.uniq
      report "\n\n:::RESULTS:::\n\n"
      report "Pages found #{pages.length}\n\n"
      report "All links found:\n\n #{pages.join("\n")}"
    end
  end
end
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
spieker-0.0.3 | lib/spieker/crawler.rb |
spieker-0.0.2 | lib/spieker/crawler.rb |
spieker-0.0.1 | lib/spieker/crawler.rb |