Sha256: 848479c31d659b56b1c5c61f5714cb3f916c60931f67ad9761c93fc36393aec8
Contents?: true
Size: 1.61 KB
Versions: 3
Compression:
Stored size: 1.61 KB
Contents
module Spieker
  # Crawls a site starting from a single URL: scrapes the start page with
  # LinkScraper, then keeps scraping every newly discovered link until no
  # unvisited links remain. Each link is scraped and tracked at most once.
  class Crawler
    # @param url [String] the page the crawl starts from
    # @param verbose [Boolean] when true, progress and results are printed with puts
    # @param lang [String] forwarded unchanged to every LinkScraper
    def initialize(url, verbose: false, lang: 'en')
      @url = url
      @tracked_links = []
      @verbose = verbose
      @lang = lang
    end

    # Scrapes the start URL, crawls everything reachable from it and, when
    # verbose, prints a summary of all tracked links.
    def crawl!
      report "Starting to crawl on #{@url}"
      scraper = LinkScraper.new(@url, lang: @lang)
      track_link(@url)
      links = scraper.result
      # The start page commonly links back to itself and repeats links;
      # drop both so no page is scraped twice.
      recursively_crawl(select_untracked_links(links).uniq)
      print_results if @verbose
    end

    # @return [String] the path component of the start URL
    def current_path
      URI.parse(@url).path
    end

    private

    # Scrapes every link in +links+, then recurses into the links found on
    # those pages that have not been tracked yet.
    def recursively_crawl(links)
      found = []
      links.each do |link|
        # Guards against a link appearing more than once in the same batch.
        next if tracked?(link)

        report "Crawling page #{link}"
        page_links = LinkScraper.new(link, lang: @lang).result
        track_link(link)
        report "Finished page #{link}, #{page_links.length} links found"
        found << page_links
      end

      new_links = select_untracked_links(found.flatten.uniq)
      report "Recursively crawling #{new_links.length} links ..."
      report "NEW LINKS FOUND: \n#{new_links.join("\n")}"
      report "TRACKED LINKS: \n#{@tracked_links.join("\n")}"
      recursively_crawl(new_links) if new_links.any?
    end

    # Returns the entries of +links+ that have not been tracked yet.
    def select_untracked_links(links)
      links.reject { |link| tracked?(link) }
    end

    # Whether +link+ has already been recorded as visited.
    def tracked?(link)
      @tracked_links.include?(link)
    end

    # Records +link+ as visited; recording the same link again is a no-op.
    def track_link(link)
      @tracked_links << link unless tracked?(link)
      @tracked_links
    end

    # Prints +text+ only in verbose mode.
    def report(text)
      puts text if @verbose
    end

    # Prints the number of tracked pages followed by the full list.
    def print_results
      report "\n\n:::RESULTS:::\n\n"
      report "Pages found #{@tracked_links.uniq.compact.length}\n\n"
      report "All links found:\n\n #{@tracked_links.compact.join("\n")}"
    end
  end
end
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
spieker-0.0.10 | lib/spieker/crawler.rb |
spieker-0.0.9 | lib/spieker/crawler.rb |
spieker-0.0.8 | lib/spieker/crawler.rb |