Sha256: f73ecc9a177760104898c2e26d91abd8b79be5390bb94da9e750a16a4eabef7e
Contents?: true
Size: 1.6 KB
Versions: 1
Compression:
Stored size: 1.6 KB
Contents
module Spieker
  # Crawls a site level by level (breadth-first): the start URL is scraped
  # for links, every not-yet-visited link is scraped in turn, and the process
  # repeats until no unvisited links remain.
  #
  # Link extraction is delegated to LinkScraper, which is defined elsewhere
  # in the project; this class only relies on `LinkScraper.new(url, lang:)`
  # and `#result`.
  class Crawler
    # @param url [String] page the crawl starts from
    # @param verbose [Boolean] when true, progress and results go to stdout
    # @param lang [String] language option handed to every LinkScraper
    def initialize(url, verbose: false, lang: 'en')
      @url = url
      @verbose = verbose
      @lang = lang
      @tracked_links = []   # visited pages, in visiting order
      @tracked_lookup = {}  # same pages as hash keys, for O(1) membership tests
    end

    # Runs the crawl from the start URL and, in verbose mode, prints a summary.
    #
    # @return [nil]
    def crawl!
      report "Starting to crawl on #{@url}"
      scraper = LinkScraper.new(@url, lang: @lang)
      track_link(@url)
      recursively_crawl(scraper.result)
      print_results if @verbose
    end

    # @return [String] path component of the start URL
    def current_path
      URI.parse(@url).path
    end

    private

    # Crawls the given links and everything reachable from them.
    #
    # Implemented as a loop over crawl levels rather than by recursion, so a
    # site with a long chain of links cannot exhaust the call stack.
    #
    # @param links [Array<String>, nil] links found on an already-visited page
    # @return [nil]
    def recursively_crawl(links)
      pending = select_untracked_links(links)
      until pending.empty?
        found = pending.flat_map { |link| crawl_page(link) }
        pending = select_untracked_links(found)
        report_progress(pending)
      end
      nil
    end

    # Scrapes a single page, marks it as visited and returns its links.
    #
    # @param link [String]
    # @return [Array] links reported by the scraper (empty when it returns nil)
    def crawl_page(link)
      report "Crawling page #{link}"
      # The language must be forwarded here as well; otherwise only the start
      # page would be scraped with the configured language.
      found = Array(LinkScraper.new(link, lang: @lang).result)
      track_link(link)
      report "Finished page #{link}, #{found.length} links found"
      found
    end

    # Prints the per-level progress report. Guarded so the (potentially large)
    # joined strings are only built when they will actually be printed.
    def report_progress(new_links)
      return unless @verbose

      report "Recursively crawling #{new_links.length} links ..."
      report "NEW LINKS FOUND: \n#{new_links.join("\n")}"
      report "TRACKED LINKS: \n#{@tracked_links.join("\n")}"
    end

    # @param links [Array, nil]
    # @return [Array] the distinct, non-nil links that have not been visited yet
    def select_untracked_links(links)
      Array(links).compact.uniq.reject { |link| @tracked_lookup.key?(link) }
    end

    # Records a link as visited; tracking the same link twice is a no-op.
    #
    # @return [Array] all tracked links
    def track_link(link)
      return @tracked_links if @tracked_lookup.key?(link)

      @tracked_lookup[link] = true
      @tracked_links << link
    end

    # Writes +text+ to stdout when the crawler is verbose.
    def report(text)
      puts text if @verbose
    end

    # Prints the final summary of all visited pages.
    def print_results
      report "\n\n:::RESULTS:::\n\n"
      report "Pages found #{@tracked_links.uniq.compact.length}\n\n"
      report "All links found:\n\n #{@tracked_links.compact.join("\n")}"
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
spieker-0.0.7 | lib/spieker/crawler.rb |