Sha256: 848479c31d659b56b1c5c61f5714cb3f916c60931f67ad9761c93fc36393aec8

Contents?: true

Size: 1.61 KB

Versions: 3

Compression:

Stored size: 1.61 KB

Contents

module Spieker
  # Crawls outward from a single starting URL: every page is handed to
  # LinkScraper, and the links it returns are crawled in turn until no
  # link remains that has not already been visited.
  class Crawler
    # @param url [String] page the crawl starts from
    # @param verbose [Boolean] when true, progress and results are written
    #   to stdout via +puts+
    # @param lang [String] forwarded unchanged to every LinkScraper
    def initialize(url, verbose: false, lang: 'en')
      @url = url
      @tracked_links = []
      @verbose = verbose
      @lang = lang
    end

    # Scrapes the starting URL, then keeps following newly discovered links
    # until none are left. Prints a summary afterwards when verbose.
    #
    # @return [nil]
    def crawl!
      report "Starting to crawl on #{@url}"

      scraper = LinkScraper.new(@url, lang: @lang)
      track_link(@url)
      recursively_crawl(scraper.result)

      print_results if @verbose
    end

    # @return [String] the path component of the starting URL
    def current_path
      URI.parse(@url).path
    end

    private

    # Crawls +links+ level by level: every page of the current level is
    # scraped, then the not-yet-tracked links found on those pages form the
    # next level. Implemented as a loop so that stack depth does not grow
    # with the number of link levels on the site.
    #
    # @param links [Array] links found on the starting page
    def recursively_crawl(links)
      # The starting page commonly links to itself or repeats a link; drop
      # those up front so no page is scraped or tracked twice.
      pending = select_untracked_links(links)

      loop do
        discovered = pending.map { |link| crawl_page(link) }.flatten
        pending = select_untracked_links(discovered)

        report "Recursively crawling #{pending.length} links ..."
        report "NEW LINKS FOUND: \n#{pending.join("\n")}"
        report "TRACKED LINKS: \n#{@tracked_links.join("\n")}"

        break if pending.empty?
      end
    end

    # Scrapes a single page and records it as visited.
    #
    # @param link [String] page to scrape
    # @return [Array] whatever LinkScraper#result returned for that page
    def crawl_page(link)
      report "Crawling page #{link}"

      found = LinkScraper.new(link, lang: @lang).result
      # Tracked only after scraping, so a page whose scrape raises is not
      # recorded as visited.
      track_link(link)

      report "Finished page #{link}, #{found.length} links found"
      found
    end

    # @param links [Array] candidate links, possibly containing duplicates
    # @return [Array] the distinct links that have not been tracked yet,
    #   in first-seen order
    def select_untracked_links(links)
      links.uniq.reject { |link| @tracked_links.include?(link) }
    end

    # Records +link+ as visited.
    def track_link(link)
      @tracked_links << link
    end

    # Writes +text+ to stdout, but only in verbose mode.
    def report(text)
      puts text if @verbose
    end

    # Prints the number of pages visited followed by the full list.
    def print_results
      report "\n\n:::RESULTS:::\n\n"
      report "Pages found #{@tracked_links.uniq.compact.length}\n\n"
      report "All links found:\n\n #{@tracked_links.compact.join("\n")}"
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
spieker-0.0.10 lib/spieker/crawler.rb
spieker-0.0.9 lib/spieker/crawler.rb
spieker-0.0.8 lib/spieker/crawler.rb