Sha256: 4d33acd0822738c157a6a43a7951d58ea8febb98e9760b568103224d5a2e4694

Contents?: true

Size: 1.44 KB

Versions: 4

Compression:

Stored size: 1.44 KB

Contents

module EmailCrawler
  # Collects same-site links by crawling outward from a starting URL.
  #
  # Pages are fetched with `get`, which is not defined in this file
  # (presumably supplied by MechanizeHelper, along with `new_agent` --
  # confirm against that module). A falsy result from `get` is treated as
  # "page could not be fetched".
  class PageLinks
    # Default upper bound on the number of links returned by #fetch_links.
    MAX_LINKS  = 100
    # Seconds to pause between successful page fetches.
    SLEEP_TIME = 0.5

    include MechanizeHelper

    # @param url [String] absolute URL the crawl starts from; its scheme,
    #   host and (non-default) port define which links count as on-site.
    def initialize(url)
      @url = url
      @domain = same_site_pattern(url)
      @logger = ::Logger.new(STDOUT).tap do |logger|
        # DEBUG (not INFO) so the `debug` message in #fetch_links is actually
        # emitted when the DEBUG environment variable is set.
        logger.level = ENV["DEBUG"] ? Logger::DEBUG : Logger::ERROR
      end
    end

    # Convenience wrapper: builds a crawler for +url+ and runs it.
    #
    # @param url [String] starting URL
    # @param max_links [Integer] see #fetch_links
    # @return [Array<String>] see #fetch_links
    def self.for(url, max_links = MAX_LINKS)
      new(url).fetch_links(max_links)
    end

    # Crawls from the starting URL, following links whose href matches the
    # site pattern, in the order they were discovered.
    #
    # The starting URL is always part of the result, even if fetching it
    # fails; every other entry is a URL that was fetched successfully.
    # Each URL is requested at most once per call.
    #
    # @param max_links [Integer] stop once this many links were collected;
    #   values below 1 stop after the first successful fetch.
    # @return [Array<String>] collected links
    def fetch_links(max_links = MAX_LINKS)
      queue     = Set.new([@url])
      links     = Set.new([@url])
      attempted = Set.new

      until queue.empty?
        current_link = queue.first
        queue.delete(current_link)
        # Remember every URL we tried, successful or not, so that a page
        # which failed to load is not re-queued by each page linking to it.
        attempted << current_link
        @logger.info "current_link: #{current_link}"

        page = get(current_link)
        next unless page

        links << current_link

        new_links = page.links_with(href: @domain).map(&:href).uniq
        new_links.reject! { |link| attempted.include?(link) || queue.include?(link) }
        @logger.debug "found: #{new_links.length} new link(s)"
        queue.merge(new_links)

        # `>=` rather than `==` so a limit below 1 still terminates the crawl.
        break if links.length >= max_links

        # Throttle only when another request is actually going to follow.
        sleep(SLEEP_TIME) unless queue.empty?
      end

      links.to_a
    end

  private

    # Memoized agent; `new_agent` is not defined in this file (presumably
    # provided by MechanizeHelper -- confirm).
    def agent
      @agent ||= new_agent
    end

    # Builds the case-insensitive pattern used to select on-site hrefs.
    #
    # The origin is escaped (so "." in the host is literal) and anchored at
    # the start of the href (so an off-site link that merely contains the
    # site's URL, e.g. in a query string, is not followed). A non-default
    # port is kept as part of the origin.
    #
    # @param url [String] starting URL
    # @return [Regexp]
    def same_site_pattern(url)
      uri = URI(url)
      origin = if uri.host
                 base = "#{uri.scheme}://#{uri.host}"
                 uri.port && uri.port != uri.default_port ? "#{base}:#{uri.port}" : base
               else
                 url[%r(\A(https?://([^/]+))), 1]
               end

      # No recognisable origin: keep the historical fallback of accepting
      # any href that contains a slash.
      return %r{/}i unless origin

      Regexp.new("\\A#{Regexp.escape(origin)}/", Regexp::IGNORECASE)
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
email_crawler-0.0.5 lib/email_crawler/page_links.rb
email_crawler-0.0.4 lib/email_crawler/page_links.rb
email_crawler-0.0.3 lib/email_crawler/page_links.rb
email_crawler-0.0.2 lib/email_crawler/page_links.rb