Sha256: dab68e35cecfbf23726dcc8d5ee815147c6cee33487a12aab09cb4d6531069b7

Contents?: true

Size: 968 Bytes

Versions: 4

Compression:

Stored size: 968 Bytes

Contents

require "open-uri"

module EmailCrawler
  # Downloads a list of links and collects the e-mail addresses found in
  # each page body.
  class EmailScanner
    # Pattern used to pull e-mail addresses out of raw page text
    # (case-insensitive, top-level domain limited to 2-4 letters).
    EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
    # Seconds to pause after each successfully fetched page.
    SLEEP_TIME = 0.5
    # Upper bound on manually followed "redirection forbidden" hops per link,
    # so that a redirect cycle cannot keep the scanner spinning forever.
    MAX_REDIRECTS = 5
    # Only links with one of these schemes are handed to open-uri. Anything
    # else would fall through to Kernel#open, which can read local files and
    # (for strings starting with "|") spawn a subprocess.
    FETCHABLE_LINK = %r{\A(?:https?|ftp)://}i
    # open-uri raises a RuntimeError with this text when it refuses to follow
    # a redirect (e.g. http -> https); the target URL is the last word.
    REDIRECTION_FORBIDDEN = /redirection forbidden/

    # @param url [String] stored on the instance; not read by #scan
    def initialize(url)
      @url = url
      @logger = ::Logger.new(STDOUT).tap do |logger|
        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
      end
    end

    # Fetches every link and extracts the e-mail addresses from its body.
    #
    # Links that cannot be fetched (HTTP error, network error, unsupported
    # scheme, too many redirects) are logged and skipped. When a forbidden
    # redirect is followed, the result is keyed by the redirect target.
    #
    # @param links [Enumerable<String>] URLs to download
    # @return [Hash{String => Set<String>}] addresses found per link; links
    #   without any address are omitted
    def scan(links)
      emails_by_link = {}

      links.each do |link|
        @logger.info "searching for emails on '#{link}'.."

        fetched_link, html = fetch(link)
        next unless html

        emails = html.scan(EMAIL_REGEXP)
        emails_by_link[fetched_link] = Set.new(emails) unless emails.empty?
        sleep(SLEEP_TIME)
      end

      emails_by_link
    end

    private

    # Downloads +link+, following at most MAX_REDIRECTS forbidden redirects.
    #
    # @param link [String] URL to download
    # @return [Array(String, String), nil] the URL that was finally fetched
    #   and its body, or nil when the page could not be retrieved
    def fetch(link)
      (MAX_REDIRECTS + 1).times do
        unless link.to_s.match?(FETCHABLE_LINK)
          @logger.warn("skipping '#{link}': not an http(s)/ftp URL")
          return nil
        end

        begin
          # Block form closes the underlying IO/tempfile once it has been read.
          return [link, URI.open(link, &:read)]
        rescue OpenURI::HTTPError => err
          @logger.warn(err)
          return nil
        rescue => err
          unless err.message.match?(REDIRECTION_FORBIDDEN)
            @logger.warn(err)
            return nil
          end
          # Message looks like "redirection forbidden: <from> -> <to>".
          link = err.message.split(" ").last
        end
      end

      @logger.warn("giving up on '#{link}': more than #{MAX_REDIRECTS} redirects")
      nil
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
email_crawler-0.0.5 lib/email_crawler/email_scanner.rb
email_crawler-0.0.4 lib/email_crawler/email_scanner.rb
email_crawler-0.0.3 lib/email_crawler/email_scanner.rb
email_crawler-0.0.2 lib/email_crawler/email_scanner.rb