Sha256: 8080b3fbd2c4ec4b2001721449b31e23e84c0d7b11a9e8714611cc5af0ac4c7a

Contents?: true

Size: 986 Bytes

Versions: 1

Compression:

Stored size: 986 Bytes

Contents

module EmailCrawler
  # Fetches a list of pages and collects the e-mail addresses found in
  # their bodies.
  #
  # Pages are retrieved through +get+, which this class does not define
  # itself; it is expected to be supplied by the included MechanizeHelper
  # (declared elsewhere in the project).
  class EmailScanner
    # Case-insensitive pattern for e-mail-like tokens. The final domain
    # label accepts 2..63 letters (63 is the DNS label length limit) so
    # that addresses on long TLDs such as ".online" or ".museum" are not
    # skipped.
    EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,63}\b/i
    UTF_8 = "UTF-8".freeze

    include MechanizeHelper

    # @param logger [Logger] receives progress (info) and failure (warn)
    #   messages. Defaults to a logger writing to the platform's null
    #   device (File::NULL), i.e. output is discarded.
    def initialize(logger = Logger.new(File::NULL))
      @logger = logger
    end

    # Scans each link for e-mail addresses.
    #
    # Links whose page cannot be fetched, or on which no address is found,
    # are left out of the result. Fetch errors are logged, never raised.
    #
    # @param links [Enumerable] the links to fetch; each is passed to +get+
    # @return [Hash] maps each link with at least one match to a Set of the
    #   matched address strings
    def scan(links)
      links.each_with_object({}) do |link, found|
        @logger.info "searching for emails on '#{link}'.."

        html = fetch_html(link)
        next unless html

        emails = extract_emails(html)
        found[link] = Set.new(emails) unless emails.empty?
      end
    end

    private

    # Returns the body of the page at +link+, or nil (after logging a
    # warning) when fetching it raises a StandardError.
    def fetch_html(link)
      get(link).body
    rescue => err
      @logger.warn err.inspect
      nil
    end

    # Returns every EMAIL_REGEXP match in +html+.
    #
    # If scanning raises ArgumentError, the error is logged and the scan
    # is repeated exactly once on a sanitized copy in which
    # invalid/undefined byte sequences are dropped. The copy is made with
    # the non-mutating String#encode so the caller's string (the fetched
    # response body) is never modified. If the second scan fails as well,
    # an empty array is returned.
    def extract_emails(html)
      html.scan(EMAIL_REGEXP)
    rescue ArgumentError => err
      @logger.warn err.inspect
      sanitized = html.encode(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")

      begin
        sanitized.scan(EMAIL_REGEXP)
      rescue ArgumentError
        []
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
email_crawler-0.1.1 lib/email_crawler/email_scanner.rb