Sha256: 8080b3fbd2c4ec4b2001721449b31e23e84c0d7b11a9e8714611cc5af0ac4c7a
Contents?: true
Size: 986 Bytes
Versions: 1
Compression:
Stored size: 986 Bytes
Contents
module EmailCrawler
  # Scans the bodies of fetched pages for e-mail addresses.
  #
  # Pages are retrieved through +get+, which is not defined in this class
  # (presumably supplied by MechanizeHelper -- confirm against that module).
  class EmailScanner
    # Case-insensitive pattern used to pick e-mail addresses out of page text.
    # The final label accepts 2 to 63 letters (63 is the longest label DNS
    # permits) so that addresses under longer TLDs such as ".email" or
    # ".museum" are matched as well as the classic 2-4 letter ones.
    EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,63}\b/i
    UTF_8 = "UTF-8".freeze

    include MechanizeHelper

    # @param logger [Logger] receives progress and warning messages. Defaults
    #   to a logger writing to the platform's null device (File::NULL), i.e.
    #   all output is discarded.
    def initialize(logger = Logger.new(File::NULL))
      @logger = logger
    end

    # Looks for e-mail addresses on each of the given links.
    #
    # @param links [Enumerable] links to fetch; each one is passed to +get+
    # @return [Hash] maps every link on which at least one address was found
    #   to a Set of the matched addresses. Links that could not be fetched, or
    #   on which nothing matched, are left out of the result.
    def scan(links)
      links.each_with_object({}) do |link, h|
        @logger.info "searching for emails on '#{link}'.."
        retried = false

        html =
          begin
            get(link).body
          rescue => err
            # Best effort: a link that cannot be fetched is logged and skipped.
            @logger.warn err.inspect
            nil
          end
        next unless html

        begin
          emails = html.scan(EMAIL_REGEXP)
        rescue ArgumentError => err
          if retried
            # Still failing after the clean-up pass: give up on this page.
            emails = []
          else
            @logger.warn err.inspect
            # Drop byte sequences that are invalid/undefined in UTF-8, then
            # scan again. The flag guarantees at most one retry per link.
            html.encode!(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
            retried = true
            retry
          end
        end

        h[link] = Set.new(emails) unless emails.empty?
      end
    end
  end
end
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
email_crawler-0.1.1 | lib/email_crawler/email_scanner.rb |