--- lib/email_crawler.rb (email_crawler-0.0.8)
+++ lib/email_crawler.rb (email_crawler-0.0.9)
@@ -12,32 +12,20 @@
module EmailCrawler
class Runner
MAX_CONCURRENCY = 50
- attr_writer :max_results, :max_links, :max_concurrency
+ attr_writer :max_results, :max_links, :max_concurrency, :logger
def initialize(google_website)
@google_website = google_website
-
- # @logger = ::Logger.new(STDOUT).tap do |logger|
- log_file = File.join(ENV["HOME"], "email-crawler.log")
- file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
- @logger = ::Logger.new(file).tap do |logger|
- logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
- end
-
yield(self)
-
- @logger.info "max_results: #{@max_results}"
- @logger.info "max_links: #{@max_links}"
- @logger.info "max_concurrency: #{@max_concurrency}"
end
def run(q)
urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
- urls.each { |url| @logger.info "#{url}" }
+ urls.each { |url| logger.info "#{url}" }
queue = Queue.new
urls.each { |url| queue.push(url) }
links_by_url = ThreadSafe::Array.new
threads = (1..[urls.length, @max_concurrency].min).map do |i|
@@ -45,22 +33,22 @@
url = begin
queue.pop(true)
rescue ThreadError; end
while url
- @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
- links = PageLinks.for(url, @max_links)
+ logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+ links = PageLinks.for(url, max_links: @max_links, logger: logger)
links_by_url << [url, links]
url = begin
queue.pop(true)
rescue ThreadError; end
end
end
end
threads.each(&:join)
- @logger.debug "links_by_url: #{links_by_url.inspect}"
+ logger.debug "links_by_url: #{links_by_url.inspect}"
links_by_url.each { |arr| queue.push(arr) }
emails_by_url = ThreadSafe::Hash.new
threads = (1..[links_by_url.length, @max_concurrency].min).map do |i|
Thread.new(i) do |i|
@@ -68,36 +56,48 @@
queue.pop(true)
rescue ThreadError; end
while arr
url, links = arr
- @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
- emails = EmailScanner.new(url).scan(links)
+ logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+ emails = EmailScanner.new(url, logger).scan(links)
emails_by_url[url] = emails
arr = begin
queue.pop(true)
rescue ThreadError; end
end
end
end
threads.each(&:join)
- @logger.debug "emails_by_url: #{emails_by_url.inspect}"
+ logger.debug "emails_by_url: #{emails_by_url.inspect}"
read_emails = Set.new
CSV.generate do |csv|
csv << %w(Email Domain URL)
csv << []
emails_by_url.each do |url, emails_by_link|
email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
- @logger.info "#{url} (#{email_count} emails)"
+ logger.info "#{url} (#{email_count} emails)"
emails_by_link.each do |link, emails|
emails.each do |email|
csv << [email, url, link] if read_emails.add?(email)
end
end
+ end
+ end
+ end
+
+ private
+
+ def logger
+ @logger ||= begin
+ path = File.join(ENV["HOME"], "email_crawler.log")
+ file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
+        ::Logger.new(file).tap do |logger|
+          logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
end
end
end
end
end
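
The upshot of this diff is that Runner no longer opens its log file at construction time: all logging goes through the new memoized private #logger method, and callers can swap in their own logger via the attr_writer added in 0.0.9. A minimal usage sketch follows; the require path, the Google host string, and the query are assumptions for illustration, while the block-style construction, the four writers, and run(q) returning a CSV string come from the code above.

    require "logger"
    require "email_crawler"

    runner = EmailCrawler::Runner.new("https://www.google.com") do |crawler|
      crawler.max_results     = 20
      crawler.max_links       = 10
      crawler.max_concurrency = 10
      # Optional as of 0.0.9: inject a logger through the new attr_writer.
      # When omitted, the private #logger method lazily builds the default
      # file-backed logger writing to $HOME/email_crawler.log.
      crawler.logger = ::Logger.new(STDOUT)
    end

    puts runner.run("ruby conferences")  # run(q) returns a CSV string

Because the default logger is now built lazily inside #logger rather than in initialize, constructing a Runner touches no file at all; the log file is only opened the first time something is actually logged.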