lib/email_crawler.rb in email_crawler-0.0.13 vs lib/email_crawler.rb in email_crawler-0.1.0
- lines from 0.0.13 (old)
+ lines from 0.1.0 (new)
@@ -1,6 +1,5 @@
-require "thread"
require "logger"
require "csv"
require "set"
require "thread_safe"
@@ -12,11 +11,12 @@
module EmailCrawler
class Runner
MAX_CONCURRENCY = 50
- attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
+ attr_writer :max_results, :max_links, :max_concurrency, :logger,
+ :blacklisted_domains
def initialize(google_website)
@google_website = google_website
yield(self)
end
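
The attr_writer hunk only rewraps the argument list onto two lines; the
writers themselves are unchanged. They are meant to be assigned inside
the block that initialize yields. A minimal sketch of that
configuration call (the website and option values are invented for
illustration):

  runner = EmailCrawler::Runner.new("https://www.google.com") do |r|
    r.max_results = 50
    r.max_links = 100
    r.max_concurrency = 10
    r.blacklisted_domains = %w(example.com example.org)
  end
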
@@ -25,11 +25,11 @@
urls = Scraper.new(@google_website,
max_results: @max_results,
blacklisted_domains: @blacklisted_domains).
search_result_urls_for(q)
urls.each { |url| logger.info "#{url}" }
- queue = Queue.new
+ queue = Thread::Queue.new
urls.each { |url| queue.push(url) }
links_by_url = ThreadSafe::Array.new
threads = (1..[urls.length, @max_concurrency].min).map do |i|
Thread.new(i) do |i|
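
Queue is just the short alias of Thread::Queue, so the rename is a
readability cleanup, not a behavior change. The surrounding code is the
standard bounded worker-pool idiom: enqueue all the work up front, then
spawn at most @max_concurrency threads that drain the queue with a
non-blocking pop. A self-contained sketch of that idiom (names and the
pool cap are invented, not the gem's):

  Queue.equal?(Thread::Queue)  # => true, same class under both names

  queue = Thread::Queue.new
  items = %w(a b c d e)
  items.each { |item| queue.push(item) }

  workers = [items.length, 3].min  # cap the pool, like @max_concurrency
  threads = Array.new(workers) do
    Thread.new do
      loop do
        item = begin
          queue.pop(true)  # non-blocking; raises ThreadError when empty
        rescue ThreadError
          break            # queue drained, this worker exits
        end
        puts item          # stand-in for the real per-item work
      end
    end
  end
  threads.each(&:join)
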
@@ -60,11 +60,11 @@
rescue ThreadError; end
while arr
url, links = arr
logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
- emails = EmailScanner.new(url, logger).scan(links)
+ emails = EmailScanner.new(logger).scan(links)
emails_by_url[url] = emails
arr = begin
queue.pop(true)
rescue ThreadError; end
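
EmailScanner's constructor drops the page URL and takes only the
logger, so the scanner no longer carries per-page state and scan works
from the links alone. In principle that lets one instance be reused
across pages; a sketch of what the narrower interface allows, built
only from the calls visible in this diff (whether the gem itself reuses
a single instance is not shown here):

  scanner = EmailScanner.new(logger)
  links_by_url.each do |url, links|
    emails_by_url[url] = scanner.scan(links)
  end
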
@@ -90,14 +90,14 @@
end
end
end
end
- private
+ private
def logger
@logger ||= begin
- path = File.join(ENV["HOME"], "email_crawler.log")
+ path = File.join(ENV["HOME"], "email-crawler.log")
file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
logger = ::Logger.new(file).tap do |logger|
logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
end
end
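
Two things change in this last hunk: private appears to change only in
indentation (the text is otherwise identical, so method visibility is
unaffected), and the default log file moves from ~/email_crawler.log to
~/email-crawler.log, which will surprise anything tailing or rotating
the old path. For reference, a standalone equivalent of the memoized
logger above:

  require "logger"

  path = File.join(ENV["HOME"], "email-crawler.log")
  file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
  logger = Logger.new(file)
  logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR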