lib/email_crawler.rb in email_crawler-0.0.13 vs lib/email_crawler.rb in email_crawler-0.1.0

- old
+ new

@@ -1,6 +1,5 @@
-require "thread"
 require "logger"
 require "csv"
 require "set"
 require "thread_safe"
@@ -12,11 +11,12 @@
 module EmailCrawler
   class Runner
     MAX_CONCURRENCY = 50

-    attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
+    attr_writer :max_results, :max_links, :max_concurrency, :logger,
+                :blacklisted_domains

     def initialize(google_website)
       @google_website = google_website
       yield(self)
     end
@@ -25,11 +25,11 @@
       urls = Scraper.new(@google_website,
                          max_results: @max_results,
                          blacklisted_domains: @blacklisted_domains).
               search_result_urls_for(q)
       urls.each { |url| logger.info "#{url}" }

-      queue = Queue.new
+      queue = Thread::Queue.new
       urls.each { |url| queue.push(url) }
       links_by_url = ThreadSafe::Array.new

       threads = (1..[urls.length, @max_concurrency].min).map do |i|
         Thread.new(i) do |i|
@@ -60,11 +60,11 @@
                   rescue ThreadError; end

           while arr
             url, links = arr
             logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
-            emails = EmailScanner.new(url, logger).scan(links)
+            emails = EmailScanner.new(logger).scan(links)
             emails_by_url[url] = emails

             arr = begin
                     queue.pop(true)
                   rescue ThreadError; end
@@ -90,14 +90,14 @@
           end
         end
       end
     end

-      private
+    private

     def logger
       @logger ||= begin
-        path = File.join(ENV["HOME"], "email_crawler.log")
+        path = File.join(ENV["HOME"], "email-crawler.log")
         file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
         logger = ::Logger.new(file).tap do |logger|
           logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
         end
       end
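
Two of these changes are behavioral rather than cosmetic: the bare Queue constant becomes Thread::Queue (its canonical name on current Rubies, where the queue classes are built in and "require 'thread'" is no longer needed, hence the removed require), and EmailScanner.new drops the url argument, since the scanner now receives everything it needs when #scan is called. Below is a minimal, self-contained sketch of the worker-pool pattern the new version uses; FakeScanner is a hypothetical stand-in for the gem's EmailScanner, included only so the sketch runs on its own.

    require "logger"

    # FakeScanner is a hypothetical stand-in for EmailCrawler::EmailScanner.
    class FakeScanner
      def initialize(logger)
        @logger = logger
      end

      def scan(url)
        @logger.info "scanning #{url}"
        [] # the real scanner would return the emails it found
      end
    end

    urls   = %w[https://example.com/a https://example.com/b https://example.com/c]
    logger = Logger.new($stdout)

    queue = Thread::Queue.new # canonical name; the top-level Queue is an alias
    urls.each { |url| queue.push(url) }

    threads = (1..[urls.length, 2].min).map do |i|
      Thread.new(i) do |i|
        loop do
          url = begin
                  queue.pop(true) # non-blocking pop raises ThreadError when empty
                rescue ThreadError
                  break # queue drained, this worker is done
                end
          logger.info "[Thread ##{i}] popped #{url}"
          FakeScanner.new(logger).scan(url)
        end
      end
    end

    threads.each(&:join)

Popping with pop(true) rather than the default blocking pop is what lets each worker exit cleanly once the queue is drained; a blocking pop would park the threads forever, since nothing pushes to the queue after the initial fill.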