lib/tansaku/crawler.rb in tansaku-1.3.0 vs lib/tansaku/crawler.rb in tansaku-1.4.0

- old
+ new

@@ -9,11 +9,11 @@
 require "tansaku/monkey_patch"

 module Tansaku
   class Crawler
-    DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
+    DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"

     # @return [String]
     attr_reader :base_uri
     attr_reader :additional_list

@@ -49,11 +49,11 @@
     )
       @base_uri = URI.parse(base_uri.downcase)
       raise ArgumentError, "Invalid URI" unless valid_uri?

       @additional_list = additional_list
-      raise ArgumentError, "Invalid path" if !additional_list.nil? && !valid_path?
+      raise ArgumentError, "Invalid path" unless valid_additional_path?

       @method = method.upcase
       raise ArgumentError, "Invalid HTTP method" unless valid_method?

       @headers = headers

@@ -69,10 +69,12 @@
     end

     def crawl
       results = {}

+      log_conditions
+
       Async do |task|
         barrier = Async::Barrier.new
         semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
         internet = Internet.new

@@ -97,29 +99,44 @@
       results
     end

     private

+    def log_conditions
+      Tansaku.logger.info("Start crawling with the following conditions:")
+      Tansaku.logger.info("URLs: #{paths.length} URLs to crawl")
+      Tansaku.logger.info("Method: #{method}")
+      Tansaku.logger.info("Timeout: #{timeout || "nil"}")
+      Tansaku.logger.info("Headers: #{request_headers}")
+      Tansaku.logger.info("Body: #{body}")
+      Tansaku.logger.info("Ignore certificate errors: #{ignore_certificate_errors}")
+      Tansaku.logger.info("Concurrency: #{max_concurrent_requests} requests at max")
+    end
+
     def online?(status)
       [200, 204, 301, 302, 307, 401, 403].include? status.to_i
     end

     def valid_uri?
       ["http", "https"].include? base_uri.scheme
     end

-    def valid_path?
+    def valid_additional_path?
+      return true if additional_list.nil?
+
       File.exist?(additional_list)
     end

     def valid_method?
       Protocol::HTTP::Methods.valid? method
     end

     def paths
-      paths = Path.get_by_type(type)
-      paths += File.readlines(File.expand_path(additional_list, __dir__)) if additional_list
-      paths.filter_map(&:chomp)
+      @paths ||= [].tap do |out|
+        paths = Path.get_by_type(type)
+        paths += File.readlines(additional_list) if additional_list
+        out << paths.filter_map(&:chomp)
+      end.flatten.uniq
     end

     def url_for(path)
       URI(base_uri + CGI.escape(path)).to_s
     end