lib/tansaku/crawler.rb in tansaku-1.3.0 vs lib/tansaku/crawler.rb in tansaku-1.4.0
- old
+ new
@@ -9,11 +9,11 @@
require "tansaku/monkey_patch"
module Tansaku
class Crawler
- DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
+ DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
# @return [String]
attr_reader :base_uri
attr_reader :additional_list
@@ -49,11 +49,11 @@
)
@base_uri = URI.parse(base_uri.downcase)
raise ArgumentError, "Invalid URI" unless valid_uri?
@additional_list = additional_list
- raise ArgumentError, "Invalid path" if !additional_list.nil? && !valid_path?
+ raise ArgumentError, "Invalid path" unless valid_additional_path?
@method = method.upcase
raise ArgumentError, "Invalid HTTP method" unless valid_method?
@headers = headers
@@ -69,10 +69,12 @@
end
def crawl
results = {}
+ log_conditions
+
Async do |task|
barrier = Async::Barrier.new
semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
internet = Internet.new
@@ -97,29 +99,44 @@
results
end
private
+ # Logs the effective crawl configuration once, before requests start:
+ # number of candidate paths, HTTP method, timeout, headers, body,
+ # TLS-verification setting and the concurrency cap.
+ def log_conditions
+ Tansaku.logger.info("Start crawling with the following conditions:")
+ Tansaku.logger.info("URLs: #{paths.length} URLs to crawl")
+ Tansaku.logger.info("Method: #{method}")
+ Tansaku.logger.info("Timeout: #{timeout || "nil"}")
+ Tansaku.logger.info("Headers: #{request_headers}")
+ Tansaku.logger.info("Body: #{body}")
+ Tansaku.logger.info("Ignore certificate errors: #{ignore_certificate_errors}")
+ Tansaku.logger.info("Concurrency: #{max_concurrent_requests} requests at max")
+ end
+
# True if the response status indicates the path exists: success
# (200/204), redirect (301/302/307) or access-controlled (401/403).
# `to_i` tolerates statuses passed as strings.
def online?(status)
[200, 204, 301, 302, 307, 401, 403].include? status.to_i
end
# Only http/https schemes are crawlable; anything else is rejected
# by the constructor with ArgumentError.
def valid_uri?
["http", "https"].include? base_uri.scheme
end
- def valid_path?
+ # Renamed from valid_path? in 1.4.0. The additional wordlist is optional:
+ # nil is valid; otherwise the file must exist on disk.
+ def valid_additional_path?
+ return true if additional_list.nil?
+
File.exist?(additional_list)
end
# Delegates HTTP-method validation to protocol-http (method is already
# upcased in the constructor).
def valid_method?
Protocol::HTTP::Methods.valid? method
end
# Builds the list of candidate paths: the built-in wordlist for `type`
# plus the optional user-supplied file.
def paths
- paths = Path.get_by_type(type)
- paths += File.readlines(File.expand_path(additional_list, __dir__)) if additional_list
- paths.filter_map(&:chomp)
+ # 1.4.0: memoized, deduplicated, and additional_list is now resolved
+ # relative to the current working directory instead of this gem's
+ # directory (the old File.expand_path(..., __dir__) behavior).
+ @paths ||= [].tap do |out|
+ paths = Path.get_by_type(type)
+ paths += File.readlines(additional_list) if additional_list
+ out << paths.filter_map(&:chomp)
+ end.flatten.uniq
end
# Joins a candidate path onto base_uri and returns the absolute URL.
# NOTE(review): CGI.escape percent-encodes "/" and encodes spaces as "+",
# so multi-segment entries in the wordlist would be escaped literally —
# confirm the wordlists only contain single-segment paths.
def url_for(path)
URI(base_uri + CGI.escape(path)).to_s
end