lib/tansaku/crawler.rb in tansaku-1.1.0 vs lib/tansaku/crawler.rb in tansaku-1.3.0
- old
+ new
@@ -1,8 +1,7 @@
# frozen_string_literal: true
-require "async/http/internet"
require "async"
require "async/barrier"
require "async/semaphore"
require "cgi"
require "etc"
@@ -12,62 +11,91 @@
module Tansaku
class Crawler
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
+ # @return [String]
attr_reader :base_uri
attr_reader :additional_list
- attr_reader :headers
- attr_reader :host
+
+ # @return [Integer]
attr_reader :max_concurrent_requests
+
+ # @return [String]
attr_reader :type
- attr_reader :user_agent
+ # @return [String]
+ attr_reader :method
+
+ # @return [String, nil]
+ attr_reader :body
+
+ # @return [Float, nil]
+ attr_reader :timeout
+
+ # @return [Boolean]
+ attr_reader :ignore_certificate_errors
+
def initialize(
base_uri,
additional_list: nil,
headers: {},
- host: nil,
+ method: "HEAD",
+ body: nil,
+ timeout: nil,
max_concurrent_requests: nil,
- type: "all",
- user_agent: DEFAULT_USER_AGENT
+ ignore_certificate_errors: false,
+ type: "all"
)
- @base_uri = URI.parse(base_uri)
+ @base_uri = URI.parse(base_uri.downcase)
raise ArgumentError, "Invalid URI" unless valid_uri?
@additional_list = additional_list
- unless additional_list.nil?
- raise ArgumentError, "Invalid path" unless valid_path?
- end
+ raise ArgumentError, "Invalid path" if !additional_list.nil? && !valid_path?
+ @method = method.upcase
+ raise ArgumentError, "Invalid HTTP method" unless valid_method?
+
@headers = headers
- @host = host
- @max_concurrent_requests = max_concurrent_requests || Etc.nprocessors * 8
+ @body = body
+
+ @timeout = timeout.nil? ? nil : timeout.to_f
+
+ @max_concurrent_requests = max_concurrent_requests || (Etc.nprocessors * 8)
+
+ @ignore_certificate_errors = ignore_certificate_errors
+
@type = type
- @user_agent = user_agent
end
def crawl
results = {}
- Async do
+
+ Async do |task|
barrier = Async::Barrier.new
semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
- internet = Async::HTTP::Internet.new
+ internet = Internet.new
paths.each do |path|
semaphore.async do
url = url_for(path)
- res = internet.head(url, default_request_headers)
- results[url] = res.status if online?(res.status)
+ res = dispatch_http_request(task, internet, url)
+ next unless online?(res.status)
+
+ log = [method, url, res.status].join(",")
+ Tansaku.logger.info(log)
+
+ results[url] = res.status
rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
next
end
end
barrier.wait
end
+
results
end
private
@@ -81,24 +109,60 @@
def valid_path?
File.exist?(additional_list)
end
+ def valid_method?
+ Protocol::HTTP::Methods.valid? method
+ end
+
def paths
paths = Path.get_by_type(type)
paths += File.readlines(File.expand_path(additional_list, __dir__)) if additional_list
- paths.map(&:chomp).compact
+ paths.filter_map(&:chomp)
end
def url_for(path)
URI(base_uri + CGI.escape(path)).to_s
end
def urls
paths.map { |path| url_for path }
end
- def default_request_headers
- @default_request_headers ||= headers.merge({ "host" => host, "user-agent" => user_agent }.compact)
+ def request_headers
+ @request_headers ||= @headers.tap do |headers|
+ upcase_keys = headers.keys.map(&:downcase).map(&:to_s)
+ headers["user-agent"] = DEFAULT_USER_AGENT unless upcase_keys.include?("user-agent")
+ end.compact
+ end
+
+ def ssl_verify_mode
+ ignore_certificate_errors ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER
+ end
+
+ def ssl_context
+ @ssl_context ||= OpenSSL::SSL::SSLContext.new.tap do |context|
+ context.set_params(verify_mode: ssl_verify_mode)
+ end
+ end
+
+ #
+ # Dispatch an HTTP request
+ #
+ # @param [Async::Task] task
+ # @param [Tansaku::Internet] internet
+ # @param [String] url
+ #
+ # @return [Async::HTTP::Protocol::Response]
+ #
+ def dispatch_http_request(task, internet, url)
+ endpoint = Async::HTTP::Endpoint.parse(url, ssl_context: ssl_context)
+
+ return internet.call(method, endpoint, request_headers, body) if timeout.nil?
+
+ task.with_timeout(timeout) do
+ internet.call(method, endpoint, request_headers, body)
+ end
end
end
end