lib/tansaku/crawler.rb in tansaku-1.1.0 vs lib/tansaku/crawler.rb in tansaku-1.3.0

- old
+ new

@@ -1,8 +1,7 @@ # frozen_string_literal: true -require "async/http/internet" require "async" require "async/barrier" require "async/semaphore" require "cgi" require "etc" @@ -12,62 +11,91 @@ module Tansaku class Crawler DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36" + # @return [String] attr_reader :base_uri attr_reader :additional_list - attr_reader :headers - attr_reader :host + + # @return [Integer] attr_reader :max_concurrent_requests + + # @return [String] attr_reader :type - attr_reader :user_agent + # @return [String] + attr_reader :method + + # @return [String, nil] + attr_reader :body + + # @return [Float, nil] + attr_reader :timeout + + # @return [Boolean] + attr_reader :ignore_certificate_errors + def initialize( base_uri, additional_list: nil, headers: {}, - host: nil, + method: "HEAD", + body: nil, + timeout: nil, max_concurrent_requests: nil, - type: "all", - user_agent: DEFAULT_USER_AGENT + ignore_certificate_errors: false, + type: "all" ) - @base_uri = URI.parse(base_uri) + @base_uri = URI.parse(base_uri.downcase) raise ArgumentError, "Invalid URI" unless valid_uri? @additional_list = additional_list - unless additional_list.nil? - raise ArgumentError, "Invalid path" unless valid_path? - end + raise ArgumentError, "Invalid path" if !additional_list.nil? && !valid_path? + @method = method.upcase + raise ArgumentError, "Invalid HTTP method" unless valid_method? + @headers = headers - @host = host - @max_concurrent_requests = max_concurrent_requests || Etc.nprocessors * 8 + @body = body + + @timeout = timeout.nil? ? nil : timeout.to_f + + @max_concurrent_requests = max_concurrent_requests || (Etc.nprocessors * 8) + + @ignore_certificate_errors = ignore_certificate_errors + @type = type - @user_agent = user_agent end def crawl results = {} - Async do + + Async do |task| barrier = Async::Barrier.new semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier) - internet = Async::HTTP::Internet.new + internet = Internet.new paths.each do |path| semaphore.async do url = url_for(path) - res = internet.head(url, default_request_headers) - results[url] = res.status if online?(res.status) + res = dispatch_http_request(task, internet, url) + next unless online?(res.status) + + log = [method, url, res.status].join(",") + Tansaku.logger.info(log) + + results[url] = res.status rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError next end end barrier.wait end + results end private @@ -81,24 +109,60 @@ def valid_path? File.exist?(additional_list) end + def valid_method? + Protocol::HTTP::Methods.valid? method + end + def paths paths = Path.get_by_type(type) paths += File.readlines(File.expand_path(additional_list, __dir__)) if additional_list - paths.map(&:chomp).compact + paths.filter_map(&:chomp) end def url_for(path) URI(base_uri + CGI.escape(path)).to_s end def urls paths.map { |path| url_for path } end - def default_request_headers - @default_request_headers ||= headers.merge({ "host" => host, "user-agent" => user_agent }.compact) + def request_headers + @request_headers ||= @headers.tap do |headers| + upcase_keys = headers.keys.map(&:downcase).map(&:to_s) + headers["user-agent"] = DEFAULT_USER_AGENT unless upcase_keys.include?("user-agent") + end.compact + end + + def ssl_verify_mode + ignore_certificate_errors ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER + end + + def ssl_context + @ssl_context ||= OpenSSL::SSL::SSLContext.new.tap do |context| + context.set_params(verify_mode: ssl_verify_mode) + end + end + + # + # Dispatch an HTTP request + # + # @param [Async::Task] task + # @param [Tansaku::Internet] internet + # @param [String] url + # + # @return [Async::HTTP::Protocol::Response] + # + def dispatch_http_request(task, internet, url) + endpoint = Async::HTTP::Endpoint.parse(url, ssl_context: ssl_context) + + return internet.call(method, endpoint, request_headers, body) if timeout.nil? + + task.with_timeout(timeout) do + internet.call(method, endpoint, request_headers, body) + end end end end