lib/tansaku/crawler.rb in tansaku-0.2.1 vs lib/tansaku/crawler.rb in tansaku-0.3.0
- old
+ new
@@ -5,28 +5,40 @@
require "parallel"
require "uri"
module Tansaku
class Crawler
# User-Agent string used when the caller does not pass +user_agent+ to the constructor.
- DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
+ DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
attr_reader :base_uri
- attr_reader :additional_list, :threads, :user_agent, :type
- def initialize(base_uri, additional_list: nil, threads: Parallel.processor_count, user_agent: DEFAULT_USER_AGENT, type: "all")
# Read-only accessors for the keyword options stored by the constructor.
+ attr_reader :additional_list
+ attr_reader :host
+ attr_reader :threads
+ attr_reader :type
+ attr_reader :user_agent
+
# Builds a crawler rooted at +base_uri+ and stores its options.
#
# @param base_uri [String] root URL; parsed with URI.parse and then checked by +valid_uri?+
# @param additional_list [String, nil] optional extra list; when given it must satisfy
#   +valid_path?+ (presumably a path to a file on disk — confirm against +valid_path?+)
# @param host [String, nil] optional value sent as the Host header on HEAD requests
# @param threads [Integer] defaults to Parallel.processor_count
#   (presumably the degree of parallelism used while crawling — confirm)
# @param type [String] defaults to "all"; its accepted values are defined outside this hunk
# @param user_agent [String] User-Agent header value; defaults to DEFAULT_USER_AGENT
# @raise [ArgumentError] "Invalid URI" when +valid_uri?+ rejects the parsed URI
# @raise [ArgumentError] "Invalid path" when +additional_list+ is given and +valid_path?+ rejects it
+ def initialize(
+ base_uri,
+ additional_list: nil,
+ host: nil,
+ threads: Parallel.processor_count,
+ type: "all",
+ user_agent: DEFAULT_USER_AGENT
+ )
# Parse first: +valid_uri?+ takes no argument, so it presumably inspects @base_uri.
@base_uri = URI.parse(base_uri)
raise ArgumentError, "Invalid URI" unless valid_uri?
@additional_list = additional_list
# The path check only applies when a list was actually supplied.
unless additional_list.nil?
raise ArgumentError, "Invalid path" unless valid_path?
end
+ @host = host
@threads = threads
- @user_agent = user_agent
-
@type = type
+ @user_agent = user_agent
end
def online?(url)
res = head(url)
[200, 401, 302].include? res.code.to_i
@@ -70,9 +82,11 @@
end
# Issues an HTTP HEAD request for +url+.
#
# The request always carries the configured User-Agent. A Host header is
# set only when a +host+ value is present (it defaults to nil in the constructor).
#
# @param url target of the request, passed unchanged to Net::HTTP::Head.new
# @return whatever +request+ returns; presumably a Net::HTTPResponse, since
#   +online?+ reads +res.code+ from it — confirm against +request+
def head(url)
head = Net::HTTP::Head.new(url)
head["User-Agent"] = user_agent
+ head["Host"] = host unless host.nil?
+
# Delegate the actual network call to +request+ (not shown in this hunk).
request(head)
end
end
end