lib/tansaku/crawler.rb in tansaku-0.1.2 vs lib/tansaku/crawler.rb in tansaku-0.2.0
- old
+ new
@@ -8,33 +8,37 @@
module Tansaku
class Crawler
# User-Agent string used when the caller does not pass user_agent: to #initialize.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
# Parsed form of the base_uri argument (the result of URI.parse in #initialize).
attr_reader :base_uri
# Read-only accessors for the options stored by #initialize.
# 0.2.0 adds :type, which #paths hands to Path.get_by_type.
- attr_reader :additional_list, :threads, :user_agent
+ attr_reader :additional_list, :threads, :user_agent, :type
# Stores the crawl settings and validates the target URI and the optional
# extra path list before any request is made.
#
# @param base_uri [String] target URI; parsed with URI.parse
# @param additional_list [String, nil] path to an extra list file that #paths
#   reads line by line; nil means no extra list
# @param threads [Integer] passed as in_threads: to Parallel.map in #crawl
# @param user_agent [String] defaults to DEFAULT_USER_AGENT
# @param type [String] (added in 0.2.0) passed to Path.get_by_type in #paths;
#   defaults to "all"
# @raise [ArgumentError] "Invalid URI" when valid_uri? is false,
#   "Invalid path" when additional_list is given and valid_path? is false
- def initialize(base_uri, additional_list: nil, threads: 10, user_agent: DEFAULT_USER_AGENT)
+ def initialize(base_uri, additional_list: nil, threads: 10, user_agent: DEFAULT_USER_AGENT, type: "all")
# NOTE(review): URI.parse raises URI::InvalidURIError on a malformed string,
# so that error can surface here before the valid_uri? check runs.
@base_uri = URI.parse(base_uri)
# valid_uri? is defined outside this view; it runs after @base_uri is set.
raise ArgumentError, "Invalid URI" unless valid_uri?
# Assigned before validation because valid_path? reads it via the attr_reader.
@additional_list = additional_list
unless additional_list.nil?
raise ArgumentError, "Invalid path" unless valid_path?
end
@threads = threads
@user_agent = user_agent
+
+ @type = type
end
# Tells whether +url+ responded with a status code this crawler counts as a hit.
#
# The response comes from `head` (defined elsewhere in this class); its `code`
# is converted with to_i before being compared.
#
# @param url the value handed to `head` unchanged
# @return [Boolean] true for 200, 401 or 302, false for any other code
def online?(url)
  case head(url).code.to_i
  when 200, 401, 302 then true
  else false
  end
end
# Checks every entry of `urls` (defined outside this view) with online? and
# returns the ones that answered with an accepted status.
#
# The work is spread over `threads` threads via Parallel.map.
#
# @return [Array] the URLs for which online? returned true
def crawl
results = Parallel.map(urls, in_threads: threads) do |url|
# Yields the url itself on a hit and nil otherwise (modifier-if result).
url if online?(url)
# Added in 0.2.0: a failure on one URL from the listed timeout / connection /
# HTTP protocol errors becomes nil for that URL instead of propagating out of
# Parallel.map.
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => _e
+ nil
end
# Drop the nil placeholders left by misses and rescued errors.
results.compact
end
private
@@ -46,10 +50,10 @@
# Checks that additional_list points at something that can be read as a list.
#
# File.exist? alone is also true for a directory, which would pass validation
# here and then make File.readlines in `paths` raise Errno::EISDIR. Directories
# are therefore rejected; any other existing entry is still accepted.
#
# @return [Boolean] true when additional_list exists and is not a directory
def valid_path?
  File.exist?(additional_list) && !File.directory?(additional_list)
end
# Builds the list of paths to probe.
#
# Starts from Path.get_by_type(type) (0.1.2 read the bundled
# ./fixtures/paths.txt instead) and, when additional_list is set, appends every
# line of that file.
#
# NOTE(review): additional_list is expanded relative to __dir__ here, while
# valid_path? checks it exactly as given. For a relative path the two can refer
# to different files, so validation may pass and the read fail (or the reverse)
# -- confirm which base directory is intended.
#
# @return [Array] entries after chomp; presumably Strings -- Path is not
#   visible here, TODO confirm what get_by_type returns
def paths
- paths = File.readlines(File.expand_path("./fixtures/paths.txt", __dir__))
+ paths = Path.get_by_type(type)
paths += File.readlines(File.expand_path(additional_list, __dir__)) if additional_list
# File.readlines keeps each line's terminator; chomp strips it. String#chomp
# never returns nil, so compact only matters if an entry's chomp can be nil.
paths.map(&:chomp).compact
end
def url_for(path)