Sha256: 587ce12968799185ac63a0aebdfaf6988d715e0976b0051090f2c4eb273a0ff0

Contents?: true

Size: 1.9 KB

Versions: 1

Compression:

Stored size: 1.9 KB

Contents

# frozen_string_literal: true

require "cgi"
require "net/http"
require "parallel"
require "uri"

module Tansaku
  # Probes a base URL for a list of candidate paths by sending one HEAD
  # request per path, spread across a pool of threads.
  class Crawler
    DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"

    attr_reader :base_uri
    attr_reader :additional_list, :threads, :user_agent, :type

    # @param base_uri [String] http(s) URL that every candidate path is resolved against
    # @param additional_list [String, nil] path to a file holding extra candidate
    #   paths, one per line; relative paths are resolved against the working directory
    # @param threads [Integer] number of threads handed to Parallel
    # @param user_agent [String] value sent in the User-Agent header
    # @param type [String] forwarded to Path.get_by_type to select the built-in list
    # @raise [ArgumentError] when the URI scheme is not http/https, or when
    #   additional_list is given but does not exist
    def initialize(base_uri, additional_list: nil, threads: 10, user_agent: DEFAULT_USER_AGENT, type: "all")
      @base_uri = URI.parse(base_uri)
      raise ArgumentError, "Invalid URI" unless valid_uri?

      @additional_list = additional_list
      raise ArgumentError, "Invalid path" if !additional_list.nil? && !valid_path?

      @threads = threads
      @user_agent = user_agent

      @type = type
    end

    # Sends a HEAD request and reports whether the response status counts as a hit.
    #
    # @param url [String] URL to probe
    # @return [Boolean] true when the status code is 200, 401 or 302
    def online?(url)
      res = head(url)
      [200, 401, 302].include? res.code.to_i
    end

    # Probes every candidate URL in parallel.
    #
    # @return [Array<String>] the URLs for which {#online?} returned true
    def crawl
      results = Parallel.map(urls, in_threads: threads) do |url|
        url if online?(url)
      rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError => _e
        # A failed probe is treated as "not found" rather than aborting the crawl.
        nil
      end
      results.compact
    end

    private

    def valid_uri?
      ["http", "https"].include? base_uri.scheme
    end

    def valid_path?
      File.exist?(additional_list)
    end

    # Candidate paths: the built-in list for +type+ plus the lines of
    # +additional_list+ when one was given.
    #
    # @return [Array<String>] paths without trailing newlines or blank entries
    def paths
      candidates = Path.get_by_type(type)
      # Read the same path that valid_path? checked, so a relative path that
      # passed validation is the one actually opened.
      candidates += File.readlines(additional_list) if additional_list
      # A blank entry would resolve to the base URL itself, so drop it.
      candidates.map(&:chomp).reject(&:empty?)
    end

    # Resolves +path+ against the base URI.
    #
    # Each "/"-separated segment is escaped on its own so that "/" keeps acting
    # as a path separator. CGI.escape produces form encoding, where a space
    # becomes "+"; that is rewritten to "%20" because "+" is a literal plus in
    # a URL path. A path with a leading "/" resolves from the host root.
    #
    # @param path [String]
    # @return [String] absolute URL
    def url_for(path)
      escaped = path.split("/", -1).map { |segment| CGI.escape(segment).gsub("+", "%20") }.join("/")
      (base_uri + escaped).to_s
    end

    def urls
      paths.map { |path| url_for path }
    end

    # Opens a connection to the base URI's host and sends +req+ over it.
    #
    # @param req [Net::HTTPRequest]
    # @return [Net::HTTPResponse]
    def request(req)
      # Without use_ssl an https base URI would be spoken to in plain HTTP.
      Net::HTTP.start(base_uri.host, base_uri.port, use_ssl: base_uri.scheme == "https") do |http|
        http.request(req)
      end
    end

    # Builds and sends a HEAD request for +url+ carrying the configured User-Agent.
    def head(url)
      head = Net::HTTP::Head.new(url)
      head["User-Agent"] = user_agent
      request(head)
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
tansaku-0.2.0 lib/tansaku/crawler.rb