# frozen_string_literal: true

module ProxyFetcher
  module Providers
    # Base class for all the ProxyFetcher providers.
    #
    # A descendant must implement #provider_url, #xpath and #to_proxy (each of
    # them raises NotImplementedError here). #provider_method, #provider_params
    # and #provider_headers are optional hooks with defaults.
    class Base
      # Loads proxy provider page content, extracts the proxy list from it
      # and converts every entry to a proxy object.
      #
      # Entries that could not be built (#build_proxy returned nil) and
      # entries whose +addr+ is nil are dropped from the result.
      #
      # @param filters [#to_h]
      #   Provider filters (Hash-like object)
      #
      # @return [Array]
      #   Proxy objects, each with a non-nil +addr+
      #
      def fetch_proxies(filters = {})
        raw_proxies = load_proxy_list(filters)
        proxies = raw_proxies.map { |html_node| build_proxy(html_node) }

        proxies.reject { |proxy| proxy.nil? || proxy.addr.nil? }
      end

      # For retro-compatibility.
      #
      # Delegates at call time instead of using +alias+: an alias is bound to
      # this class's implementation, so a descendant overriding #fetch_proxies
      # would otherwise be bypassed by the bang variant.
      #
      # @param filters [#to_h]
      #   Provider filters (Hash-like object)
      #
      # @return [Array]
      #   Same result as #fetch_proxies
      #
      def fetch_proxies!(filters = {})
        fetch_proxies(filters)
      end

      # Abstract method. Must be implemented in a descendant class.
      #
      # @return [String]
      #   URL of the provider page; passed to URI.parse when filters are given
      #
      # @raise [NotImplementedError]
      #   when not overridden
      #
      def provider_url
        raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
      end

      # @return [Symbol]
      #   HTTP method passed to the HTTP client (+:get+ by default)
      #
      def provider_method
        :get
      end

      # @return [Hash]
      #   Default request params; passed to the HTTP client and merged with
      #   the filters when the query string is built
      #
      def provider_params
        {}
      end

      # @return [Hash]
      #   Provider headers required to fetch the proxy list
      #
      def provider_headers
        {}
      end

      # Abstract method. Must be implemented in a descendant class.
      #
      # @return [String]
      #   XPath expression passed to the parsed document to select proxy entries
      #
      # @raise [NotImplementedError]
      #   when not overridden
      #
      def xpath
        raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
      end

      # Just syntactic sugar to make it easier to call #fetch_proxies! method.
      def self.fetch_proxies!(*args)
        new.fetch_proxies!(*args)
      end

      protected

      # Loads raw provider HTML with proxies.
      #
      # When filters are given, the URL query string is rebuilt from
      # #provider_params merged with the filters (filters win on key clashes);
      # a query string already present in +url+ is replaced.
      #
      # @param url [String]
      #   Provider URL
      #
      # @param filters [#to_h]
      #   Provider filters (Hash-like object)
      #
      # @return [String]
      #   HTML body from the response
      #
      # @raise [ArgumentError]
      #   if +filters+ does not respond to #to_h
      #
      def load_html(url, filters = {})
        unless filters.respond_to?(:to_h)
          raise ArgumentError, "filters must be a Hash or respond to #to_h"
        end

        # Normalize once: the contract is only #to_h, so the emptiness check
        # must not rely on the raw object also implementing #any?.
        query_filters = filters.to_h

        unless query_filters.empty?
          # TODO: query for post request?
          uri = URI.parse(url)
          uri.query = URI.encode_www_form(provider_params.merge(query_filters))
          url = uri.to_s
        end

        ProxyFetcher.config.http_client.fetch(
          url,
          method: provider_method,
          headers: provider_headers,
          params: provider_params
        )
      end

      # Loads provider HTML and parses it with internal document object.
      #
      # @param url [String]
      #   URL to fetch
      #
      # @param filters [Hash]
      #   filters for proxy provider
      #
      # @return [ProxyFetcher::Document]
      #   ProxyFetcher document object
      #
      def load_document(url, filters = {})
        html = load_html(url, filters)
        ProxyFetcher::Document.parse(html)
      end

      # Fetches HTML content by sending HTTP request to the provider URL and
      # parses the document (built as abstract ProxyFetcher::Document)
      # to return all the proxy entries (HTML nodes).
      #
      # @param filters [#to_h]
      #   Provider filters (Hash-like object)
      #
      # @return [Array]
      #   Collection of extracted HTML nodes with full proxy info
      #
      def load_proxy_list(filters = {})
        doc = load_document(provider_url, filters)
        doc.xpath(xpath)
      end

      # Builds a proxy via #to_proxy, turning any StandardError into a logged
      # warning so that one malformed entry does not abort the whole fetch.
      #
      # NotImplementedError is not a StandardError, so a missing #to_proxy
      # implementation still propagates to the caller.
      #
      # @return [Proxy, nil]
      #   the built proxy, or nil when building failed
      #
      def build_proxy(*args)
        to_proxy(*args)
      rescue StandardError => e
        ProxyFetcher.logger.warn(
          "Failed to build Proxy object for #{self.class.name} due to error: #{e.message}"
        )

        nil
      end

      # Convert HTML element with proxy info to ProxyFetcher::Proxy instance.
      #
      # Abstract method. Must be implemented in a descendant class
      #
      # @return [Proxy]
      #   new proxy object from the HTML node
      #
      # @raise [NotImplementedError]
      #   when not overridden
      #
      def to_proxy(*)
        raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
      end
    end
  end
end