load File.dirname(__FILE__) + '/net/http.rb'
load File.dirname(__FILE__) + '/net/response.rb'

module HttpCrawler
  # Net::HTTP subclass used by the crawler. On top of Net::HTTP it adds:
  #
  # * proxy switching (#proxy, #get_proxy, #update_proxy, #update_proxy?),
  # * GET/POST helpers that follow redirects and retry on 5XX / 407
  #   responses (#get_fetch, #post_fetch),
  # * an overridden #request that retries when an exception is raised.
  #
  # Relies on names defined outside this file: Rails.logger, CharDet,
  # HttpCrawler::Proxy and ActiveSupport's #blank?.
  class HTTP < Net::HTTP
    # When truthy, #update_proxy? switches to a new proxy; when falsy it
    # leaves the current proxy untouched and returns false.
    attr_accessor :auto_proxy

    # Alias of the proxy API, resolved through HttpCrawler::Proxy.for.
    # Only the writer is generated; the reader below supplies the default.
    attr_writer :proxy_api

    # Key passed to the proxy API client when asking for a proxy.
    attr_accessor :proxy_key

    # Maximum number of plain retries #request performs after an exception.
    attr_accessor :max_error_num

    # Proxy entries fetched from the proxy API and not handed out yet.
    # NOTE: this is a class variable, so the list is shared by every
    # instance of this class and of its subclasses.
    @@proxy_list = []

    # @param address [String] host to connect to (forwarded to Net::HTTP)
    # @param port [Integer, nil] port to connect to (forwarded to Net::HTTP)
    def initialize(address, port = nil)
      super(address, port)
      @max_error_num = 2
      @error_num = 0
      @proxy_key = "default"
    end

    # Pause between retries after a raised request error.
    def http_error_sleep
      sleep(0.5)
    end

    # Pause before retrying after a 5XX / 407 response.
    def server_error_sleep
      sleep(3)
    end

    # @return [String] the configured proxy API alias, "my" when unset
    def proxy_api
      @proxy_api ||= "my"
    end

    # Points this connection at another proxy.
    #
    # @param p [Hash] string-keyed settings: "p_addr" and "p_port" are
    #   required, "p_user" and "p_pass" are optional. The hash is only read.
    # @raise [RuntimeError] when "p_addr" or "p_port" is missing
    def proxy(p = {})
      raise '代理设置 p_addr 不能为空' unless p["p_addr"]
      raise '代理设置 p_port 不能为空' unless p["p_port"]

      Rails.logger.info("切换代理至 => #{p}")

      # Must be false, otherwise the explicit proxy settings are not used.
      @proxy_from_env = false

      @proxy_address = p["p_addr"]
      @proxy_port = p["p_port"]
      @proxy_user = p["p_user"]
      @proxy_pass = p["p_pass"]
    end

    # Takes the next proxy from the shared list, refilling the list from the
    # proxy API whenever it is empty. Entries without "p_addr"/"p_port", and
    # entries equal to the proxy currently in use, are discarded; the method
    # then waits 5 seconds and tries again. It only returns once a usable,
    # different proxy is found, so it can block indefinitely.
    #
    # @return [Hash] proxy settings in the shape #proxy expects
    def get_proxy
      loop do
        refill_proxy_list while @@proxy_list.blank?

        candidate = @@proxy_list.shift
        Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{candidate}")
        return candidate if fresh_proxy?(candidate)

        Rails.logger.warn "无最新代理等待5秒后重新获取"
        sleep(5)
      end
    end

    # Switches proxy unconditionally.
    #
    # @param p [Hash] explicit proxy settings; when blank a proxy is taken
    #   from #get_proxy instead
    def update_proxy(p = {})
      proxy(p.blank? ? get_proxy : p)
    end

    # Switches proxy only when #auto_proxy is enabled.
    #
    # @param p [Hash] explicit proxy settings; blank means "fetch one"
    # @return [Boolean] true when the proxy was switched, false otherwise
    def update_proxy?(p = {})
      return false unless auto_proxy

      update_proxy(p)
      true
    end

    # GET that follows redirects and retries on 5XX / 407 responses.
    #
    # @param uri_or_path [String, Object] request target; a String that is
    #   not detected as ASCII is percent-escaped first
    # @param initheader [Hash, nil] forwarded to #get
    # @param dest [Object, nil] forwarded to #get
    # @param limit [Integer] remaining attempts; every redirect AND every
    #   retry consumes one, so a permanently failing server cannot cause
    #   unbounded recursion
    # @return [Net::HTTPSuccess] the first successful response
    # @raise [ArgumentError] when the attempt budget is exhausted
    # @raise whatever response.error! raises for a non-retryable response
    def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
      raise ArgumentError, 'too many HTTP repeated' if limit <= 0

      uri_or_path = escape_non_ascii(uri_or_path)
      response = get(uri_or_path, initheader, dest, &block)

      case response
      when Net::HTTPSuccess
        response
      when Net::HTTPRedirection
        location = response['location']
        Rails.logger.warn "redirected to #{location}"
        get_fetch(location, initheader, dest, limit - 1, &block)
      when Net::HTTPServerError
        Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
        server_error_sleep
        get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
      when Net::HTTPProxyAuthenticationRequired
        Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
        # Without automatic proxy switching a retry cannot help.
        response.error! unless update_proxy?
        server_error_sleep
        get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
      else
        server_error_sleep
        response.error!
      end
    end

    # POST that retries on 5XX / 407 responses. A redirect is followed with
    # a GET via #get_fetch.
    #
    # @param uri_or_path [String, Object] request target; a String that is
    #   not detected as ASCII is percent-escaped first
    # @param data [Object] request body, forwarded to #post on every attempt
    # @param initheader [Hash, nil] forwarded to #post
    # @param dest [Object, nil] forwarded to #post
    # @param limit [Integer] remaining attempts, shared with the redirect
    #   follow-up; the default matches #get_fetch
    # @return [Net::HTTPSuccess] the first successful response
    # @raise [ArgumentError] when the attempt budget is exhausted
    # @raise whatever response.error! raises for a non-retryable response
    def post_fetch(uri_or_path, data, initheader = nil, dest = nil, limit = 10, &block)
      raise ArgumentError, 'too many HTTP repeated' if limit <= 0

      uri_or_path = escape_non_ascii(uri_or_path)
      response = post(uri_or_path, data, initheader, dest, &block)

      case response
      when Net::HTTPSuccess
        response
      when Net::HTTPRedirection
        location = response['location']
        Rails.logger.warn "redirected to #{location}"
        get_fetch(location, initheader, dest, limit - 1, &block)
      when Net::HTTPServerError
        Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
        server_error_sleep
        # The body must be re-sent; dropping it would shift every argument.
        post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
      when Net::HTTPProxyAuthenticationRequired
        Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
        # Without automatic proxy switching a retry cannot help.
        response.error! unless update_proxy?
        server_error_sleep
        post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
      else
        server_error_sleep
        response.error!
      end
    end

    # Overrides the request method to add retries when an exception is
    # raised.
    #
    # While the connection is started, errors are re-raised untouched so
    # that they are handled only once, by the call that was made while the
    # connection was not started. That call retries up to #max_error_num
    # times. Once the budget is used up, EOFError and Timeout::Error trigger
    # a proxy switch (only when #auto_proxy is enabled) followed by a fresh
    # round of retries; every other error is re-raised.
    #
    # @param req [Net::HTTPRequest] request to send
    # @param body [String, nil] optional request body
    # @return [Net::HTTPResponse]
    def request(req, body = nil, &block)
      if started?
        Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}")
        Rails.logger.debug("body => #{body}") if body
      end

      response = super(req, body, &block)
      # A success ends this request's retry budget; the next request on the
      # same instance starts with a full one.
      @error_num = 0
      response
    rescue => error
      raise error if started?

      if @error_num < @max_error_num
        @error_num += 1
        http_error_sleep
        retry
      end

      # Budget exhausted: reset it so that neither the retry after a proxy
      # switch nor a later request starts out with no retries left.
      @error_num = 0

      case error
      when EOFError then Rails.logger.warn "EOFError!"
      when Timeout::Error then Rails.logger.warn "请求超时!"
      else raise error
      end

      # update_proxy? performs the proxy switch itself when it returns true.
      raise error unless update_proxy?

      http_error_sleep
      retry
    end

    private

    # Percent-escapes a String target that CharDet does not report as
    # ASCII; any other value is returned unchanged.
    def escape_non_ascii(uri_or_path)
      return uri_or_path unless uri_or_path.is_a?(String)
      return uri_or_path if CharDet.detect(uri_or_path)["encoding"] == "ascii"

      # URI.encode was removed in Ruby 3.0; this is the call it delegated to.
      URI::DEFAULT_PARSER.escape(uri_or_path)
    end

    # Fetches one entry from the proxy API, appends it to the shared list
    # and pauses for a second before the caller checks the list again.
    def refill_proxy_list
      Rails.logger.debug("@@proxy_list 为空进行更新")
      proxy_client = HttpCrawler::Proxy.for(proxy_api)
      proxy_r = proxy_client.get_proxy(key: proxy_key)
      @@proxy_list << proxy_r.parsing
      Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
      sleep(1)
    end

    # True when the candidate carries an address and a port and differs
    # from the proxy currently in use.
    def fresh_proxy?(candidate)
      return false unless candidate && candidate["p_addr"] && candidate["p_port"]

      !(@proxy_address == candidate["p_addr"] && @proxy_port == candidate["p_port"])
    end
  end
end