lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.5.0 vs lib/regexp_crawler/crawler.rb in flyerhzm-regexp_crawler-0.6.0

- old
+ new

@@ -29,28 +29,36 @@ @results end private def parse_page(uri) - response = Net::HTTP.start(uri.host, uri.port) do |http| - http.get(uri.request_uri, headers) - end + response = Net::HTTP.get_response_with_headers(uri, @headers) parse_response(response, uri) end + def continue_uri(uri, page) + if page.start_with?(uri.scheme) + URI.parse(page) + elsif page.start_with?('/') + URI.join(uri.scheme + '://' + uri.host, page) + else + URI.parse(uri.to_s.split('/')[0..-2].join('/') + '/' + page) + end + end + def parse_response(response, uri) - response_body = Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first if encoding + response_body = encoding.nil? ? response.body : Iconv.iconv("UTF-8//IGNORE", "#{encoding}//IGNORE", response.body).first if response.is_a? Net::HTTPSuccess if continue_regexp response_body.scan(continue_regexp).each do |page| page = page.first if page.is_a? Array - continue_uri = page.start_with?(uri.scheme) ? URI.parse(page) : URI.join(uri.scheme + '://' + uri.host, page) + continue_uri = continue_uri(uri, page) @pages << continue_uri unless @captured_pages.include?(continue_uri) or @pages.include?(continue_uri) end end md = @capture_regexp.match(response_body) if md - captures = md.captures if md + captures = md.captures result = {} captures.each_index do |i| result[named_captures[i].to_sym] = captures[i] end if @save_method