lib/retriever/target.rb in rubyretriever-0.1.4 vs lib/retriever/target.rb in rubyretriever-1.0.0

- old
+ new

require 'open-uri'

module Retriever
  # The URL a retrieval run starts from.
  #
  # Normalizes the URL given by the caller, remembers its host, and can fetch
  # the page body as a UTF-8 string.
  class Target
    # Loose "starts with http" test. No longer used inside this class (it also
    # matches scheme-less hosts such as "httpbin.org"); kept unchanged in case
    # code outside this file references it.
    HTTP_RE = Regexp.new(/^http/i).freeze
    # Leading "www." label of a host name.
    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
    # An explicit http:// or https:// scheme at the very start of the string.
    SCHEME_RE = %r{\Ahttps?://}i.freeze

    # `source` is a real method below, so it is not declared as a reader here
    # (declaring it only produced a reader that was immediately redefined).
    attr_reader :host, :target, :host_re, :file_re

    # @param url [String] target URL; "http://" is prepended when it carries
    #   no http/https scheme
    # @param file_re [Regexp, nil] stored as-is and exposed through #file_re
    # @raise [RuntimeError] "Bad URL" when the URL has no dot or no host
    # @raise [URI::InvalidURIError] when the URL cannot be parsed
    def initialize(url, file_re = nil)
      url = url.to_s.strip
      url = "http://#{url}" unless SCHEME_RE =~ url
      fail 'Bad URL' unless url.include?('.')

      new_uri = URI(url)
      fail 'Bad URL' if new_uri.host.nil? || new_uri.host.empty?

      @target = new_uri.to_s
      @host = new_uri.host
      # Escape the host so its dots match literally, and drop only a leading
      # "www." so both the www and non-www forms of the site match.
      @host_re = Regexp.new(Regexp.escape(bare_host(@host)), Regexp::IGNORECASE)
      @file_re = file_re
    end

    # Fetches the target page.
    #
    # When the request was redirected to another URL on the same site,
    # @target and @host are re-synced to the final URL and the body that was
    # already downloaded is returned (no second request). @host_re is left
    # as computed in the constructor.
    #
    # @return [String, false] page body as UTF-8, or false when the request
    #   itself failed
    # @raise [RuntimeError] when the request was redirected to a different
    #   site, or when the body is empty
    def source
      resp = fetch
      return false unless resp

      begin
        final_uri = resp.base_uri
        unless @target == final_uri.to_s
          # A redirect that leaves the site is an error.
          unless same_site?(final_uri.host)
            fail "Domain redirecting to new host: #{final_uri}"
          end
          @target = final_uri.to_s
          @host = final_uri.host
        end
        body = resp.read
      ensure
        resp.close if resp.respond_to?(:close)
      end

      fail 'Domain not working. Try HTTPS???' unless body
      fail 'Domain is not working. Try the non-WWW version.' if body.empty?
      to_utf8(body)
    end

    private

    # Opens @target through open-uri. Returns the response IO, or false on
    # any StandardError.
    def fetch
      # URI#open is used instead of Kernel#open: Kernel#open does not open
      # URLs on Ruby 3.0+, and never treats the target as a local path/pipe.
      URI(@target).open
    rescue StandardError
      # NOTE(review): this handler is carried over from the original code,
      # whose author described it as the only way found to cope with an SSL
      # verification abort. It replaces the process-wide SIGABRT handler;
      # confirm it is still needed before keeping it long-term.
      trap('ABRT') { puts "#{@target} failed SSL Certification Verification" }
      false
    end

    # Host name without a leading "www." label.
    def bare_host(host)
      host.sub(DUB_DUB_DUB_DOT_RE, '')
    end

    # True when other_host is this target's host (ignoring a leading "www.")
    # or a subdomain of it. Compares host names only, so a foreign URL that
    # merely mentions our host in its path or query does not qualify.
    def same_site?(other_host)
      return false if other_host.nil?

      ours = bare_host(@host).downcase
      theirs = other_host.downcase
      theirs == ours || theirs.end_with?(".#{ours}")
    end

    # Returns body as a UTF-8 string. Bytes that already form valid UTF-8 are
    # kept untouched; anything else falls back to the original lossy
    # transcoding, which replaces every non-ASCII byte.
    def to_utf8(body)
      text = body.dup.force_encoding(Encoding::UTF_8)
      return text if text.valid_encoding?

      body.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
    end
  end
end