lib/retriever/target.rb in rubyretriever-0.1.4 vs lib/retriever/target.rb in rubyretriever-1.0.0
- old
+ new
@@ -1,41 +1,52 @@
require 'open-uri'
# Namespace module; the only definition visible inside it here is the Target class.
module Retriever
+
# Wraps a single URL: normalizes it at construction time and fetches its body on demand.
# NOTE(review): this text is a version-to-version diff. Lines marked with a leading minus
# belong to the old side, lines marked with a leading plus belong to the new side, and
# unmarked lines are shared by both.
class Target
+
# Matches strings that begin with http, case-insensitively (so https is accepted as well).
HTTP_RE = Regexp.new(/^http/i).freeze
# Matches a leading www. prefix, case-insensitively. Not referenced anywhere in the code shown here.
DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
- attr_reader :host, :target, :host_re, :source
- def initialize(url)
+
# NOTE: the reader generated for :source is superseded by the explicit source method
# defined further down, which performs the fetch instead of reading an instance variable.
+ attr_reader :host, :target, :host_re, :source, :file_re
+
# Builds a target from a URL string.
#   url     - address to fetch; the http:// scheme is prepended when url does not
#             already start with http.
#   file_re - optional (new side only); stored in @file_re and exposed through the
#             reader, not otherwise used in the code shown here.
# Raises RuntimeError (Bad URL) when url contains no dot character.
+ def initialize(url,file_re=nil)
url = "http://#{url}" if (!(HTTP_RE =~ url))
fail "Bad URL" if (!(/\./ =~ url))
new_uri = URI(url)
@target = new_uri.to_s
@host = new_uri.host
# Old side: the regexp is built from the full host. New side: the first literal www. is
# removed from the host first, presumably so www and non-www forms of the same host both
# match in the redirect check (TODO confirm intent). On both sides the dots of the host
# are not escaped, so each one matches any character.
- @host_re = Regexp.new(@host).freeze
+ @host_re = Regexp.new(@host.sub('www.',''))
# @file_re has not been set yet on a fresh object, so this behaves like a plain assignment.
+ @file_re ||= file_re
end
# Fetches @target and returns the response body re-encoded as UTF-8.
# Returns false when opening the URL raises a StandardError.
# Raises RuntimeError when the final URL does not match @host_re, or when the body is empty.
# New side only: when the final URL differs from @target but still matches @host_re,
# @target and @host are updated from the final URL and the body is fetched again
# through a fresh Target.
def source
resp = false
begin
# NOTE(review): relies on open-uri making open accept a URL string; newer Ruby
# versions expect URI.open instead. Confirm against the supported Ruby versions.
resp = open(@target)
rescue StandardError => e
- #puts e.message + " ## " + url
- #the trap abrt is nescessary to handle the SSL error
- #for some ungodly reason it's the only way I found to handle it
# trap only registers a handler for the ABRT signal; the message is printed if that
# signal is delivered later, not at this point. The rescued exception e is not used.
trap("ABRT"){
puts "#{@target} failed SSL Certification Verification"
}
return false
end
# base_uri is the URL the response was finally served from; it differs from @target
# after a redirect.
- if (@target != resp.base_uri.to_s)
- fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+ resp_url = resp.base_uri.to_s
+ if (@target != resp_url)
+ if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
# The fresh Target is built without file_re. Only @target and @host are copied back;
# @host_re keeps the value computed in initialize.
+ new_t = Retriever::Target.new(resp_url)
+ @target = new_t.target
+ @host = new_t.host
+ return new_t.source
+ end
+ fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
end
resp = resp.read
if resp == ""
fail "Domain is not working. Try the non-WWW version."
end
- return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
# NOTE(review): resp holds the result of read at this point and the empty-string case
# was handled above, so this guard looks unreachable. Confirm before relying on it.
+ fail "Domain not working. Try HTTPS???" if !resp
# New side declares the source encoding as binary, so bytes outside the ASCII range are
# treated as undefined and substituted with the replacement character. Presumably this is
# what the trailing remark about missing some things refers to (TODO confirm).
+ return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
end
end
+
end