lib/retriever/target.rb in rubyretriever-1.1.0 vs lib/retriever/target.rb in rubyretriever-1.2.0
- old
+ new
@@ -1,23 +1,24 @@
require 'open-uri'
+require 'addressable/uri'
module Retriever
#
class Target
# HTTP_RE matches text that begins with "http", case-insensitively.
# The old (-) side also defined DUB_DUB_DUB_DOT_RE; the new (+) side drops it.
- HTTP_RE = Regexp.new(/^http/i).freeze
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+ HTTP_RE = Regexp.new(/^http/i).freeze
# Read-only accessors; the new (+) side additionally exposes :scheme.
- attr_reader :host, :target, :host_re, :source, :file_re
+ attr_reader :host, :target, :host_re, :source, :file_re, :scheme
# Builds a target from +url+ and an optional +file_re+.
# This listing is a diff: "-" lines are the old (1.1.0) body and "+" lines
# are the new (1.2.0) body.
# Both sides raise 'Bad URL' when +url+ contains no "." and prepend "http://"
# when HTTP_RE does not match. The new side runs the "." check first, parses
# with Addressable::URI.parse instead of URI(), and also records @scheme.
def initialize(url, file_re = nil)
- url = "http://#{url}" unless HTTP_RE =~ url
- fail 'Bad URL' unless /\./ =~ url
- new_uri = URI(url)
- @target = new_uri.to_s
- @host = new_uri.host
- @host_re = Regexp.new(@host.sub('www.', ''))
- @file_re ||= file_re
+ fail 'Bad URL' unless url.include?('.')
+ url = "http://#{url}" unless HTTP_RE =~ url
+ target_uri = Addressable::URI.parse(url)
+ @target = target_uri.to_s
+ @host = target_uri.host
# NOTE(review): the host string is handed to Regexp.new unescaped, so each "."
# in it matches any character, and sub('www.', '') removes the first "www."
# wherever it occurs in the host -- confirm both are intended.
+ @host_re = Regexp.new(@host.sub('www.', ''))
+ @file_re ||= file_re
+ @scheme = target_uri.scheme
end
def source
resp = open(@target)
resp_url = resp.base_uri.to_s
@@ -29,16 +30,17 @@
resp = resp.read
#
fail 'Domain is not working. Try the non-WWW version.' if resp == ''
fail 'Domain not working. Try HTTPS???' unless resp
# consider using scrub from ruby 2.1? this misses some things
- resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
+ resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
end
# Re-points this instance at +url+ and returns that URL's source.
# A fresh Retriever::Target is built from +url+; its target and host (and, on
# the new "+" side, its scheme) are copied onto this instance. The return
# value is whatever the new target's #source returns.
# Only @target, @host and @scheme are assigned here; @host_re and @file_re
# are not touched by this method.
def resync_target_and_return_source(url)
- new_t = Retriever::Target.new(url)
+ new_t = Retriever::Target.new(url)
@target = new_t.target
- @host = new_t.host
+ @host = new_t.host
+ @scheme = new_t.scheme
new_t.source
end
end
end