target.rb in rubyretriever-1.2.0

- old
+ new

@@ -1,23 +1,24 @@
 require 'open-uri'
+require 'addressable/uri'
 
 module Retriever
   #
   class Target
-    HTTP_RE = Regexp.new(/^http/i).freeze
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    HTTP_RE    = Regexp.new(/^http/i).freeze
 
-    attr_reader :host, :target, :host_re, :source, :file_re
+    attr_reader :host, :target, :host_re, :source, :file_re, :scheme
 
     def initialize(url, file_re = nil)
-      url = "http://#{url}" unless HTTP_RE =~ url
-      fail 'Bad URL' unless /\./ =~ url
-      new_uri = URI(url)
-      @target = new_uri.to_s
-      @host = new_uri.host
-      @host_re = Regexp.new(@host.sub('www.', ''))
-      @file_re ||= file_re
+      fail 'Bad URL' unless url.include?('.')
+      url         = "http://#{url}" unless HTTP_RE =~ url
+      target_uri  = Addressable::URI.parse(url)
+      @target     = target_uri.to_s
+      @host       = target_uri.host
+      @host_re    = Regexp.new(@host.sub('www.', ''))
+      @file_re  ||= file_re
+      @scheme     = target_uri.scheme
     end
 
     def source
       resp = open(@target)
       resp_url = resp.base_uri.to_s
@@ -29,16 +30,17 @@
       resp = resp.read
       #
       fail 'Domain is not working. Try the non-WWW version.' if resp == ''
       fail 'Domain not working. Try HTTPS???' unless resp
       # consider using scrub from ruby 2.1? this misses some things
-      resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
+      resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
     end
 
     def resync_target_and_return_source(url)
-      new_t = Retriever::Target.new(url)
+      new_t   = Retriever::Target.new(url)
       @target = new_t.target
-      @host = new_t.host
+      @host   = new_t.host
+      @scheme = new_t.scheme
       new_t.source
     end
   end
 end