Sha256: 093a9a5c1cca1c6552810d20a7728533780e6e70bff867bdc663ccba39d1ecc3

Contents?: true

Size: 1.52 KB

Versions: 4

Compression:

Stored size: 1.52 KB

Contents

require 'open-uri'

module Retriever

  # A crawl starting point: normalizes the URL it is given, remembers the
  # URL's host (plus a regexp matching that host without a leading "www."),
  # and can download the page body behind it.
  class Target

    HTTP_RE = Regexp.new(/^http/i).freeze
    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
    # A real http:// or https:// scheme prefix. HTTP_RE alone also matches
    # bare domains that merely begin with "http" (e.g. "httpbin.org").
    SCHEME_RE = %r{\Ahttps?://}i.freeze

    attr_reader :host, :target, :host_re, :file_re

    # url     - String URL or bare domain; "http://" is prepended when no
    #           http(s) scheme is present.
    # file_re - optional Regexp, stored as-is and exposed via #file_re.
    #
    # Raises RuntimeError ("Bad URL") when the URL contains no "." or has no
    # host component. URI() may additionally raise URI::InvalidURIError.
    def initialize(url,file_re=nil)
      url = "http://#{url}" unless SCHEME_RE =~ url
      fail "Bad URL" unless /\./ =~ url
      new_uri = URI(url)
      fail "Bad URL" unless new_uri.host
      @target = new_uri.to_s
      @host = new_uri.host
      # Escape the host so "." only matches a literal dot, and strip "www."
      # only when it is the leading label.
      @host_re = Regexp.new(Regexp.escape(@host.sub(DUB_DUB_DUB_DOT_RE, '')))
      @file_re = file_re
    end

    # Downloads the target and returns its body as a UTF-8 String.
    #
    # Returns false when the request itself fails (the reason is written to
    # stderr). When the server redirects to a URL that still matches
    # #host_re, @target and @host are re-synced to the redirected URL and
    # that URL's body is returned.
    #
    # Raises RuntimeError when the redirect leaves the host, or when the
    # body is empty.
    def source
      resp = fetch_response
      return false unless resp
      begin
        resp_url = resp.base_uri.to_s
        if @target != resp_url
          # if it's not the same host, we want to fail
          fail "Domain redirecting to new host: #{resp_url}" unless @host_re =~ resp_url
          # if the redirect URL is on the same host, re-sync our target with the right URL
          new_t = Retriever::Target.new(resp_url)
          @target = new_t.target
          @host = new_t.host
          return new_t.source
        end
        body = resp.read
      ensure
        # open-uri hands back a StringIO or Tempfile; release it on every path.
        resp.close unless resp.closed?
      end
      fail "Domain is not working. Try the non-WWW version." if body == ""
      # consider using scrub from ruby 2.1? this misses some things
      body.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace)
    end

    private

    # Opens @target through open-uri's URI#open, which works on every Ruby
    # version and never falls through to Kernel#open. Returns the open
    # handle, or false after reporting the failure on stderr.
    def fetch_response
      URI.parse(@target).open
    rescue StandardError => e
      warn "#{@target} could not be retrieved: #{e.message}"
      false
    end

  end

end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
rubyretriever-1.0.3 lib/retriever/target.rb
rubyretriever-1.0.2 lib/retriever/target.rb
rubyretriever-1.0.1 lib/retriever/target.rb
rubyretriever-1.0.0 lib/retriever/target.rb