Sha256: e05ebbe0ff39deb5cae22cbdc2cb493c07ee1cc54e4ef04868bf4e00cadc8480

Contents?: true

Size: 1.42 KB

Versions: 8

Compression:

Stored size: 1.42 KB

Contents

require 'open-uri'
require 'addressable/uri'

module Retriever
  # Represents the URL a retrieval starts from. Normalises the user supplied
  # URL (adding a scheme when missing), exposes its host/scheme, builds a
  # regexp for recognising that host, and can download the page body.
  class Target
    # Matches URLs that already carry an explicit http:// or https:// scheme.
    # Anchored with \A (not ^) and requiring "://" so that hosts which merely
    # begin with "http" (e.g. "httpbin.org") still get a scheme prepended.
    HTTP_RE = %r{\Ahttps?://}i.freeze

    # +source+ is intentionally not listed here: it is defined explicitly
    # below, and declaring both only produced a method-redefined warning.
    attr_reader :host, :target, :host_re, :file_re, :scheme

    # @param url [String] URL or bare host; "http://" is prepended when no
    #   http(s) scheme is present
    # @param file_re [Regexp, nil] stored as-is and exposed via #file_re
    # @raise [RuntimeError] 'Bad URL' when +url+ contains no dot or no host
    #   can be parsed out of it
    def initialize(url, file_re = nil)
      raise 'Bad URL' unless url.include?('.')

      url        = "http://#{url}" unless HTTP_RE =~ url
      target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
      @host      = target_uri.host
      raise 'Bad URL' if @host.nil? || @host.empty?

      @target  = target_uri.to_s
      # Strip only a leading "www." and escape the rest so that the dots in
      # the host name match literally instead of matching any character.
      @host_re = Regexp.new(Regexp.escape(@host.sub(/\Awww\./, '')))
      @file_re = file_re
      @scheme  = target_uri.scheme
    end

    # Downloads the target and returns its body re-encoded as UTF-8.
    #
    # When the server redirects to a different URL on a host matching
    # #host_re, the target is re-synced to that URL and the body of the
    # redirected page is returned instead.
    #
    # @return [String] page body
    # @raise [RuntimeError] when redirected to a host that does not match
    #   #host_re, or when the response body is empty
    def source
      resp     = open_target
      resp_uri = resp.base_uri
      resp_url = resp_uri.to_s
      if @target != resp_url
        # Compare against the redirect's host only; matching the whole URL
        # would accept a foreign host that mentions our domain in its path
        # or query string.
        raise "Domain redirecting: #{resp_url}" unless @host_re =~ resp_uri.host.to_s

        # if redirect URL is same host, we want to re-sync @target
        return resync_target_and_return_source(resp_url)
      end
      body = resp.read
      raise 'Domain not working. Try HTTPS???' unless body
      raise 'Domain is not working. Try the non-WWW version.' if body == ''

      # consider using scrub from ruby 2.1? this misses some things
      body.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
    ensure
      # open-uri hands back a StringIO or Tempfile; release it in every path.
      resp.close if resp && !resp.closed?
    end

    # Points this target at +url+ (updating #target, #host and #scheme) and
    # returns the page body fetched from there. #host_re and #file_re are
    # left untouched.
    #
    # @param url [String] the URL the original target redirected to
    # @return [String] page body of the new target
    def resync_target_and_return_source(url)
      new_t   = Retriever::Target.new(url)
      @target = new_t.target
      @host   = new_t.host
      @scheme = new_t.scheme
      new_t.source
    end

    private

    # Opens @target through open-uri. URI.open is used when the running Ruby
    # provides it, because open-uri's Kernel#open patch was removed in
    # Ruby 3.0; older Rubies fall back to the patched Kernel#open.
    def open_target
      return ::URI.open(@target) if ::URI.respond_to?(:open)

      # @target always starts with an http(s) scheme here (see #initialize),
      # so Kernel#open cannot be tricked into spawning a "|command" pipe.
      open(@target)
    end
  end
end

Version data entries

8 entries across 8 versions & 1 rubygems

Version Path
rubyretriever-1.4.5 lib/retriever/target.rb
rubyretriever-1.4.4 lib/retriever/target.rb
rubyretriever-1.4.3 lib/retriever/target.rb
rubyretriever-1.4.2 lib/retriever/target.rb
rubyretriever-1.4.1 lib/retriever/target.rb
rubyretriever-1.4.0 lib/retriever/target.rb
rubyretriever-1.3.0 lib/retriever/target.rb
rubyretriever-1.2.4 lib/retriever/target.rb