Sha256: 1b0f68cf82f43d2c70cb68ae243e8e51dcc8de96d03ef4b7001e2fd853e0ee61

Contents?: true

Size: 1.88 KB

Versions: 4

Compression:

Stored size: 1.88 KB

Contents

require 'net/http'
require 'net/https'
require 'uri'

class URI::Generic

  # Return the path and, when a query is present, "?" plus the query, as one
  # new String (for example "/search?q=ruby").
  #
  # A URI without a path component (nil or empty path) contributes an empty
  # path instead of raising, so the result is always a String.
  def path_query
    # to_s tolerates a nil path; dup keeps the result independent of the
    # URI's own path string.
    result = path.to_s.dup
    result << "?#{query}" if query
    result
  end

end

# Helpers for finding out the Content-Type of a URL over HTTP or HTTPS.
module Murlsh

  module_function

  # Get the content type of a URL by requesting it and following redirects.
  #
  # url may be a String or an already-parsed URI.
  #
  # Options:
  # :failproof     - when true (the default) any StandardError raised while
  #                  fetching is swallowed and '' is returned; when false the
  #                  error is re-raised
  # :headers       - extra request headers; these override default_headers
  # :max_redirects - how many redirects to follow before giving up
  #                  (default 3)
  # :redirects     - number of redirects followed so far (used internally
  #                  when recursing; default 0)
  # :debug         - passed to make_net_http
  #
  # Returns the Content-Type header value of the final successful response,
  # or '' if the request failed, was not successful, had no Content-Type
  # header or redirected too many times. The caller's options hash is not
  # modified.
  def get_content_type(url, options={})
    # merge builds a new hash so the caller's options are left untouched.
    options = {
      :failproof => true,
      :max_redirects => 3,
      :redirects => 0,
    }.merge(options)
    options[:headers] = default_headers(url).merge(options[:headers] || {})

    return '' if options[:redirects] > options[:max_redirects]

    begin
      url = parse_uri(url)

      # Finish the request (closing the connection) before following any
      # redirect so connections are not held open across the whole chain.
      resp = make_net_http(url, options).start do |http|
        get_resp(http, url, options[:headers])
      end

      case resp
      when Net::HTTPSuccess
        return resp['content-type'].to_s
      when Net::HTTPRedirection
        location = resp['location']
        if location
          # Location may be relative; resolve it against the current URL.
          return get_content_type(url.merge(location),
            options.merge(:redirects => options[:redirects] + 1))
        end
      end
    rescue StandardError
      # StandardError rather than Exception so that Interrupt and SystemExit
      # still propagate.
      raise unless options[:failproof]
    end

    ''
  end

  # Parse a URI if it's not already parsed.
  def parse_uri(uri)
    uri.is_a?(URI::HTTP) ? uri : URI(uri)
  end

  # Build (but do not start) a Net::HTTP for the host and port of url.
  #
  # SSL is enabled when the url scheme is https. If options[:debug] is set
  # it is used as the debug output stream.
  def make_net_http(url, options={})
    net_http = Net::HTTP.new(url.host, url.port)
    net_http.use_ssl = (url.scheme == 'https')
    net_http.set_debug_output(options[:debug]) if options[:debug]
    net_http
  end

  # Get the response to HTTP HEAD. If HEAD not allowed do GET.
  #
  # http is a started connection, url a parsed URI responding to path_query
  # and headers a hash of request headers. Returns the response object.
  def get_resp(http, url, headers={})
    request_path = url.path_query
    # A URL such as http://example.com has an empty path, which Net::HTTP
    # rejects as a request path; request the root instead.
    request_path = "/#{request_path}" unless request_path.start_with?('/')

    resp = http.request_head(request_path, headers)
    if resp.is_a?(Net::HTTPMethodNotAllowed)
      http.request_get(request_path, headers)
    else
      resp
    end
  end

  # Build the default request headers for url.
  #
  # Always sets a browser-like User-Agent. For the host www.nytimes.com a
  # Referer of http://news.google.com/ is added as well. A url that cannot
  # be parsed just gets the generic headers; this method does not raise for
  # bad urls.
  def default_headers(url)
    result = {
      'User-Agent' =>
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    }
    begin
      host = parse_uri(url).host.to_s
      # Anchor both ends so hosts that merely start with the name (for
      # example www.nytimes.com.evil.example) do not match.
      if host =~ /\Awww\.nytimes\.com\.?\z/i
        result['Referer'] = 'http://news.google.com/'
      end
    rescue URI::InvalidURIError, ArgumentError
      # Unparseable or non-string url: fall back to the generic headers.
    end

    result
  end

end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
murlsh-0.2.4 lib/murlsh/get_content_type.rb
murlsh-0.2.3 lib/murlsh/get_content_type.rb
murlsh-0.2.2 lib/murlsh/get_content_type.rb
murlsh-0.2.1 lib/murlsh/get_content_type.rb