Sha256: 185d3dc72309fe433f44f2283f9e690d4e30ef9665f09dded0a04fce5aa18ea2

Contents?: true

Size: 1.49 KB

Versions: 4

Compression:

Stored size: 1.49 KB

Contents

require 'rubygems'
require 'hpricot'
require 'htmlentities'

require 'iconv'
require 'open-uri'
require 'uri'

module Murlsh

  module_function

  # Get the title of the HTML page at a url.
  #
  # url     - the address of the page to fetch
  # options - optional Hash, never modified by this method:
  #           :headers      - extra request headers, merged over the result of
  #                           default_headers(url) (defined outside this file)
  #           :content_type - content type of the url; when absent it is
  #                           looked up with get_content_type (defined
  #                           outside this file)
  #           :failproof    - when true (the default) any StandardError raised
  #                           while fetching or parsing is swallowed; when
  #                           false it is re-raised
  #
  # Returns the decoded title converted to utf-8, or url itself when the
  # content type is not HTML, no title is found, the title is empty, or an
  # error was swallowed.
  def get_title(url, options={})
    # Merge into a fresh hash first so the caller's hash is left untouched.
    options = { :failproof => true }.merge(options)
    options[:headers] = default_headers(url).merge(
      options.fetch(:headers, {}))

    result = nil
    begin
      options[:content_type] ||= get_content_type(url, options)
      if might_have_title(options[:content_type])
        # Go through URI#open (provided by open-uri) instead of Kernel#open:
        # Kernel#open would treat a url starting with "|" as a command to run
        # and a plain path as a local file. The block form closes the handle.
        URI.parse(url.to_s).open(options[:headers]) do |f|
          doc = Hpricot(f)
          title = find_title(doc)
          # No title element: leave result nil so url is returned below.
          if title
            result = HTMLEntities.new.decode(Iconv.conv('utf-8',
              get_charset(doc) || f.charset, title))
          end
        end
      end
    rescue StandardError
      # StandardError rather than Exception so that Interrupt, SystemExit and
      # similar are never swallowed.
      raise unless options[:failproof]
    end
    (result && !result.empty?) ? result : url
  end

  # Check whether a content type is one whose documents may contain a title.
  #
  # Returns the matched text when content_type starts with text/html (compared
  # case-insensitively), nil otherwise, including when content_type is nil.
  def might_have_title(content_type)
    content_type && content_type[/\Atext\/html/i]
  end

  # Find the title in an Hpricot document.
  #
  # Tries progressively looser paths and returns the inner html of the first
  # match, or nil when none of the paths match.
  def find_title(doc)
    %w{//html/head/title //head/title //html/title //title}.each do |xpath|
      title = (doc/xpath).first
      return title.inner_html if title
    end
    nil
  end

  # Get the character set of an Hpricot document.
  #
  # Looks for a meta element whose http-equiv attribute is content-type or
  # Content-Type and extracts the charset= value from its content attribute.
  # Returns nil when no such element carries a charset.
  def get_charset(doc)
    %w{content-type Content-Type}.each do |ct|
      meta = doc.at("meta[@http-equiv='#{ct}']")
      next unless meta

      content = meta['content']
      next unless content

      charset = content[/charset=([\w_.:-]+)/, 1]
      return charset if charset
    end
    nil
  end

end

Version data entries

4 entries across 4 versions & 1 rubygem

Version Path
murlsh-0.2.4 lib/murlsh/get_title.rb
murlsh-0.2.3 lib/murlsh/get_title.rb
murlsh-0.2.2 lib/murlsh/get_title.rb
murlsh-0.2.1 lib/murlsh/get_title.rb