module MetaInspector
  # A MetaInspector::Document knows about its URL and its contents.
  #
  # It wires together a MetaInspector::URL, a MetaInspector::Request (only
  # when no document string is supplied) and a MetaInspector::Parser, and
  # forwards most of its public API to those collaborators.
  class Document
    include MetaInspector::Exceptionable
    extend Forwardable

    attr_reader :html_content_only, :allow_redirections, :warn_level, :headers

    # Initializes a new instance of MetaInspector::Document, setting the URL
    #
    # Options:
    # * connection_timeout: defaults to 20 seconds
    # * read_timeout: defaults to 20 seconds
    # * retries: defaults to 3 times
    # * html_content_only: if an exception should be raised if request
    #   content-type is not text/html. Defaults to false.
    # * allow_redirections: when true, follow HTTP redirects. Defaults to true
    # * document: the html of the url as a string. When given, no
    #   MetaInspector::Request is built.
    # * download_images: forwarded to the parser. Defaults to true
    # * warn_level: what to do when encountering exceptions.
    #   Can be :warn, :raise or nil. Defaults to :raise
    # * exception_log: object that collects exceptions. Defaults to a new
    #   MetaInspector::ExceptionLog built with the given warn_level
    # * headers: object containing custom headers for the request
    # * normalize_url: true by default
    def initialize(initial_url, options = {})
      options             = defaults.merge(options)
      @connection_timeout = options[:connection_timeout]
      @read_timeout       = options[:read_timeout]
      @retries            = options[:retries]
      @html_content_only  = options[:html_content_only]
      @allow_redirections = options[:allow_redirections]
      @document           = options[:document]
      @download_images    = options[:download_images]
      @headers            = options[:headers]
      @warn_level         = options[:warn_level]
      @exception_log      = options[:exception_log] ||
                            MetaInspector::ExceptionLog.new(warn_level: warn_level)
      @normalize_url      = options[:normalize_url]

      @url = MetaInspector::URL.new(initial_url,
                                    exception_log: @exception_log,
                                    normalize: @normalize_url)

      # A request is only needed when the caller did not hand us the HTML.
      unless @document
        @request = MetaInspector::Request.new(@url,
                                              allow_redirections: @allow_redirections,
                                              connection_timeout: @connection_timeout,
                                              read_timeout: @read_timeout,
                                              retries: @retries,
                                              exception_log: @exception_log,
                                              headers: @headers)
      end

      @parser = MetaInspector::Parser.new(self,
                                          exception_log: @exception_log,
                                          download_images: @download_images)
    end

    delegate [:url, :scheme, :host, :root_url,
              :tracked?, :untracked_url, :untrack!] => :@url

    delegate [:content_type, :response] => :@request

    delegate [:parsed, :title, :best_title, :description, :links,
              :images, :feed, :charset, :meta_tags,
              :meta_tag, :meta, :favicon,
              :head_links, :stylesheets, :canonicals] => :@parser

    # Returns all document data as a nested Hash
    def to_hash
      {
        'url'          => url,
        'scheme'       => scheme,
        'host'         => host,
        'root_url'     => root_url,
        'title'        => title,
        'best_title'   => best_title,
        'description'  => description,
        'links'        => links.to_hash,
        'images'       => images.to_a,
        'charset'      => charset,
        'feed'         => feed,
        'content_type' => content_type,
        'meta_tags'    => meta_tags,
        'favicon'      => images.favicon,
        'response'     => { 'status'  => response.status,
                            'headers' => response.headers }
      }
    end

    # Returns the contents of the document as a string
    def to_s
      document
    end

    private

    # Default values merged underneath the options given to #initialize.
    # The timeout keys must match the ones #initialize reads
    # (:connection_timeout and :read_timeout); otherwise the documented
    # 20-second defaults are never applied.
    def defaults
      {
        connection_timeout: 20,
        read_timeout:       20,
        retries:            3,
        html_content_only:  false,
        warn_level:         :raise,
        headers:            {
          'User-Agent'      => default_user_agent,
          'Accept-Encoding' => 'identity'
        },
        allow_redirections: true,
        normalize_url:      true,
        download_images:    true
      }
    end

    # User-Agent header value sent when the caller supplies no headers.
    def default_user_agent
      "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"
    end

    # Returns the document contents, reading them from the request the first
    # time and memoizing the result in @document. When html_content_only is
    # set and the content type is not text/html, an error is raised instead.
    #
    # Any StandardError raised here is appended to the exception log, and the
    # result of that append is what the method returns. Only StandardError is
    # rescued so that Interrupt, SystemExit and other non-standard exceptions
    # propagate untouched.
    def document
      @document ||= if html_content_only && content_type != 'text/html'
                      raise "The url provided contains #{content_type} content instead of text/html content"
                    else
                      @request.read
                    end
    rescue StandardError => e
      @exception_log << e
    end
  end
end