Sha256: 3531e60d3e998d7d395a3882aebcfbae2ddaf270d663a084c7ee4bc184243701

Contents?: true

Size: 1.06 KB

Versions: 11

Compression:

Stored size: 1.06 KB

Contents

require 'hpricot'

module Relevance
  module Tarantula

    # Handler that scans an HTML response for <a>, <link> and <form> tags and
    # queues each of them on the crawler it was constructed with.
    class HtmlDocumentHandler
      extend Forwardable
      # queue_link and queue_form are forwarded to the crawler.
      def_delegators("@crawler", :queue_link, :queue_form)

      # crawler - object that receives queue_link(tag, url) and
      #           queue_form(form, url).
      def initialize(crawler)
        @crawler = crawler
      end

      # Parses +html+ with Hpricot inside a Recording.stderr block and returns
      # the parsed document.
      #
      # Original author's rationale: the HTML parser shouts to stderr when it
      # sees ugly HTML. We don't want this -- the InvalidHtmlHandler will deal
      # with it.
      # NOTE(review): Recording.stderr presumably captures stderr for the
      # duration of the block -- confirm against its definition.
      def html_doc_without_stderr_noise(html)
        body = nil
        Recording.stderr do
          body = Hpricot html
        end
        body
      end

      # Queues every link and form found in an HTML result.
      #
      # result - object responding to #response (which must respond to #html?
      #          and #body) and #url.
      #
      # Non-HTML responses are ignored. Always returns nil.
      def handle(result)
        response = result.response
        url = result.url
        return unless response.html?

        body = html_doc_without_stderr_noise(response.body)

        # All <a> tags are queued first, then all <link> tags.
        %w[a link].each do |selector|
          body.search(selector).each { |tag| queue_link(tag, url) }
        end

        body.search('form').each do |form|
          # A missing OR blank action means "submit to the current document",
          # so fall back to the page URL in both cases. An empty string is
          # truthy in Ruby, hence the explicit blank check.
          form['action'] = url if form['action'].to_s.strip.empty?
          queue_form(form, url)
        end
        nil
      end
    end

  end
end

Version data entries

11 entries across 11 versions & 2 rubygems

Version Path
codez-tarantula-0.5.5 lib/relevance/tarantula/html_document_handler.rb
codez-tarantula-0.5.4 lib/relevance/tarantula/html_document_handler.rb
codez-tarantula-0.5.3 lib/relevance/tarantula/html_document_handler.rb
codez-tarantula-0.5.1 lib/relevance/tarantula/html_document_handler.rb
codez-tarantula-0.5.0 lib/relevance/tarantula/html_document_handler.rb
tarantula-0.5.1 lib/relevance/tarantula/html_document_handler.rb
tarantula-0.5.0 lib/relevance/tarantula/html_document_handler.rb
tarantula-0.4.3 lib/relevance/tarantula/html_document_handler.rb
tarantula-0.4.2 lib/relevance/tarantula/html_document_handler.rb
tarantula-0.4.1 lib/relevance/tarantula/html_document_handler.rb
tarantula-0.4.0 lib/relevance/tarantula/html_document_handler.rb