Sha256: 8867acfdf7869755b785538ce41e2ef7986777432d733820cd62069007399720

Contents?: true

Size: 928 Bytes

Versions: 33

Compression:

Stored size: 928 Bytes

Contents

require 'hpricot'

# Response handler that walks an HTML page and hands every anchor, <link>
# element and form it finds to the crawler for queueing.
class Relevance::Tarantula::HtmlDocumentHandler
  extend Forwardable

  # queue_link and queue_form are forwarded straight to the crawler.
  def_delegators("@crawler", :queue_link, :queue_form)

  # crawler - the object that receives the forwarded queue_link / queue_form
  #           calls.
  def initialize(crawler)
    @crawler = crawler
  end

  # Parses +html+ with Hpricot and returns the parsed document.
  #
  # The parse runs inside a Recording.stderr block: the parser can shout to
  # stderr when it sees ugly HTML, and that noise is unwanted here because the
  # InvalidHtmlHandler is the component meant to deal with bad markup.
  def html_doc_without_stderr_noise(html)
    parsed = nil
    Recording.stderr { parsed = Hpricot(html) }
    parsed
  end

  # Queues the links and forms found in the response carried by +result+.
  #
  # result - must respond to #response (which in turn responds to #html? and
  #          #body) and to #url.
  #
  # Non-HTML responses are ignored. Always returns nil.
  def handle(result)
    response = result.response
    page_url = result.url

    if response.html?
      document = html_doc_without_stderr_noise(response.body)

      # All anchors are queued first, then all <link> elements.
      %w[a link].each do |selector|
        document.search(selector).each { |element| queue_link(element, page_url) }
      end

      document.search('form').each do |form|
        # A form with no action attribute is pointed at the page it was found on.
        form['action'] ||= page_url
        queue_form(form, page_url)
      end
    end

    nil
  end
end

Version data entries

33 entries across 33 versions & 5 rubygems

Version Path
relevance-tarantula-0.0.1 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.2 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.3 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.5 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.6 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.7.1 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.7 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.8.0 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.0.8.1 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.0 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.1 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.2 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.3 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.4 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.5 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.6 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.7 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.1.8 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.2.0 lib/relevance/tarantula/html_document_handler.rb
relevance-tarantula-0.2.1 lib/relevance/tarantula/html_document_handler.rb