Sha256: 3531e60d3e998d7d395a3882aebcfbae2ddaf270d663a084c7ee4bc184243701
Contents?: true
Size: 1.06 KB
Versions: 11
Compression:
Stored size: 1.06 KB
Contents
require 'hpricot'

module Relevance
  module Tarantula
    # Response handler that scans an HTML response for anchors, <link> tags
    # and forms, and hands each one to the crawler so it can be queued.
    class HtmlDocumentHandler
      extend Forwardable

      # queue_link and queue_form are forwarded to the crawler given at
      # construction time.
      def_delegators("@crawler", :queue_link, :queue_form)

      # crawler - object that receives the delegated queue_link(tag, url)
      #           and queue_form(form, url) calls.
      def initialize(crawler)
        @crawler = crawler
      end

      # Parses +html+ with Hpricot inside a Recording.stderr block and
      # returns the parsed document.
      #
      # Per the original author's note, the parser can be noisy on stderr
      # when it meets ugly HTML; that noise is unwanted here because the
      # InvalidHtmlHandler is expected to deal with bad markup.
      # NOTE(review): Recording.stderr is defined elsewhere -- presumably it
      # captures stderr for the duration of the block; confirm.
      def html_doc_without_stderr_noise(html)
        parsed = nil
        Recording.stderr { parsed = Hpricot(html) }
        parsed
      end

      # Queues every link and form found in an HTML response.
      #
      # result - object responding to #response and #url; the response must
      #          respond to #html? and #body.
      #
      # Non-HTML responses are ignored. Always returns nil.
      def handle(result)
        response = result.response
        url = result.url
        return unless response.html?

        doc = html_doc_without_stderr_noise(response.body)

        # Anchors are queued first, then <link> tags.
        %w[a link].each do |selector|
          doc.search(selector).each { |tag| queue_link(tag, url) }
        end

        doc.search('form').each do |form|
          # A form with no action attribute is given the page's own URL.
          form['action'] ||= url
          queue_form(form, url)
        end

        nil
      end
    end
  end
end
Version data entries
11 entries across 11 versions & 2 rubygems