EOF ] else response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved") response["Location"] = "http://localhost/redirect" [response, ""] end end scraper = new_scraper(URI.parse("http://localhost/source")) scraper.scrape assert_equal "http://localhost/source", scraper.page_info.original_url.to_s assert_equal "http://localhost/redirect", scraper.page_info.url.to_s assert_equal time, scraper.page_info.last_modified assert_equal "etag", scraper.page_info.etag assert_equal "other-encoding", scraper.page_info.encoding end def test_scraping_end_to_end Net::HTTP.on_get do |address, path, headers| [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF

EOF ] end scraper = new_scraper(URI.parse("http://localhost/")) do process "div" do |element| @concat = (@concat || "") << (element.attributes["id"] || "") end attr :concat end scraper.scrape assert_equal "12", scraper.concat end # # Tests extractor methods. # def test_extractors html = %Q{

} scraper = new_scraper(html) do process "div", extractor(:div_id=>"@id") attr :div_id end scraper.scrape assert_equal "1", scraper.div_id scraper = new_scraper(html) do process "div", :div_id=>"@id" attr :div_id end scraper.scrape assert_equal "1", scraper.div_id end def test_text_and_element_extractors html = %Q{

some text

} # Extract the node itself. scraper = new_scraper(html) do process "div", extractor(:value=>:element) attr :value end scraper.scrape assert_equal "div", scraper.value.name # Extract the text value of the node. scraper = new_scraper(html) do process "div", extractor(:value=>:text) attr :value end scraper.scrape assert_equal "some text", scraper.value end def test_extractors_objects html = <<-EOF EOF # Extract both elements based on class, return the second one. scraper = new_scraper(html) do process ".header", extractor(:header=>:element) attr :header end scraper.scrape assert_equal "h2", scraper.header.name # Extracting a specific element skips the second match. html = <<-EOF EOF scraper = new_scraper(html) do process ".header", extractor(:header=>"h1") attr :header end scraper.scrape assert_equal "h1", scraper.header.name end def test_attribute_extractors # Extracting the attribute skips the second match. html = <<-EOF bar

foo EOF scraper = new_scraper(html) do process "abbr", extractor(:title=>"@title") attr :title end scraper.scrape assert_equal "foo", scraper.title # Extracting a specific element skips the second match. html = <<-EOF EOF scraper = new_scraper(html) do process ".header", extractor(:header=>"h1@id") attr :header end scraper.scrape assert_equal "1", scraper.header end def test_class_extractors headers = Class.new(Scraper::Base) headers.instance_eval do root_element nil process "h1,h2", :h1=>"h1", :h2=>"h2" attr :h1 attr :h2 end html = <<-EOF

first

second

EOF scraper = new_scraper(html) do process "div", extractor(:headers=>headers) attr :headers end scraper.scrape assert scraper.headers assert_equal "h1", scraper.headers.h1.name assert_equal "h2", scraper.headers.h2.name end def test_array_extractors html = <<-EOF

first

second

EOF scraper = new_scraper(html) do process "h1", extractor("headers[]"=>:text) attr :headers end scraper.scrape assert scraper.headers.is_a?(Array) assert_equal 2, scraper.headers.size assert_equal "first", scraper.headers[0] assert_equal "second", scraper.headers[1] end def test_hash_extractors html = <<-EOF

EOF scraper = new_scraper(html) do process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text}) attr :header end scraper.scrape assert scraper.header.is_a?(Hash) assert_equal 3, scraper.header.size assert_equal "1", scraper.header[:id] assert_equal "header", scraper.header[:class] assert_equal "first", scraper.header[:text] end def test_multi_value_extractors html = <<-EOF

EOF scraper = new_scraper(html) do process "h1", [:text, :kls]=>Scraper.define { process "*", :text=>:text, :kls=>"@class" } end result = scraper.scrape assert "first", result.text assert "header", result.kls end def test_conditional_extractors # Look for id attribute (second header only), # if not found look for class attribute (first # two headers), otherwise just get text (third # header). html = <<-EOF

first

second

third

EOF scraper = new_scraper(html) do process "h1", extractor("headers[]"=>["@id", "@class", :text]) attr :headers end scraper.scrape assert scraper.headers.is_a?(Array) assert_equal 3, scraper.headers.size assert_equal "foo", scraper.headers[0] assert_equal "bar", scraper.headers[1] assert_equal "third", scraper.headers[2] end DIVS_ST_ND = <<-EOF

first

second

EOF def test_accessors_from_extractor scraper = new_scraper(DIVS_ST_ND) do process_first "div", :div_id=>"@id", :div_text=>:text result :div_id end value = scraper.scrape assert_equal "1", value scraper = new_scraper(DIVS_ST_ND) do process_first "div", :div_id=>"@id", :div_text=>:text result :div_id, :div_text end value = scraper.scrape assert_equal "1", value.div_id assert_equal "first", value.div_text scraper = new_scraper(DIVS_ST_ND) do process_first "div", :div_id=>"@id", :div_text=>:text end value = scraper.scrape assert_equal "1", value.div_id assert_equal "first", value.div_text scraper = new_scraper(DIVS_ST_ND) do attr_accessor :div_class process_first "div", :div_id=>"@id", :div_text=>:text result :div_id, :div_class end value = scraper.scrape assert_equal "1", value.div_id assert_raise(NoMethodError) { value.div_text } scraper = new_scraper(DIVS_ST_ND) do process "div", "div_ids[]"=>"@id" result :div_ids end value = scraper.scrape assert_equal "1", value[0] assert_equal "2", value[1] end def test_array_accessors scraper = new_scraper(DIVS_ST_ND) do array :div_id, :div_text process "div", :div_id=>"@id", :div_text=>:text result :div_id, :div_text end value = scraper.scrape assert_equal 2, value.div_id.size assert_equal 2, value.div_text.size assert_equal "1", value.div_id[0] assert_equal "2", value.div_id[1] assert_equal "first", value.div_text[0] assert_equal "second", value.div_text[1] end # # Root element tests. # HTML_EMPTY = <<-EOF EOF def test_scrape_body_by_default scraper = Class.new(Scraper::Base).new(HTML_EMPTY) scraper.class.instance_eval do process "head" do |element| @head = element end attr :head process "body" do |element| @body = element end attr :body end scraper.scrape assert scraper.head assert scraper.body end def test_changing_root_element only_header = new_scraper(HTML_EMPTY) do root_element "head" process "head" do |element| @head = element end attr :head process "body" do |element| @body = element end attr :body end only_body = Class.new(only_header.class).new(HTML_EMPTY) only_body.class.root_element "body" both_parts = Class.new(only_body.class).new(HTML_EMPTY) both_parts.class.root_element nil # We set this scraper to begin with the head element, # so we can see the head element, but not the body. only_header.scrape assert only_header.head assert only_header.body.nil? # Now switch to a scraper that processes the body element, # skipping the header. only_body.scrape assert only_body.head.nil? assert only_body.body # Now switch to a scraper that doesn't specify a root element, # and it will process both header and body. both_parts.scrape assert both_parts.head assert both_parts.body end # Test prepare/result. def test_prepare_and_result # Extracting the attribute skips the second match. scraper = new_scraper(DIVS123) do process("div") { |element| @count +=1 } define_method(:prepare) { @count = 1 } define_method(:result) { @count } end result = scraper.scrape assert_equal 4, result end def test_changing_document_from_prepare # Extracting the attribute skips the second match. scraper = new_scraper(DIVS123) do selector :divs, "div" define_method :prepare do |document| @document = divs(document)[1] end array :ids process "div", :ids=>"@id" result :ids end result = scraper.scrape assert_equal 1, result.size assert_equal "2", result[0] end def test_anonymous_scrapers scraper = Scraper.define do array :ids process "div", :ids=>"@id" result :ids end result = scraper.scrape(DIVS123) assert_equal "1", result[0] assert_equal "2", result[1] assert_equal "3", result[2] end def test_named_rules scraper = Scraper.define do array :ids1, :ids2 process :main, "div", :ids1=>"@id" process :main, "div", :ids2=>"@id" result :ids1, :ids2 end result = scraper.scrape(DIVS123) assert_equal nil, result.ids1 assert_equal 3, result.ids2.size assert_equal "1", result.ids2[0] assert_equal "2", result.ids2[1] assert_equal "3", result.ids2[2] end protected def new_scraper(what, &block) cls = Class.new(Scraper::Base) cls.root_element nil cls.parser :html_parser cls.class_eval &block if block cls.new(what) end end # Repeats the same set of tests, but using Tidy instead of HTMLParser. class ScraperUsingTidyTest < ScraperTest protected def new_scraper(what, &block) cls = Class.new(Scraper::Base) cls.root_element nil cls.parser :tidy cls.class_eval &block if block cls.new(what) end end