# ScrAPI toolkit for Ruby
#
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
# Developed for http://co.mments.com
# Code and documention: http://labnotes.org
require "rubygems"
require "time"
require "test/unit"
require File.join(File.dirname(__FILE__), "mock_net_http")
require File.join(File.dirname(__FILE__), "../lib", "scrapi")
class ScraperTest < Test::Unit::TestCase
DIVS123 = <<-EOF
EOF
DIVS1_23 = <<-EOF
EOF
def setup
Net::HTTP.reset_on_get
end
def teardown
Net::HTTP.reset_on_get
end
#
# Tests selector methods.
#
def test_define_selectors
scraper = new_scraper(DIVS123) do
selector :test, "div"
end
assert_equal 3, scraper.test(scraper.document).size
3.times do |i|
assert_equal String(i + 1), scraper.test(scraper.document)[i].attributes["id"]
end
end
def test_selector_blocks
scraper = new_scraper(DIVS123) do
selector :test, "div" do |elements|
return elements[0..-2]
elements[0..-2]
end
end
assert_equal 2, scraper.test(scraper.document).size
end
def test_array_selectors
scraper = new_scraper(DIVS123) do
selector :test, "#?", "2"
end
assert_equal 1, scraper.test(scraper.document).size
assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
end
def test_object_selectors
scraper = new_scraper(DIVS123) do
selector :test, HTML::Selector.new("div")
end
assert_equal 3, scraper.test(scraper.document).size
end
def test_selector_returns_array
scraper = new_scraper(DIVS123) do
selector :test0, "#4"
selector :test1, "#1"
selector :test3, "div"
end
assert_equal 0, scraper.test0(scraper.document).size # No elements (empty)
assert_equal 1, scraper.test1(scraper.document).size # One element (array)
assert_equal 3, scraper.test3(scraper.document).size # Array of elements
end
def test_select_in_document_order
scraper = new_scraper(DIVS123) do
selector :test, "#2,#1"
end
assert_equal 2, scraper.test(scraper.document).size
assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
end
def test_selecting_first_element
scraper = new_scraper(DIVS123) do
selector :test, "div"
end
assert_equal 3, scraper.test(scraper.document).size
assert scraper.first_test(scraper.document)
assert_equal "1", scraper.first_test(scraper.document).attributes["id"]
scraper = new_scraper(DIVS123) do
selector :test, "div" do |element|
element[0].attributes["id"]
end
end
assert scraper.first_test(scraper.document)
assert_equal "1", scraper.first_test(scraper.document)
end
#
# Tests process methods.
#
def test_processing_rule
scraper = new_scraper(DIVS123) do
process "div" do |element|
@count = (@count || 0) + 1
end
attr :count
end
scraper.scrape
assert_equal 3, scraper.count
end
def test_processing_rule_with_array
scraper = new_scraper(DIVS123) do
process "#?", "1" do |element|
@count = (@count || 0) + 1
end
attr :count
end
scraper.scrape
assert_equal 1, scraper.count
end
def test_processing_rule_with_selector
scraper = new_scraper(DIVS123) do
process HTML::Selector.new("div") do |element|
@count = (@count || 0) + 1
end
attr :count
end
scraper.scrape
assert_equal 3, scraper.count
end
def test_extracting_in_code
scraper = new_scraper(DIVS123) do
process "div" do |element|
@concat = (@concat || "") << element.attributes["id"]
end
attr :concat
end
scraper.scrape
assert_equal "123", scraper.concat
end
def test_processing_in_document_order
scraper = new_scraper(DIVS123) do
process "#2,#1" do |element|
@concat = (@concat || "") << element.attributes["id"]
end
attr :concat
end
scraper.scrape
assert_equal "12", scraper.concat
end
def test_process_once_if_skipped
scraper = new_scraper(DIVS123) do
def prepare(document)
@found = []
end
process("#1") { |element| @found[0] = true }
process("#1") { |element| @found[1] = true ; skip element }
process("#1") { |element| @found[2] = true }
process("#2", :skip=>true){ |element| @found[3] = true }
process("#2") { |element| @found[4] = true }
attr_reader :found
end
scraper.scrape
assert_equal [true, true, nil, true], scraper.found
end
def test_skip_children
scraper = new_scraper(DIVS1_23) do
process "div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
skip id2(element)
end
selector :id2, "#2"
attr :concat
end
scraper.scrape
assert_equal "13", scraper.concat
end
def test_skip_descendants
# Root, child of root, grandchild of root.
scraper = new_scraper(DIVS1_23) do
process "div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
end
attr :concat
end
scraper.scrape
assert_equal "123", scraper.concat
# Stop at root.
scraper = new_scraper(DIVS1_23) do
process "div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
skip
end
attr :concat
end
scraper.scrape
assert_equal "1", scraper.concat
scraper.scrape
# Child of root, and child of root's child
scraper = new_scraper(DIVS1_23) do
process "div>div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
end
attr :concat
end
scraper.scrape
assert_equal "23", scraper.concat
# Stop at child of root.
scraper = new_scraper(DIVS1_23) do
process "div>div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
skip element.next_element
end
attr :concat
end
scraper.scrape
assert_equal "2", scraper.concat
# Child of root, the child of child of root.
scraper = new_scraper(DIVS1_23) do
process "div div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
end
attr :concat
end
scraper.scrape
assert_equal "23", scraper.concat
# Child of root.
scraper = new_scraper(DIVS1_23) do
process "div div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
skip element.next_element
end
attr :concat
end
scraper.scrape
assert_equal "2", scraper.concat
end
def test_skip_from_extractor
html = %Q{this
"}
scraper = new_scraper(html) do
process "#1", :this1=>:text
process "#1", :this2=>:text
end
scraper.scrape
assert_equal "this", scraper.this1
assert_equal "this", scraper.this2
scraper = new_scraper(html) do
process "#1", :this1=>:text, :skip=>false
process "#1", :this2=>:text
end
scraper.scrape
assert_equal "this", scraper.this1
assert_equal "this", scraper.this2
scraper = new_scraper(html) do
process "#1", :this1=>:text, :skip=>true do
false
end
process "#1", :this2=>:text
end
scraper.scrape
assert_equal "this", scraper.this1
assert_equal nil, scraper.this2
end
def test_stop
scraper = new_scraper(DIVS123) do
process "div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
stop
end
attr :concat
end
scraper.scrape
assert_equal "1", scraper.concat
end
def test_process_first
scraper = new_scraper(DIVS123) do
process "div" do |element|
@all = (@all || 0) + 1
end
process_first "div" do |element|
@first = (@first || 0) + 1
end
attr_accessor :all, :first
end
scraper.scrape
assert_equal 3, scraper.all
assert_equal 1, scraper.first
end
def test_accessors
time = Time.new.rfc2822
Net::HTTP.on_get do |address, path, headers|
if path == "/redirect"
response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
response["Last-Modified"] = time
response["ETag"] = "etag"
[response, <<-EOF
EOF
]
else
response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
response["Location"] = "http://localhost/redirect"
[response, ""]
end
end
scraper = new_scraper(URI.parse("http://localhost/source"))
scraper.scrape
assert_equal "http://localhost/source", scraper.page_info.original_url.to_s
assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
assert_equal time, scraper.page_info.last_modified
assert_equal "etag", scraper.page_info.etag
assert_equal "other-encoding", scraper.page_info.encoding
end
def test_scraping_end_to_end
Net::HTTP.on_get do |address, path, headers|
[Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
EOF
]
end
scraper = new_scraper(URI.parse("http://localhost/")) do
process "div" do |element|
@concat = (@concat || "") << (element.attributes["id"] || "")
end
attr :concat
end
scraper.scrape
assert_equal "12", scraper.concat
end
#
# Tests extractor methods.
#
def test_extractors
html = %Q{}
scraper = new_scraper(html) do
process "div", extractor(:div_id=>"@id")
attr :div_id
end
scraper.scrape
assert_equal "1", scraper.div_id
scraper = new_scraper(html) do
process "div", :div_id=>"@id"
attr :div_id
end
scraper.scrape
assert_equal "1", scraper.div_id
end
def test_text_and_element_extractors
html = %Q{some text
}
# Extract the node itself.
scraper = new_scraper(html) do
process "div", extractor(:value=>:element)
attr :value
end
scraper.scrape
assert_equal "div", scraper.value.name
# Extract the text value of the node.
scraper = new_scraper(html) do
process "div", extractor(:value=>:text)
attr :value
end
scraper.scrape
assert_equal "some text", scraper.value
end
def test_extractors_objects
html = <<-EOF
EOF
# Extract both elements based on class, return the second one.
scraper = new_scraper(html) do
process ".header", extractor(:header=>:element)
attr :header
end
scraper.scrape
assert_equal "h2", scraper.header.name
# Extracting a specific element skips the second match.
html = <<-EOF
EOF
scraper = new_scraper(html) do
process ".header", extractor(:header=>"h1")
attr :header
end
scraper.scrape
assert_equal "h1", scraper.header.name
end
def test_attribute_extractors
# Extracting the attribute skips the second match.
html = <<-EOF
bar
foo
EOF
scraper = new_scraper(html) do
process "abbr", extractor(:title=>"@title")
attr :title
end
scraper.scrape
assert_equal "foo", scraper.title
# Extracting a specific element skips the second match.
html = <<-EOF
EOF
scraper = new_scraper(html) do
process ".header", extractor(:header=>"h1@id")
attr :header
end
scraper.scrape
assert_equal "1", scraper.header
end
def test_class_extractors
headers = Class.new(Scraper::Base)
headers.instance_eval do
root_element nil
process "h1,h2", :h1=>"h1", :h2=>"h2"
attr :h1
attr :h2
end
html = <<-EOF
first
second
EOF
scraper = new_scraper(html) do
process "div", extractor(:headers=>headers)
attr :headers
end
scraper.scrape
assert scraper.headers
assert_equal "h1", scraper.headers.h1.name
assert_equal "h2", scraper.headers.h2.name
end
def test_array_extractors
html = <<-EOF
first
second
EOF
scraper = new_scraper(html) do
process "h1", extractor("headers[]"=>:text)
attr :headers
end
scraper.scrape
assert scraper.headers.is_a?(Array)
assert_equal 2, scraper.headers.size
assert_equal "first", scraper.headers[0]
assert_equal "second", scraper.headers[1]
end
def test_hash_extractors
html = <<-EOF
EOF
scraper = new_scraper(html) do
process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
attr :header
end
scraper.scrape
assert scraper.header.is_a?(Hash)
assert_equal 3, scraper.header.size
assert_equal "1", scraper.header[:id]
assert_equal "header", scraper.header[:class]
assert_equal "first", scraper.header[:text]
end
def test_multi_value_extractors
html = <<-EOF
EOF
scraper = new_scraper(html) do
process "h1", [:text, :kls]=>Scraper.define {
process "*", :text=>:text, :kls=>"@class"
}
end
result = scraper.scrape
assert "first", result.text
assert "header", result.kls
end
def test_conditional_extractors
# Look for id attribute (second header only),
# if not found look for class attribute (first
# two headers), otherwise just get text (third
# header).
html = <<-EOF
first
second
third
EOF
scraper = new_scraper(html) do
process "h1", extractor("headers[]"=>["@id", "@class", :text])
attr :headers
end
scraper.scrape
assert scraper.headers.is_a?(Array)
assert_equal 3, scraper.headers.size
assert_equal "foo", scraper.headers[0]
assert_equal "bar", scraper.headers[1]
assert_equal "third", scraper.headers[2]
end
DIVS_ST_ND = <<-EOF
first
second
EOF
def test_accessors_from_extractor
scraper = new_scraper(DIVS_ST_ND) do
process_first "div", :div_id=>"@id", :div_text=>:text
result :div_id
end
value = scraper.scrape
assert_equal "1", value
scraper = new_scraper(DIVS_ST_ND) do
process_first "div", :div_id=>"@id", :div_text=>:text
result :div_id, :div_text
end
value = scraper.scrape
assert_equal "1", value.div_id
assert_equal "first", value.div_text
scraper = new_scraper(DIVS_ST_ND) do
process_first "div", :div_id=>"@id", :div_text=>:text
end
value = scraper.scrape
assert_equal "1", value.div_id
assert_equal "first", value.div_text
scraper = new_scraper(DIVS_ST_ND) do
attr_accessor :div_class
process_first "div", :div_id=>"@id", :div_text=>:text
result :div_id, :div_class
end
value = scraper.scrape
assert_equal "1", value.div_id
assert_raise(NoMethodError) { value.div_text }
scraper = new_scraper(DIVS_ST_ND) do
process "div", "div_ids[]"=>"@id"
result :div_ids
end
value = scraper.scrape
assert_equal "1", value[0]
assert_equal "2", value[1]
end
def test_array_accessors
scraper = new_scraper(DIVS_ST_ND) do
array :div_id, :div_text
process "div", :div_id=>"@id", :div_text=>:text
result :div_id, :div_text
end
value = scraper.scrape
assert_equal 2, value.div_id.size
assert_equal 2, value.div_text.size
assert_equal "1", value.div_id[0]
assert_equal "2", value.div_id[1]
assert_equal "first", value.div_text[0]
assert_equal "second", value.div_text[1]
end
#
# Root element tests.
#
HTML_EMPTY = <<-EOF
EOF
def test_scrape_body_by_default
scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
scraper.class.instance_eval do
process "head" do |element| @head = element end
attr :head
process "body" do |element| @body = element end
attr :body
end
scraper.scrape
assert scraper.head
assert scraper.body
end
def test_changing_root_element
only_header = new_scraper(HTML_EMPTY) do
root_element "head"
process "head" do |element| @head = element end
attr :head
process "body" do |element| @body = element end
attr :body
end
only_body = Class.new(only_header.class).new(HTML_EMPTY)
only_body.class.root_element "body"
both_parts = Class.new(only_body.class).new(HTML_EMPTY)
both_parts.class.root_element nil
# We set this scraper to begin with the head element,
# so we can see the head element, but not the body.
only_header.scrape
assert only_header.head
assert only_header.body.nil?
# Now switch to a scraper that processes the body element,
# skipping the header.
only_body.scrape
assert only_body.head.nil?
assert only_body.body
# Now switch to a scraper that doesn't specify a root element,
# and it will process both header and body.
both_parts.scrape
assert both_parts.head
assert both_parts.body
end
# Test prepare/result.
def test_prepare_and_result
# Extracting the attribute skips the second match.
scraper = new_scraper(DIVS123) do
process("div") { |element| @count +=1 }
define_method(:prepare) { @count = 1 }
define_method(:result) { @count }
end
result = scraper.scrape
assert_equal 4, result
end
def test_changing_document_from_prepare
# Extracting the attribute skips the second match.
scraper = new_scraper(DIVS123) do
selector :divs, "div"
define_method :prepare do |document|
@document = divs(document)[1]
end
array :ids
process "div", :ids=>"@id"
result :ids
end
result = scraper.scrape
assert_equal 1, result.size
assert_equal "2", result[0]
end
def test_anonymous_scrapers
scraper = Scraper.define do
array :ids
process "div", :ids=>"@id"
result :ids
end
result = scraper.scrape(DIVS123)
assert_equal "1", result[0]
assert_equal "2", result[1]
assert_equal "3", result[2]
end
def test_named_rules
scraper = Scraper.define do
array :ids1, :ids2
process :main, "div", :ids1=>"@id"
process :main, "div", :ids2=>"@id"
result :ids1, :ids2
end
result = scraper.scrape(DIVS123)
assert_equal nil, result.ids1
assert_equal 3, result.ids2.size
assert_equal "1", result.ids2[0]
assert_equal "2", result.ids2[1]
assert_equal "3", result.ids2[2]
end
protected
def new_scraper(what, &block)
cls = Class.new(Scraper::Base)
cls.root_element nil
cls.parser :html_parser
cls.class_eval &block if block
cls.new(what)
end
end
# Repeats the same set of tests, but using Tidy instead of HTMLParser.
class ScraperUsingTidyTest < ScraperTest
protected
def new_scraper(what, &block)
cls = Class.new(Scraper::Base)
cls.root_element nil
cls.parser :tidy
cls.class_eval &block if block
cls.new(what)
end
end