# o hai! I need your help.
module WWMD
# Regular expressions that pick URL-like arguments out of JavaScript source.
# Each pattern has exactly one capture group; all are case-insensitive.
# Presumably fed to Scrape#urls_from_regexp -- the caller is not visible here.
#
# NOTE(review): the dots in "location.href" and "document.forms" are not
# escaped, so they match any character, and the leading ".*" on the
# location.href pattern is greedy -- confirm both are intentional.
LINKS_REGEXP = [
# window.open(...): captures the raw argument text up to the first ")"
/window\.open\s*\(([^\)]+)/i,
# open_window(...): captures the raw argument text up to the first ")"
/open_window\s*\(([^\)]+)/i,
# window.location = "...": captures the quoted value (quotes included)
/window\.location\s*=\s*(['"][^'"]+['"])/i,
# ...location.href = "...": captures the quoted value (quotes included)
/.*location.href\s*=\s*(['"][^'"]+['"])/i,
# document.forms...action = "...": captures the quoted value (quotes included)
/document.forms.*action\s*=\s*(['"][^'"]+['"])/i,
# Ajax.Request("...": captures the quoted first argument (quotes included)
/Ajax\.Request\s*\((['"][^'"]+['"])/i,
]
class Scrape
# debug flag; initialize sets it to false (it is not read in the visible
# part of this class)
attr_accessor :debug
# when true, reject_links prints a warning through putw before filtering
attr_accessor :warn
# links found on page; initialize and reset both replace it with an empty Array
attr_accessor :links # links found on page
# links to javascript includes
# NOTE(review): neither initialize nor reset assigns @jlinks, so this is nil
# until something else sets it -- confirm where it is populated
attr_accessor :jlinks # links to javascript includes
# parsed document returned by HDOC.parse for the current page (read-only)
attr_reader :hdoc
# Build a scrape object around the passed HTML.
#
# page:: HTML source as a String (defaults to '<>')
#
# Stores the page, parses it with HDOC.parse, starts with an empty link
# list, and switches both the debug and warn flags off.
def initialize(page='<>')
  @page  = page
  @hdoc  = HDOC.parse(page)
  @links = []
  @debug = @warn = false
end
# Point this scrape object at a new page (called by WWMD::Page).
#
# page:: HTML source for the new page
#
# Re-parses the page with HDOC.parse and discards previously collected
# links. The debug and warn flags are left alone. Returns the new, empty
# links Array.
def reset(page)
  @hdoc = HDOC.parse(@page = page)
  @links = []
end
# Scan a string with a regular expression and return the URL-like pieces
# found, as an Array of Strings.
#
# content:: String to scan (e.g. the body of a script tag)
# re::      Regexp to apply; its captures are joined to form the match text
# split::   which comma-separated field of the match text to keep
#           (0 = first argument of a captured argument list)
#
# For every match the text is split on "," and field +split+ is taken,
# quote characters are removed and surrounding whitespace is stripped.
# Matches that have no such field, or that end up blank, are skipped.
def urls_from_regexp(content,re,split=0)
  ret = []
  content.scan(re).each do |match|
    # scan yields an Array of captures when the regexp has groups and a
    # String otherwise. Array#to_s is the inspect form on Ruby >= 1.9
    # (which would leak brackets and quotes into the result), so join the
    # captures explicitly instead.
    text = Array(match).join
    # cheat and take split string(,)[split]
    field = text.split(',')[split]
    next if field.nil? # fewer comma separated fields than requested
    add = field.gsub(/['"]/, '').strip
    next if add.empty?
    ret << add
  end
  ret
end
# Run an xpath search against the parsed document and collect one
# attribute from every matching element.
#
#   urls_from_xpath("//a","href")
#
# xpath:: xpath expression handed to @hdoc.search
# attr::  attribute name looked up on each element with elem[attr]
#
# Returns an Array of attribute values with surrounding whitespace
# stripped. Elements that lack the attribute (nil) and values that are
# blank after stripping are skipped.
def urls_from_xpath(xpath,attr)
  ret = []
  @hdoc.search(xpath).each do |elem|
    url = elem[attr]
    next if url.nil? # attribute not present on this element
    url = url.strip
    next if url.empty? # strip first so whitespace-only values are dropped too
    ret << url
  end
  ret
end
# TODO: move this to external configuration
#
# Drop the urls we don't care to store from our links list.
# This stock version only delegates to default_reject_links (and returns
# its result); when @warn is set it first prints a reminder, via putw,
# that a helper script is expected to override it.
def reject_links
  if @warn
    putw "WARN: override reject_links in helper script"
  end
  default_reject_links
end
# Default link filter (override using reject_links in helper script).
#
# Removes, in place, every entry of @links that is nil, has a ".css" or
# ".pdf" extname, contains "javascript:" or "mailto:" (case-insensitive),
# contains a square bracket, or has a line starting with "#".
# Returns whatever Array#reject! returns: nil when nothing was removed,
# otherwise the links Array.
def default_reject_links
  skip_exts = [".css", ".pdf"]
  skip_patterns = [/javascript:/i, /mailto:/i, /[\[\]]/, /^#/]
  @links.reject! do |link|
    link.nil? ||
      skip_exts.any? { |ext| link.extname == ext } ||
      skip_patterns.any? { |pattern| link =~ pattern }
  end
end
# Wrap every form element found on the page (xpath "//form") in a Form
# object and return them as an Array, in document search order.
def for_forms
  @hdoc.search("//form").map { |form_elem| Form.new(form_elem) }
end
# use xpath searches to get
# * //a href
# * //area href
# * //frame src
# * //iframe src
# * //form action
# * //meta refresh content urls
# then get //script tags and regexp out links in javascript function calls
# from elem.inner_html
def for_links(reject=true)
self.urls_from_xpath("//a","href").each { |url| @links << url }; # get elements
self.urls_from_xpath("//area","href").each { |url| @links << url }; # get elements
self.urls_from_xpath("//frame","src").each { |url| @links << url }; # get elements
self.urls_from_xpath("//iframe","src").each { |url| @links << url }; # get