# o hai! I need your help. module WWMD LINKS_REGEXP = [ /window\.open\s*\(([^\)]+)/i, /open_window\s*\(([^\)]+)/i, /window\.location\s*=\s*(['"][^'"]+['"])/i, /.*location.href\s*=\s*(['"][^'"]+['"])/i, /document.forms.*action\s*=\s*(['"][^'"]+['"])/i, /Ajax\.Request\s*\((['"][^'"]+['"])/i, ] class Scrape attr_accessor :debug attr_accessor :warn attr_accessor :links # links found on page attr_accessor :jlinks # links to javascript includes attr_reader :hdoc # create a new scrape object using passed HTML def initialize(page='<>') @page = page @hdoc = HDOC.parse(@page) @links = Array.new @debug = false @warn = false end # reset this scrape object (called by WWMD::Page) def reset(page) @page = page @hdoc = HDOC.parse(@page) @links = Array.new end # scan the passed string for the configured regular expressions # and return them as an array def urls_from_regexp(content,re,split=0) ret = [] scrape = content.scan(re) scrape.each do |url| # cheat and take split string(,)[split] add = url.to_s.split(',')[split].gsub(/['"]/, '') next if (add == '' || add.nil?) ret << add end return ret end # xpath search for tags and return the passed attribute # urls_from_xpath("//a","href") def urls_from_xpath(xpath,attr) ret = [] @hdoc.search(xpath).each do |elem| url = elem[attr] next if url.empty? ret << url.strip end return ret end # NEED to move this to external configuration # # list of urls we don't care to store in our links list def reject_links putw "WARN: override reject_links in helper script" if @warn default_reject_links end # default reject links (override using reject_links in helper script) def default_reject_links @links.reject! do |url| url.nil? || url.extname == ".css" || url.extname == ".pdf" || url =~ /javascript:/i || url =~ /mailto:/i || url =~ /[\[\]]/ || url =~ /^#/ end end # return an array of Form objects for forms on page def for_forms ret = [] @hdoc.search("//form").each { |f| ret << Form.new(f) } ret end # use xpath searches to get # * //a href # * //area href # * //frame src # * //iframe src # * //form action # * //meta refresh content urls # then get //script tags and regexp out links in javascript function calls # from elem.inner_html def for_links(reject=true) self.urls_from_xpath("//a","href").each { |url| @links << url }; # get elements self.urls_from_xpath("//area","href").each { |url| @links << url }; # get elements self.urls_from_xpath("//frame","src").each { |url| @links << url }; # get elements self.urls_from_xpath("//iframe","src").each { |url| @links << url }; # get