module WWMD # when a WWMD::Page object is created, it created its own WWMD::Spider object # which can be accessed using page.spider.method. The page.set_data # method calls page.spider.add with the current url and a list of scraped # links from the page. This class doesn't do any real heavy lifting. # # a simple spider can be written just by recursing through page.spider.next until # it's empty. class Spider attr_accessor :queued attr_accessor :visited attr_accessor :bypass attr_accessor :local_only attr_reader :opts attr_accessor :ignore attr_accessor :csrf_token DEFAULT_IGNORE = [ /logoff/i, /logout/i, ] # pass me opts and an array of regexps to ignore # we have a set of sane(ish) defaults here def initialize(opts={},ignore=nil) @opts = opts @visited = [] @queued = [] @local_only = true @csrf_token = nil if !opts[:spider_local_only].nil? @local_only = opts[:spider_local_only] end @ignore = ignore || DEFAULT_IGNORE end # push an url onto the queue def push_url(url) return false if _check_ignore(url) if @local_only return false if !(url =~ /#{@opts[:base_url]}/) end return false if (@visited.include?(url) or @queued.include?(url)) @queued.push(url) true end alias_method :push, :push_url # skip items in the queue def skip(tim=1) tim.times { |i| @queued.shift } true end # get the next url in the queue def get_next queued.shift end alias_method :next, :get_next # more elements in the queue? def next? !queued.empty? end # get the last ul we visited? this doesn't look right def get_last(url) tmp = @visited.reject { |v| v =~ /#{url}/ } return tmp[-1] end # show the visited list (or the entry in the list at [id]) def show_visited(id=nil) if id.nil? @visited.each_index { |i| putx i.to_s + " :: " + @visited[i].to_s } return nil else return @visited[id] end end alias_method :v, :show_visited # return the current queue (or the entry in the queue at [id] def show_queue(id=nil) if id.nil? @queued.each_index { |i| putx i.to_s + " :: " + @queued[i].to_s } return nil else return @queued[id] end end alias_method :q, :show_queue # add url to queue def add(url='',links=[]) return nil if @visited.include?(url) @visited.push(url) links.each { |l| self.push_url l } nil end # set up the ignore list # ignore list is an array of regexp objects # remember to set this up before calling any Page methods def set_ignore(arr) @ignore = arr end def _de_csrf(url) return url if @csrf_token.nil? act,params = url.clopa form = params.to_form return url if !form.has_key?(@csrf_token) form[@csrf_token] = '' url = act + form.to_get end def _check_ignore(url) @ignore.each { |x| return true if (url =~ x) } return false end end end