require 'mechanize'
require 'logger'
require 'json'
require 'uri'

# SpiderMech: a simple breadth-first crawler built on Mechanize.
# Starting from a single URL, it walks every same-domain link it finds,
# records each page's script/image/CSS assets, and can dump the result as JSON.
class SpiderMech
  attr_reader :queue
  attr_reader :crawled
  attr_reader :data

  def initialize(start_page)
    @logger = Logger.new 'spidermech.log'
    @start_page = start_page
    @queue = []
    @crawled = []
    @data = []
    @queue << @start_page
    @bot = Mechanize.new
  end

  # Number of queued links that have not been crawled yet.
  def left_in_queue
    @queue.count { |link| !@crawled.include?(link) }
  end

  def save_json
    filename = "#{URI.parse(@start_page).host}.json"
    @logger.info "Writing sitemap data to #{filename}"
    File.open(filename, 'w') { |f| f.write @data.to_json }
  end

  # Crawl until the queue is exhausted, then return the collected data.
  def run
    crawl until @queue.empty?
    @data
  end

  def crawl
    url = @queue.shift
    if @crawled.include? url
      # @logger.warn "Already crawled #{url}"
      return
    else
      @logger.info "Crawling #{url}"
      @logger.info "Left in Queue: #{left_in_queue}"
    end

    begin
      page = @bot.get url
    rescue StandardError => e
      @logger.error "Failed to fetch #{url}: #{e.message}"
      return
    end

    unless page.is_a? Mechanize::Page
      @logger.info 'File crawling is not supported.'
      return
    end

    @crawled << url

    # Collect all the assets on this page.
    data = {
      :url => url,
      :assets => {
        :scripts => find_scripts(page),
        :images => find_images(page),
        :css => find_css(page)
      },
      :links => []
    }

    page.links.each do |link|
      begin
        next if link.href.nil?

        if link.href[0] == '/'
          # Relative link -- same host, so queue it as-is.
          @queue << link.href
          data[:links] << link.href
        elsif link.href.start_with? @start_page
          # Absolute link that is still part of this domain.
          @queue << link.href
          data[:links] << link.href
        else
          # @logger.info "This link did not fall under our jurisdiction: #{link.href}"
        end
      rescue StandardError => e
        # @logger.error e
      end
    end

    @data << data
  end

  # The src of every <script> tag that has one.
  def find_scripts(page)
    page.search('script').map { |script| script['src'] }.compact
  end

  # The src of every <img> tag that has one.
  def find_images(page)
    page.search('img').map { |img| img['src'] }.compact
  end

  # The href of every <link> tag (stylesheets, icons, etc.).
  def find_css(page)
    page.search('link').map { |css| css['href'] }.compact
  end
end
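
# Usage sketch (not part of the original class): run the crawler from the
# command line against a start URL. The URL below is a placeholder; point it
# at the site you actually want to map.
if __FILE__ == $PROGRAM_NAME
  spider = SpiderMech.new('http://example.com/')
  spider.run        # crawls breadth-first until the queue is empty
  spider.save_json  # writes example.com.json with the collected sitemap data
end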