#!/usr/bin/env ruby
#
# Command-line entry point for cohesion.
#
# Parses the command-line options, runs Cohesion::Check.site against the
# given start URL and, when failures are returned, optionally writes them
# to a file as JSON or CSV.
#
# Exit status:
#   success (exit(true))  - the check returned no failures
#   failure (exit(false)) - the check returned at least one failure
# When no --url is given the usage text is printed and nothing is checked.

# Prefer the lib directory that sits next to this script, when present.
lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)

require 'cohesion'
require 'slop'
require 'csv'
require 'json' # needed for Array#to_json below; do not rely on a transitive require

opts = Slop.parse(:help => true) do
  banner 'Usage: cohesion [options]'

  on 'url=', 'URL to start crawl from'
  on 'internal_urls=', 'Url patterns to include', :as => Array
  on 'external_urls=', 'Url patterns to exclude', :as => Array
  on 'seed_urls=', "Seed urls", :as => Array
  on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
  on 'thread_count=', "Set the number of threads used", :as => Integer
  on 'timeout=', "Sets the timeout for http requests", :as => Integer
  on 'cache=', "Sets the timeout for the cache, leave blank for no cache"
  on 'authentication=', "Sets the authentication type used"
  on 'username=', "Username to use with authentication"
  on 'password=', "Password to use with authentication"

  on 'output=', 'Path to output data to'
  on 'output_format=', "Output format, csv or json"

  on 'v', 'verbose', 'Display crawl information'
  on 'd', 'debug', 'Display debug information'
  on 'w', 'web_statistics', 'Start web stats server'
end

if opts[:url]
  # Everything except the start URL and unset options is handed to the check.
  options = opts.to_hash.reject { |key, value| value.nil? || key == :url }

  # Each list option may instead name a file: when its first entry is the
  # path of an existing file, the list is replaced by that file's lines.
  [:seed_urls, :internal_urls, :external_urls].each do |key|
    path = options[key] && options[key][0]
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
    # File.read closes the handle; split("\n") keeps the original
    # behaviour of dropping trailing blank lines.
    options[key] = File.read(path).split("\n") if path && File.exist?(path)
  end

  failures = Cohesion::Check.site(opts[:url], options)

  exit(true) if failures.count.zero?

  if opts[:output]
    output = failures.map do |failure|
      { :error_page => failure[:issue][:url], :inbound_links => failure[:inbound] }
    end

    # Default to JSON. A local is used instead of assigning into the parsed
    # options object, so this does not depend on that object supporting []=.
    output_format = opts[:output_format] || "json"

    case output_format
    when "json"
      File.open(opts[:output], 'w') { |file| file.write(output.to_json) }
    when "csv"
      CSV.open(opts[:output], "wb") do |csv|
        csv << ["404 Url", "Page that contains link"]
        # One row per (error page, inbound link) pair.
        output.each do |entry|
          entry[:inbound_links].each { |link| csv << [entry[:error_page], link] }
        end
      end
    else
      # Previously an unrecognised format silently produced no file.
      warn "Unsupported output format '#{output_format}', expected csv or json; nothing written"
    end
  end

  exit(false)
else
  # NOTE(review): assumes Slop 3, where Slop#help returns the usage text.
  puts opts.help
end