=begin
    Copyright 2010-2013 Tasos Laskos <tasos.laskos@gmail.com>

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
=end

require 'typhoeus'
require 'singleton'
require 'digest/sha1'

module Arachni

lib = Options.dir['lib']
require lib + 'typhoeus/utils'
require lib + 'typhoeus/hydra'
require lib + 'typhoeus/request'
require lib + 'typhoeus/response'
require lib + 'utilities'
require lib + 'mixins/observable'

#
# Provides a system-wide, simple and high-performance HTTP interface.
#
# @author Tasos "Zapotek" Laskos <tasos.laskos@gmail.com>
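#
# @example Typical use through the singleton -- an illustrative sketch; the
#   target URL is hypothetical and `run` blocks until all queued requests
#   have completed:
#
#       Arachni::HTTP.get( 'http://example.com/' ) do |res|
#           puts res.code
#       end
#
#       # Nothing is sent over the wire until the queue is run.
#       Arachni::HTTP.run
#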
class HTTP
    include Singleton

    include Module::Output
    include Utilities
    include Mixins::Observable

    #
    # {HTTP} error namespace.
    #
    # All {HTTP} errors inherit from and live under it.
    #
    # @author Tasos "Zapotek" Laskos <tasos.laskos@gmail.com>
    #
    class Error < Arachni::Error
    end

    require Options.dir['lib'] + 'http/cookie_jar'

    # Default maximum concurrency for HTTP requests.
    MAX_CONCURRENCY = 20

    # Default maximum redirect limit.
    REDIRECT_LIMIT = 20

    # Don't let the request queue grow more than this amount; if it does,
    # run the queued requests to unload it.
    MAX_QUEUE_SIZE = 5000

    # Default request timeout (in milliseconds).
    HTTP_TIMEOUT = 50000

    # Maximum amount of paths for which to keep custom-404 signatures.
    CUSTOM_404_CACHE_SIZE = 250

    # @return [String] Framework seed/target URL.
    attr_reader :url

    # @return [Hash] Default headers for each request.
    attr_reader :headers

    # @return [CookieJar]
    attr_reader :cookie_jar

    # @return [Integer] Amount of performed requests.
    attr_reader :request_count

    # @return [Integer] Amount of received responses.
    attr_reader :response_count

    # @return [Integer] Amount of timed-out requests.
    attr_reader :time_out_count

    # @return [Integer]
    #   Sum of the response times of the running requests (of the current burst).
    attr_reader :curr_res_time

    # @return [Integer]
    #   Amount of responses received for the running requests (of the current burst).
    attr_reader :curr_res_cnt

    def initialize
        reset
    end

    #
    # Re-initializes the singleton.
    #
    # @return [Arachni::HTTP] self
    #
    def reset( hooks_too = true )
        clear_observers if hooks_too

        opts = Options

        req_limit = opts.http_req_limit || MAX_CONCURRENCY

        hydra_opts = {
            max_concurrency: req_limit,
            method:          :auto
        }

        if opts.url
            parsed_url = uri_parse( opts.url )
            hydra_opts.merge!(
                username: parsed_url.user,
                password: parsed_url.password
            )
        end

        @url = opts.url.to_s
        @url = nil if @url.empty?

        @hydra      = Typhoeus::Hydra.new( hydra_opts )
        @hydra_sync = Typhoeus::Hydra.new( hydra_opts.merge( max_concurrency: 1 ) )

        @hydra.disable_memoization
        @hydra_sync.disable_memoization

        @headers = {
            'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate',
            'User-Agent'      => opts.user_agent
        }
        @headers['From'] = opts.authed_by if opts.authed_by
        @headers.merge!( opts.custom_headers )

        @cookie_jar = CookieJar.new( opts.cookie_jar )
        update_cookies( opts.cookies ) if opts.cookies
        update_cookies( opts.cookie_string ) if opts.cookie_string

        proxy_opts = {}
        proxy_opts = {
            proxy:          "#{opts.proxy_host}:#{opts.proxy_port}",
            proxy_username: opts.proxy_username,
            proxy_password: opts.proxy_password,
            proxy_type:     opts.proxy_type
        } if opts.proxy_host

        opts.redirect_limit ||= REDIRECT_LIMIT

        @opts = {
            follow_location:               false,
            max_redirects:                 opts.redirect_limit,
            disable_ssl_peer_verification: true,
            timeout:                       opts.http_timeout || HTTP_TIMEOUT
        }.merge( proxy_opts )

        @request_count  = 0
        @response_count = 0
        @time_out_count = 0

        @curr_res_time = 0
        @curr_res_cnt  = 0
        @burst_runtime = 0

        @queue_size = 0

        @after_run = []

        @_404 = Hash.new

        self
    end

    # Runs all queued requests.
    def run
        exception_jail {
            @burst_runtime = nil
            hydra_run

            @after_run.each { |block| block.call }
            @after_run.clear

            call_after_run_persistent

            # Prune the custom-404 cache after the callbacks have been called.
            prune_custom_404_cache

            @curr_res_time = 0
            @curr_res_cnt  = 0
            true
        }
    rescue SystemExit
        raise
    rescue
        nil
    end

    # Aborts the running requests on a best-effort basis.
    def abort
        exception_jail { @hydra.abort }
    end

    # @return [Integer]
    #   Amount of time (in seconds) that the current burst has been running.
    def burst_runtime
        @burst_runtime.to_i > 0 ?
            @burst_runtime : Time.now - (@burst_runtime_start || Time.now)
    end

    # @return [Integer]
    #   Average response time for the running requests (i.e. the current burst).
    def average_res_time
        return 0 if @curr_res_cnt == 0
        @curr_res_time / @curr_res_cnt
    end

    # @return [Integer]
    #   Responses/second for the running requests (i.e. the current burst).
    def curr_res_per_second
        if @curr_res_cnt > 0 && burst_runtime > 0
            return (@curr_res_cnt / burst_runtime).to_i
        end
        0
    end

    #
    # Sets the maximum concurrency of HTTP requests.
    #
    # @param [Integer] concurrency
    #
    def max_concurrency=( concurrency )
        @hydra.max_concurrency = concurrency
    end

    # @return [Integer] Current maximum concurrency of HTTP requests.
    def max_concurrency
        @hydra.max_concurrency
    end

    # @return [Array<Arachni::Element::Cookie>] All cookies in the jar.
    def cookies
        @cookie_jar.cookies
    end

    #
    # Registers a block to be called (once) the next time a hydra run finishes.
    #
    # @return [Arachni::HTTP] self
    #
    def after_run( &block )
        @after_run << block
        self
    end

    #
    # Like {#after_run} but will not be removed after it's run.
    #
    # @return [Arachni::HTTP] self
    #
    def after_run_persistent( &block )
        add_after_run_persistent( &block )
        self
    end

    #
    # Makes a generic request.
    #
    # @param [URI] url
    # @param [Hash] opts Request options.
    # @option opts [Hash] :params ({}) Request parameters.
    # @option opts [Bool] :train (false)
    #   Force Arachni to analyze the HTML code looking for new elements.
    # @option opts [Bool] :async (true) Make the request async?
    # @option opts [Hash] :headers ({}) Extra HTTP request headers.
    # @param [Block] block Callback to be passed the response.
    #
    # @return [Typhoeus::Request]
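    #
    # @example A hedged sketch of a form submission (the URL and parameters
    #   are hypothetical; the callback fires once the queue is run):
    #
    #       request( 'http://example.com/login',
    #           method: :post,
    #           params: { 'username' => 'bob', 'password' => 'secret' }
    #       ) do |res|
    #           print_ok 'Logged in.' if res.code == 200
    #       end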
    #
    def request( url = @url, opts = {}, &block )
        fail ArgumentError, 'URL cannot be empty.' if !url

        params          = opts[:params] || {}
        train           = opts[:train]
        timeout         = opts[:timeout]
        cookies         = opts[:cookies] || {}

        async           = opts[:async]
        async           = true if async.nil?

        headers         = opts[:headers] || {}

        update_cookies  = opts[:update_cookies]
        follow_location = opts[:follow_location] || false

        #
        # The exception jail function wraps the block passed to it
        # in exception handling and runs it.
        #
        # How cool is Ruby? Seriously....
        #
        exception_jail( false ) {
            if !opts[:no_cookiejar]
                cookies = begin
                    @cookie_jar.for_url( url ).inject({}) do |h, c|
                        h[c.name] = c.value
                        h
                    end.merge( cookies )
                rescue => e
                    print_error "Could not get cookies for URL '#{url}' from the cookie jar (#{e})."
                    print_error_backtrace e
                    cookies
                end
            end

            headers = @headers.merge( headers )
            headers['Cookie'] ||= cookies.
                map { |k, v| "#{cookie_encode( k )}=#{cookie_encode( v )}" }.
                join( ';' )

            headers.delete( 'Cookie' ) if headers['Cookie'].empty?
            headers.each { |k, v| headers[k] = Header.encode( v ) if v }

            # There are cases where the URL already has a query and we also
            # have some params to work with. Some webapp frameworks will break
            # or get confused...plus the URL will not be RFC compliant.
            #
            # Thus we need to merge the provided params with the params of the
            # URL query and remove the latter from the URL.
            cparams = params.dup
            curl    = normalize_url( url ).dup

            if opts[:method] != :post
                begin
                    parsed  = uri_parse( curl )
                    cparams = parse_url_vars( curl ).merge( cparams )
                    curl.gsub!( "?#{parsed.query}", '' ) if parsed.query
                rescue
                    return
                end
            else
                cparams = cparams.inject( {} ) do |h, (k, v)|
                    h[form_encode( k )] = form_encode( v ) if v && k
                    h
                end
            end

            opts = {
                headers: headers,
                params:  cparams.empty? ? nil : cparams,
                method:  opts[:method].nil? ? :get : opts[:method],
                body:    opts[:body]
            }.merge( @opts )

            opts[:follow_location] = follow_location if follow_location
            opts[:timeout]         = timeout if timeout

            req = Typhoeus::Request.new( curl, opts )
            req.train if train
            req.update_cookies if update_cookies

            queue( req, async, &block )
            req
        }
    end

    #
    # Gets a URL passing the provided query parameters.
    #
    # @param (see #request)
    # @return (see #request)
    #
    # @see #request
    #
    def get( url = @url, opts = {}, &block )
        request( url, opts, &block )
    end

    #
    # Posts a form to a URL with the provided parameters.
    #
    # @param (see #request)
    # @return (see #request)
    #
    # @see #request
    #
    def post( url = @url, opts = {}, &block )
        request( url, opts.merge( method: :post ), &block )
    end

    #
    # Sends an HTTP TRACE request to `url`.
    #
    # @param (see #request)
    # @return (see #request)
    #
    # @see #request
    #
    def trace( url = @url, opts = {}, &block )
        request( url, opts.merge( method: :trace ), &block )
    end

    #
    # Gets a URL, passing the provided parameters as cookies.
    #
    # @param (see #request)
    # @return (see #request)
    #
    # @see #request
    #
    def cookie( url = @url, opts = {}, &block )
        opts[:cookies] = (opts[:params] || {}).dup
        opts[:params]  = nil
        request( url, opts, &block )
    end

    #
    # Gets a URL, passing the provided parameters as request headers.
    #
    # @param (see #request)
    # @return (see #request)
    #
    # @see #request
    #
    def header( url = @url, opts = {}, &block )
        opts[:headers] ||= {}
        opts[:headers].merge!( (opts[:params] || {}).dup )
        opts[:params] = nil
        request( url, opts, &block )
    end

    #
    # Executes a `block` under a sandbox.
    #
    # Cookies or new callbacks set as a result of the block won't affect the
    # HTTP singleton.
    #
    # @param [Block] block
    #
    # @return [Object] Return value of the block.
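    #
    # @example A minimal sketch (the cookie string is hypothetical); changes
    #   made inside the block are rolled back once it returns:
    #
    #       sandbox do |http|
    #           http.update_cookies( 'session=temporary' )
    #           # ...experiment freely here...
    #       end
    #       # The singleton's cookie jar is back to its previous state.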
    #
    def sandbox( &block )
        h = {}
        instance_variables.each do |iv|
            val   = instance_variable_get( iv )
            h[iv] = begin
                val.deep_clone
            rescue
                val.dup rescue val
            end
        end

        hooks = {}
        @__hooks.each { |k, v| hooks[k] = v.dup }

        ret = block.call( self )

        h.each { |iv, val| instance_variable_set( iv, val ) }
        @__hooks = hooks

        ret
    end

    #
    # Updates the cookie jar with the passed cookies.
    #
    # @param [Array<String, Hash, Arachni::Element::Cookie>] cookies
    #
    def update_cookies( cookies )
        @cookie_jar.update( cookies )

        # Update the framework cookies.
        Arachni::Options.cookies = @cookie_jar.cookies
    end
    alias :set_cookies :update_cookies

    #
    # Extracts cookies from an HTTP response and updates the cookie jar.
    #
    # It also executes callbacks added with {#on_new_cookies}.
    #
    # @param [Typhoeus::Response] res
    #
    def parse_and_set_cookies( res )
        cookies = Cookie.from_response( res )
        update_cookies( cookies )

        call_on_new_cookies( cookies, res )
    end

    # @param [Block] block
    #   To be passed the new cookies and the response that set them.
    def on_new_cookies( &block )
        add_on_new_cookies( &block )
    end

    #
    # Checks whether or not the provided response is a custom 404 page.
    #
    # @param [Typhoeus::Response] res The response to check.
    # @param [Block] block
    #   To be passed `true` or `false` depending on the result.
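    #
    # @example A hedged usage sketch, assuming `res` holds a response from an
    #   earlier request (the probes are async, so the verdict only arrives
    #   during a run):
    #
    #       custom_404?( res ) do |is_custom|
    #           print_info 'The server masks 404s with a custom page.' if is_custom
    #       end
    #       run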
    #
    def custom_404?( res, &block )
        precision = 2

        path = get_path( res.effective_url )

        uri          = uri_parse( res.effective_url )
        trv_back     = File.dirname( uri.path )
        trv_back_url = uri.scheme + '://' + uri.host + ':' + uri.port.to_s + trv_back
        trv_back_url += '/' if trv_back_url[-1] != '/'

        # 404 probes.
        generators = [
            # Get a random path with an extension.
            proc { path + random_string + '.' + random_string[0..precision] },

            # Get a random path without an extension.
            proc { path + random_string },

            # Move up a dir and get a random file.
            proc { trv_back_url + random_string },

            # Move up a dir and get a random file with an extension.
            proc { trv_back_url + random_string + '.' + random_string[0..precision] },

            # Get a random directory.
            proc { path + random_string + '/' }
        ]

        gathered = 0
        body     = res.body

        if !path_analyzed_for_custom_404?( path )
            generators.each.with_index do |generator, i|
                _404_signatures_for_path( path )[i] ||= {}

                precision.times {
                    get( generator.call, follow_location: true ) do |c_res|
                        gathered += 1

                        if gathered == generators.size * precision
                            path_analyzed_for_custom_404( path )

                            # Save the hash of the refined responses, no sense
                            # in wasting space.
                            _404_signatures_for_path( path ).each do |c404|
                                c404[:rdiff] = c404[:rdiff].hash
                            end

                            block.call is_404?( path, body )
                        else
                            _404_signatures_for_path( path )[i][:body] ||= c_res.body

                            _404_signatures_for_path( path )[i][:rdiff] =
                                _404_signatures_for_path( path )[i][:body].
                                    rdiff( c_res.body )

                            _404_signatures_for_path( path )[i][:rdiff_words] =
                                _404_signatures_for_path( path )[i][:rdiff].
                                    words.map( &:hash )
                        end
                    end
                }
            end
        else
            block.call is_404?( path, body )
        end

        nil
    end

    def self.method_missing( sym, *args, &block )
        instance.send( sym, *args, &block )
    end

    private

    def prune_custom_404_cache
        return if @_404.size <= CUSTOM_404_CACHE_SIZE

        @_404.keys.each do |path|
            # If the path hasn't been analyzed yet don't even consider
            # removing it. Technically, at this point (after #hydra_run) there
            # should not be any non-analyzed paths but better be sure.
            next if !@_404[path][:analyzed]

            # We've done enough...
            return if @_404.size < CUSTOM_404_CACHE_SIZE

            @_404.delete( path )
        end
    end

    def _404_data_for_path( path )
        @_404[path] ||= {
            analyzed:   false,
            signatures: []
        }
    end

    def _404_signatures_for_path( path )
        _404_data_for_path( path )[:signatures]
    end

    def path_analyzed_for_custom_404?( path )
        _404_data_for_path( path )[:analyzed]
    end

    # Marks `path` as having been analyzed for custom-404 behavior.
    def path_analyzed_for_custom_404( path )
        _404_data_for_path( path )[:analyzed] = true
    end

    def hydra_run
        @running = true

        @burst_runtime     ||= 0
        @burst_runtime_start = Time.now

        @hydra.run
        @queue_size = 0

        @running = false

        @burst_runtime += Time.now - @burst_runtime_start
    end

    #
    # Queues a {Typhoeus::Request} and calls the following callbacks:
    #
    # * `on_queue` -- intercepts a queued request and gets passed the original
    #   request and the async flag. If the block returns one or more request
    #   objects these will be queued instead of the original request.
    # * `on_complete` -- calls the block with each response as it arrives.
    #
    # @param [Typhoeus::Request] req The request to queue.
    # @param [Bool] async Run the request async?
    # @param [Block] block Callback.
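    #
    # @example Illustrative sketch of a queue hook. (Assumes the observable
    #   mixin pairs `call_on_queue` with an `add_on_queue` registrar, as it
    #   does for the other hooks used in this file; the replacement request
    #   is hypothetical.)
    #
    #       add_on_queue do |request, async|
    #           # Returning a Typhoeus::Request queues it instead of the original.
    #           Typhoeus::Request.new( request.url, method: :head )
    #       end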
    #
    def queue( req, async = true, &block )
        requests   = call_on_queue( req, async )
        requests ||= req

        [requests].flatten.reject { |p| !p.is_a?( Typhoeus::Request ) }.
            each { |request| forward_request( request, async, &block ) }
    end

    #
    # Performs the actual queueing of requests, passes them to Hydra and sets
    # up callbacks and hooks.
    #
    # @param [Typhoeus::Request] req
    # @param [Bool] async
    # @param [Block] block Callback.
    #
    def forward_request( req, async = true, &block )
        req.id = @request_count

        @queue_size += 1

        !async ? @hydra_sync.queue( req ) : @hydra.queue( req )

        @request_count += 1

        print_debug '------------'
        print_debug 'Queued request.'
        print_debug "ID#: #{req.id}"
        print_debug "URL: #{req.url}"
        print_debug "Method: #{req.method}"
        print_debug "Params: #{req.params}"
        print_debug "Headers: #{req.headers}"
        print_debug "Train?: #{req.train?}"
        print_debug '------------'

        req.on_complete( true ) do |res|
            @response_count += 1
            @curr_res_cnt   += 1
            @curr_res_time  += res.start_transfer_time

            call_on_complete( res )

            parse_and_set_cookies( res ) if req.update_cookies?

            print_debug '------------'
            print_debug "Got response for request ID#: #{res.request.id}"
            print_debug "Status: #{res.code}"
            print_debug "Error msg: #{res.curl_error_message}"
            print_debug "URL: #{res.effective_url}"
            print_debug "Headers:\n#{res.headers}"
            print_debug "Parsed headers: #{res.headers_hash}"
            print_debug '------------'

            if res.timed_out?
                print_bad 'Request timed-out! -- ID# ' + res.request.id.to_s
                @time_out_count += 1
            end
        end

        req.on_complete( &block ) if block_given?

        if emergency_run?
            print_info 'Request queue reached its maximum size, performing an emergency run.'
            hydra_run
        end

        exception_jail { @hydra_sync.run } if !async
    end

    def emergency_run?
        @queue_size >= MAX_QUEUE_SIZE && !@running
    end

    def is_404?( path, body )
        # Give the rDiff algorithm a shot first, hoping that a comparison of
        # refined responses will be enough to give us a clear-cut positive.
        @_404[path][:signatures].each do |_404|
            return true if _404[:body].rdiff( body ).hash == _404[:rdiff]
        end

        # If the comparison of the refinements fails, compare them based on
        # how many words are different between them.
        @_404[path][:signatures].each do |_404|
            rdiff_body_words = _404[:body].rdiff( body ).words.map( &:hash )

            return true if (
                (_404[:rdiff_words] - rdiff_body_words) -
                (rdiff_body_words - _404[:rdiff_words])
            ).size < 25
        end

        false
    end

    def random_string
        Digest::SHA1.hexdigest( rand( 9999999 ).to_s )
    end

    def self.info
        { name: 'HTTP' }
    end

end
end