lib/wmap/utils/url_magic.rb in wmap-2.5.2 vs lib/wmap/utils/url_magic.rb in wmap-2.5.4

- old
+ new

@@ -3,17 +3,21 @@ # # A pure Ruby library for Internet web application discovery and tracking. # # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org> #++ -# require "uri" +require "watir" +require "selenium-webdriver" module Wmap - module Utils - module UrlMagic + module Utils + module UrlMagic extend self + # set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s) + Max_http_timeout=8000 + # Simple sanity check on a 'claimed' URL string. def is_url?(url) puts "Validate the URL format is valid: #{url}" if @verbose begin if url =~ /(http|https)\:\/\/((.)+)/i @@ -31,11 +35,11 @@ rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return false end end - + # Simple sanity check on a 'claimed' SSL enabled URL string def is_ssl?(url) puts "Validate if SSL is enabled on: #{url}" if @verbose begin url=url.strip @@ -47,66 +51,43 @@ rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return false end end - alias_method :is_https?, :is_ssl? - + alias_method :is_https?, :is_ssl? + # Simple sanity check on a 'claimed' web site base string. def is_site?(url) puts "Validate the website string format for: #{url}" if @verbose begin url=url.strip.downcase if is_url?(url) if url == url_2_site(url) return true else return false - end + end else puts "Unknown site format: #{url}" if @verbose return false end rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return nil end end - - # Check if URL is an absolute one - #def is_absolute?(url) - # puts "Validate if the url is absolute: #{url}" if @verbose - # begin - # url.strip! - # URI.absolute?(url) - # rescue => ee - # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose - # return false - # end - #end - - # Check if URL is relative one - #def is_relative?(url) - # begin - # url.strip! - # !is_absolute?(url) - # rescue => ee - # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose - # return false - # end - #end - + # Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com" def url_2_host (url) begin url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "") record1 = url.split('/') if record1[0].nil? - puts "Error process url: #{url}" + puts "Error process url: #{url}" return nil else - record2 = record1[0].split(':') + record2 = record1[0].split(':') return record2[0] end rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return nil @@ -118,12 +99,12 @@ puts "Retrieve service port on URL: #{url}" if @verbose begin ssl = (url =~ /https/i) url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "") record1 = url.split('/') - record2 = record1[0].split(':') - if (record2.length == 2) + record2 = record1[0].split(':') + if (record2.length == 2) puts "The service port: #{record2[1]}" if @verbose return record2[1].to_i elsif ssl puts "The service port: 443" if @verbose return 443 @@ -162,27 +143,27 @@ end # additional logic to handle uncommon url base structures unless is_fqdn?(host) case host # "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk" - when /\?|\#/ + when /\?|\#/ host=host.split(%r{\?|\#})[0] else #do nothing end end - # step 2, put the host:port pair back to the normal site format + # step 2, put the host:port pair back to the normal site format prot="https:" if port==443 if port==80 || port==443 site=prot+"//"+host+"/" else site=prot+"//"+host+":"+port.to_s+"/" end if site=~ /http/i #puts "Base found: #{site}" if @verbose return site - else + else raise "Problem encountered on method url_2_site: Unable to convert #{url}" return nil end rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose @@ -200,41 +181,39 @@ #puts "Path component found: #{path}" if @verbose return path rescue => ee puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose end - + end - + # Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true def urls_on_same_domain?(url1, url2) puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose - begin - host1=url_2_host(url1) - host2=url_2_host(url2) - return get_domain_root(host1) == get_domain_root(host2) - rescue => ee - puts "Error searching the object content: #{ee}" if @verbose - return nil - end - end + host1=url_2_host(url1) + host2=url_2_host(url2) + return get_domain_root(host1) == get_domain_root(host2) + rescue => ee + puts "Error searching the object content: #{ee}" if @verbose + return nil + end # Input is host and open port, output is a URL for valid http response code or nil def host_2_url (host,port=80) puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose begin host=host.strip - if port.to_i == 80 + if port.to_i == 80 url_1 = "http://" + host + "/" elsif port.to_i ==443 url_1 = "https://" + host + "/" else url_1 = "http://" + host + ":" + port.to_s + "/" url_2 = "https://" + host + ":" + port.to_s + "/" end puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose - checker=Wmap::UrlChecker.new + checker=Wmap::UrlChecker.new if checker.response_code(url_1) != 10000 puts "Found URL: #{url_1}" if @verbose return url_1 elsif checker.response_code(url_2) != 10000 puts "Found URL: #{url_2}" if @verbose @@ -245,12 +224,12 @@ end rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return nil end - end - + end + # Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html' def make_absolute(base, relative_url) puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose begin absolute_url = nil; @@ -264,16 +243,16 @@ rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return nil end end - + # Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders' def create_absolute_url_from_base(potential_base, relative_url) begin #puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose - naked_base = url_2_site(potential_base).strip.chop + naked_base = url_2_site(potential_base).strip.chop puts "Found absolute URL: #{naked_base+relative_url}" if @verbose return naked_base + relative_url rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return nil @@ -307,23 +286,23 @@ rescue => ee puts "Exception on method #{__method__}: #{ee}" if @verbose return nil end end - + # Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before # See http://en.wikipedia.org/wiki/URL_normalization for more explanation def normalize_url(url) begin url.strip! - # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/' + # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/' # Normalize the base - base=url_2_site(url) + base=url_2_site(url) # Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/' base=base.sub(/\.\/$/,'/') # Normalize the relative path, case#1 - # retrieve the file path and remove the first '/' or '.', + # retrieve the file path and remove the first '/' or '.', # i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath' path=url_2_path(url).sub(/^(\/|\.)*/,'') # Normalize the relative path, case#2 # Replace dot-segments. "/../" and "/./" with "/", i.e. 'http://www.example.com/../a/b/../c/./d.html" => 'http://www.example.com/a/c/d.html' path=path.gsub(/\/\.{1,2}\//,'/') @@ -335,9 +314,138 @@ rescue => ee puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose return url end end - + + + # Test the URL and return the response code + def response_code (url) + puts "Check the http response code on the url: #{url}" if @verbose + code = 10000 # All unknown url connection exceptions go here + raise "Invalid url: #{url}" unless is_url?(url) + url=url.strip.downcase + timeo = Max_http_timeout/1000.0 + uri = URI.parse(url) + http = Net::HTTP.new(uri.host, uri.port) + http.open_timeout = timeo + http.read_timeout = timeo + if (url =~ /https\:/i) + http.use_ssl = true + #http.ssl_version = :SSLv3 + # Bypass the remote web server cert validation test + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + end + request = Net::HTTP::Get.new(uri.request_uri) + response = http.request(request) + puts "Server response the following: #{response}" if @verbose + code = response.code.to_i + #response.finish if response.started?() + @url_code=Hash.new unless @url_code + @url_code[url]=code + puts "Response code on #{url}: #{code}" if @verbose + return code + rescue Exception => ee + puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose + case ee + # rescue "Connection reset by peer" error type + when Errno::ECONNRESET + code=104 + when Errno::ECONNABORTED,Errno::ETIMEDOUT + #code=10000 + when Timeout::Error # Quick fix + if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure + http.ssl_version = :SSLv3 + response = http.request(request) + code = response.code.to_i + unless code.nil? + @ssl_version = http.ssl_version + end + end + else + #code=10000 + end + @url_code=Hash.new unless @url_code + @url_code[url]=code + return code + end + + # Given an URL, open the page, then return the DOM text from a normal user perspective + def open_page(url) + args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, read_timeout: Max_http_timeout/1000} + doc = Nokogiri::HTML(open(url, args)) + if doc.text.include?("Please enable JavaScript to view the page content") + puts "Invoke headless chrome through webdriver ..." if @verbose + #Selenium::WebDriver::Chrome.path = "/usr/local/bin/chromedriver" + #driver = Selenium::WebDriver.for :chrome + # http://watir.com/guides/chrome/ + args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate'] + browser = Watir::Browser.new :chrome, headless: true, options: {args: args} + browser.goto(url) + sleep(2) # wait for the loading + doc = Nokogiri::HTML(browser.html) + browser.close + end + puts doc.text if @verbose + return doc + rescue => ee + puts "Exception on method #{__method__} for #{url}: #{ee}" + browser.close unless browser.nil? + return doc.text + end + + # Test the URL / site and return the redirection location (3xx response code only) + def redirect_location (url) + puts "Test the redirection location for the url: #{url}" if @verbose + location="" + raise "Invalid url: #{url}" unless is_url?(url) + url=url.strip.downcase + timeo = Max_http_timeout/1000.0 + uri = URI.parse(url) + code = response_code (url) + if code >= 300 && code < 400 + http = Net::HTTP.new(uri.host, uri.port) + http.open_timeout = timeo + http.read_timeout = timeo + if (url =~ /https\:/i) + http.use_ssl = true + # Bypass the remote web server cert validation test + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + http.ssl_version = @ssl_version + end + request = Net::HTTP::Get.new(uri.request_uri) + response = http.request(request) + puts "Response: #{response}" if @verbose + case response + when Net::HTTPRedirection then + location = response['location'] + end + end + return location + rescue Exception => ee + puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose + return "" + end + alias_method :location, :redirect_location + + # Test the URL / Site and return the landing url location (recursive with the depth = 4 ) + def landing_location (depth=5, url) + depth -= 1 + return url if depth < 1 + timeo = Max_http_timeout/1000.0 + uri = URI.parse(url) + code = response_code (url) + if code >= 300 && code < 400 + url = redirect_location (url) + url = landing_location(depth,url) + else + return url + end + return url + rescue Exception => ee + puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose + end + + end end end