lib/wmap/utils/url_magic.rb in wmap-2.7.0 vs lib/wmap/utils/url_magic.rb in wmap-2.7.1
- old
+ new
@@ -12,180 +12,165 @@
module Utils
module UrlMagic
extend self
# set hard stop limit of http time-out, in milliseconds (8 seconds in 2.7.0, raised to 15 seconds in 2.7.1), in order to avoid severe performance penalty for certain 'weird' site(s)
- Max_http_timeout=8000
+ Max_http_timeout=15000
# Simple sanity check on a 'claimed' URL string.
def is_url?(url)
puts "Validate the URL format is valid: #{url}" if @verbose
- begin
- if url =~ /(http|https)\:\/\/((.)+)/i
- host=$2.split('/')[0]
- host=host.split(':')[0]
- if is_ip?(host) or is_fqdn?(host)
- return true
- else
- return false
- end
+ if url =~ /(http|https)\:\/\/((.)+)/i
+ host=$2.split('/')[0]
+ host=host.split(':')[0]
+ if is_ip?(host) or is_fqdn?(host)
+ return true
else
- puts "Unknown URL format: #{url}" if @verbose
return false
end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
+ else
+ puts "Unknown URL format: #{url}" if @verbose
return false
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return false
end
# Simple sanity check on a 'claimed' SSL enabled URL string
def is_ssl?(url)
puts "Validate if SSL is enabled on: #{url}" if @verbose
- begin
- url=url.strip
- if is_url?(url) && url =~ /https/i
- return true
- else
- return false
- end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
+ url=url.strip
+ if is_url?(url) && url =~ /https/i
+ return true
+ else
return false
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return false
end
alias_method :is_https?, :is_ssl?
# Simple sanity check on a 'claimed' web site base string.
def is_site?(url)
- puts "Validate the website string format for: #{url}" if @verbose
- begin
- url=url.strip.downcase
- if is_url?(url)
- if url == url_2_site(url)
- return true
- else
- return false
- end
+ puts "Validate the website string format for: #{url}" if @verbose
+ url=url.strip.downcase
+ if is_url?(url)
+ if url == url_2_site(url)
+ return true
else
- puts "Unknown site format: #{url}" if @verbose
return false
end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
+ else
+ puts "Unknown site format: #{url}" if @verbose
+ return false
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
end
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
def url_2_host (url)
- begin
- url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
- record1 = url.split('/')
- if record1[0].nil?
- puts "Error process url: #{url}"
- return nil
- else
- record2 = record1[0].split(':')
- return record2[0]
- end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
+ url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
+ record1 = url.split('/')
+ if record1[0].nil?
+ puts "Error process url: #{url}"
return nil
+ else
+ record2 = record1[0].split(':')
+ return record2[0]
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
end
# Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
def url_2_port (url)
puts "Retrieve service port on URL: #{url}" if @verbose
- begin
- ssl = (url =~ /https/i)
- url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
- record1 = url.split('/')
- record2 = record1[0].split(':')
- if (record2.length == 2)
- puts "The service port: #{record2[1]}" if @verbose
- return record2[1].to_i
- elsif ssl
- puts "The service port: 443" if @verbose
- return 443
- else
- puts "The service port: 80" if @verbose
- return 80
- end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
+ ssl = (url =~ /https/i)
+ url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
+ record1 = url.split('/')
+ record2 = record1[0].split(':')
+ if (record2.length == 2)
+ puts "The service port: #{record2[1]}" if @verbose
+ return record2[1].to_i
+ elsif ssl
+ puts "The service port: 443" if @verbose
+ return 443
+ else
+ puts "The service port: 80" if @verbose
+ return 80
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
end
# Extract site in (host:port) format from a url: "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/"
def url_2_site (url)
puts "Retrieve the web site base for url: #{url}" if @verbose
- begin
- url = url.downcase
- url = url.sub(/^(.*?)http/i,'http')
- entry = url.split(%r{\/\/})
- prot=entry[0]
- # step 1, extract the host:port pair from the url
- host_port=entry[1].split(%r{\/})[0]
- if host_port =~ /\:/
- host=host_port.split(%r{\:})[0]
- port=host_port.split(%r{\:})[1].to_i
- elsif prot =~ /https/i
- host=host_port
- port=443
- elsif prot =~ /http/i
- host=host_port
- port=80
- else
- host=host_port
- #raise "Unknown url format: #{url}"
+ url = url.downcase
+ url = url.sub(/^(.*?)http/i,'http')
+ entry = url.split(%r{\/\/})
+ prot=entry[0]
+ # step 1, extract the host:port pair from the url
+ host_port=entry[1].split(%r{\/})[0]
+ if host_port =~ /\:/
+ host=host_port.split(%r{\:})[0]
+ port=host_port.split(%r{\:})[1].to_i
+ elsif prot =~ /https/i
+ host=host_port
+ port=443
+ elsif prot =~ /http/i
+ host=host_port
+ port=80
+ else
+ host=host_port
+ #raise "Unknown url format: #{url}"
+ end
+ # additional logic to handle uncommon url base structures
+ unless is_fqdn?(host)
+ case host
+ # "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
+ when /\?|\#/
+ host=host.split(%r{\?|\#})[0]
+ else
+ #do nothing
end
- # additional logic to handle uncommon url base structures
- unless is_fqdn?(host)
- case host
- # "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
- when /\?|\#/
- host=host.split(%r{\?|\#})[0]
- else
- #do nothing
- end
- end
- # step 2, put the host:port pair back to the normal site format
- prot="https:" if port==443
- if port==80 || port==443
- site=prot+"//"+host+"/"
- else
- site=prot+"//"+host+":"+port.to_s+"/"
- end
- if site=~ /http/i
- #puts "Base found: #{site}" if @verbose
- return site
- else
- raise "Problem encountered on method url_2_site: Unable to convert #{url}"
- return nil
- end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
+ end
+ # step 2, put the host:port pair back to the normal site format
+ prot="https:" if port==443
+ if port==80 || port==443
+ site=prot+"//"+host+"/"
+ else
+ site=prot+"//"+host+":"+port.to_s+"/"
+ end
+ if site=~ /http/i
+ #puts "Base found: #{site}" if @verbose
+ return site
+ else
+ raise "Problem encountered on method url_2_site: Unable to convert #{url}"
return nil
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
end
# Wrapper to return the relative path component of the URL, e.g. http://www.yahoo.com/login.html => /login.html
def url_2_path(url)
#puts "Retrieve the relative path component of the url: #{url}" if @verbose
- begin
- url.strip!
- base = url_2_site(url).chop
- path=url.sub(base,'')
- #puts "Path component found: #{path}" if @verbose
- return path
- rescue => ee
- puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
- end
-
+ url.strip!
+ base = url_2_site(url).chop
+ path=url.sub(base,'')
+ #puts "Path component found: #{path}" if @verbose
+ return path
+ rescue => ee
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
end
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
def urls_on_same_domain?(url1, url2)
puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
@@ -198,124 +183,114 @@
end
# Given a host and an open port, probe for an http(s) service and return the first URL that answers with a valid http response code, or nil if none does
def host_2_url (host,port=80)
puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
- begin
- host=host.strip
- if port.to_i == 80
- url_1 = "http://" + host + "/"
- elsif port.to_i ==443
- url_1 = "https://" + host + "/"
- else
- url_1 = "http://" + host + ":" + port.to_s + "/"
- url_2 = "https://" + host + ":" + port.to_s + "/"
- end
- puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
- checker=Wmap::UrlChecker.new
- if checker.response_code(url_1) != 10000
- puts "Found URL: #{url_1}" if @verbose
- return url_1
- elsif checker.response_code(url_2) != 10000
- puts "Found URL: #{url_2}" if @verbose
- return url_2
- else
- puts "No http(s) service found on: #{host}:#{port}" if @verbose
- return nil
- end
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
+ host=host.strip
+ if port.to_i == 80
+ url_1 = "http://" + host + "/"
+ elsif port.to_i ==443
+ url_1 = "https://" + host + "/"
+ else
+ url_1 = "http://" + host + ":" + port.to_s + "/"
+ url_2 = "https://" + host + ":" + port.to_s + "/"
+ end
+ puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
+ checker=Wmap::UrlChecker.new
+ if checker.response_code(url_1) != 10000
+ puts "Found URL: #{url_1}" if @verbose
+ return url_1
+ elsif checker.response_code(url_2) != 10000
+ puts "Found URL: #{url_2}" if @verbose
+ return url_2
+ else
+ puts "No http(s) service found on: #{host}:#{port}" if @verbose
return nil
end
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
end
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
def make_absolute(base, relative_url)
- puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
- begin
- absolute_url = nil;
- if relative_url =~ /^\//
- absolute_url = create_absolute_url_from_base(base, relative_url)
- else
- absolute_url = create_absolute_url_from_context(base, relative_url)
- end
- puts "Found absolute URL: #{absolute_url}" if @verbose
- return absolute_url
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
- end
+ puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
+ absolute_url = nil;
+ if relative_url =~ /^\//
+ absolute_url = create_absolute_url_from_base(base, relative_url)
+ else
+ absolute_url = create_absolute_url_from_context(base, relative_url)
+ end
+ puts "Found absolute URL: #{absolute_url}" if @verbose
+ return absolute_url
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
def create_absolute_url_from_base(potential_base, relative_url)
- begin
- #puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
- naked_base = url_2_site(potential_base).strip.chop
- puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
- return naked_base + relative_url
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
- end
+ #puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
+ naked_base = url_2_site(potential_base).strip.chop
+ puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
+ return naked_base + relative_url
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
# Construct the absolute URL by comparing a known URL and the relative file path
def create_absolute_url_from_context(potential_base, relative_url)
- puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
- begin
- absolute_url = nil
- # make relative URL naked by removing the beginning '/'
- relative_url.sub!(/^\//,'')
- if potential_base =~ /\/$/
- absolute_url = potential_base+relative_url.strip
+ puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
+ absolute_url = nil
+ # make relative URL naked by removing the beginning '/'
+ relative_url.sub!(/^\//,'')
+ if potential_base =~ /\/$/
+ absolute_url = potential_base+relative_url.strip
+ else
+ last_index_of_slash = potential_base.rindex('/')
+ if potential_base[last_index_of_slash-2, 2] == ':/'
+ absolute_url = potential_base+relative_url
else
- last_index_of_slash = potential_base.rindex('/')
- if potential_base[last_index_of_slash-2, 2] == ':/'
- absolute_url = potential_base+relative_url
+ last_index_of_dot = potential_base.rindex('.')
+ if last_index_of_dot < last_index_of_slash
+ absolute_url = potential_base.strip.chop+relative_url
else
- last_index_of_dot = potential_base.rindex('.')
- if last_index_of_dot < last_index_of_slash
- absolute_url = potential_base.strip.chop+relative_url
- else
- absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
- end
+ absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
end
end
- puts "Found absolute URL: #{absolute_url}" if @verbose
- return absolute_url
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
- end
+ end
+ puts "Found absolute URL: #{absolute_url}" if @verbose
+ return absolute_url
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
# Normalize the URL into a consistent form in order to determine if a link has been visited or cached before
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
def normalize_url(url)
- begin
- url.strip!
- # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
- # Normalize the base
- base=url_2_site(url)
- # Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
- base=base.sub(/\.\/$/,'/')
- # Normalize the relative path, case#1
- # retrieve the file path and remove the first '/' or '.',
- # i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
- path=url_2_path(url).sub(/^(\/|\.)*/,'')
- # Normalize the relative path, case#2
- # Replace dot-segments. "/../" and "/./" with "/", i.e. 'http://www.example.com/../a/b/../c/./d.html" => 'http://www.example.com/a/c/d.html'
- path=path.gsub(/\/\.{1,2}\//,'/')
- if path.nil?
- return base
- else
- return base+path
- end
- rescue => ee
- puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
- return url
+ url.strip!
+ # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
+ # Normalize the base
+ base=url_2_site(url)
+ # Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
+ base=base.sub(/\.\/$/,'/')
+ # Normalize the relative path, case#1
+ # retrieve the file path and remove the first '/' or '.',
+ # i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
+ path=url_2_path(url).sub(/^(\/|\.)*/,'')
+ # Normalize the relative path, case#2
+ # Replace dot-segments. "/../" and "/./" with "/", i.e. 'http://www.example.com/../a/b/../c/./d.html" => 'http://www.example.com/a/c/d.html'
+ path=path.gsub(/\/\.{1,2}\//,'/')
+ if path.nil?
+ return base
+ else
+ return base+path
end
+ rescue => ee
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+ return url
end
# Test the URL and return the response code
def response_code (url)