lib/wmap/utils/domain_root.rb in wmap-2.5.2 vs lib/wmap/utils/domain_root.rb in wmap-2.5.4

- old
+ new

@@ -19,167 +19,193 @@ File_tld=File.dirname(__FILE__)+'/../../../dicts/tlds.txt' # Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk" def get_domain_root (host) puts "Retrieve the root domain for host: #{host}" if @verbose - begin - # Comnplete Top Level Domain List - loading once - @tlds=file_2_hash(File_tld) if @tlds.nil? - # Generic Top Level Domain List - loading once - @gtld=file_2_hash(File_gtld) if @gtld.nil? - # Country code top-level domain list - loading once - @cctld=file_2_hash(File_cctld) if @cctld.nil? - # Country code second level domain - loading once - @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil? - - if host.strip.nil? - puts "Error: empty record found. Please check your input and remove any empty line." if @verbose - return nil - else - host=host.downcase.strip - end - found_tld=false - found_cctld=false - # search the top level domain list first - root_domain="" - dn=host.split(".") - if @tlds.key?(dn.last) - cc_found=false - if @cctld.key?(dn[dn.length-2]) - cc_found=true - end - if cc_found - root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last - else - root_domain=dn[dn.length-2] + "." + dn.last - end - found_tld=true - end - # search the country code top level domain list secondly - if @cctld.key?(dn.last) - found=false - # reverse search of general top level domain - if @gtld.key?(dn[dn.length-2]) - found=true - end - # search country code second level domain list - if @ccsld.key?(dn.last) - @ccsld[dn.last].each do |v| - if ( v =~ /#{dn[dn.length-2]}/i ) - found=true - break - end - end - # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string - #unless found - # if @gtld.key?(dn[dn.length-2]) - # puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}" - # return nil - # end - #end - end - if found - root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last - else - root_domain=dn[dn.length-2] + "." + dn.last - end - found_cctld=true - end - unless (found_tld or found_cctld) - puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose - return nil - else - puts "Domain root found: #{root_domain}" if @verbose - return root_domain - end - rescue => ee - puts "Exception on method #{__method__}: #{ee}" if @verbose + if host.strip.nil? + puts "Error: empty record found. Please check your input and remove any empty line." if @verbose return nil + else + host=host.downcase.strip end + # First order - search country code second level domain list + root_domain = get_domain_root_by_ccsld(host) + if root_domain.nil? + # Second order - search the country code top level domain list + root_domain = get_domain_root_by_cctld(host) + if root_domain.nil? + # Third order - search top level domain list + root_domain = get_domain_root_by_tlds(host) + if root_domain.nil? + # do nothing - no further search + else + return root_domain + end + else + return root_domain + end + else + return root_domain + end + puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose + return nil + #rescue => ee + # puts "Exception on method #{__method__}: #{ee}" if @verbose + # return nil end alias_method :get_root_domain, :get_domain_root alias_method :root_domain, :get_domain_root alias_method :domain_root, :get_domain_root alias_method :host_2_domain, :get_domain_root + # get domain root by lookup Country Code Second Level Domain list + def get_domain_root_by_ccsld(host) + puts "First order search - domain root lookup by Country Code Second Level Domain list ..." if @verbose + root_domain = nil + dn = host.split(".") + # Country code second level domain - loading once + @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil? + # search country code second level domain list + if @ccsld.key?(dn.last) + @ccsld[dn.last].each do |v| + if ( v =~ /#{dn[dn.length-2]}/i ) + return dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last + end + end + end + return root_domain + #rescue => ee + # puts "Exception on method #{__method__}: #{ee}" if @verbose + # return nil + end + + # get domain root by lookup Country Code Top Level Domain list + def get_domain_root_by_cctld(host) + puts "Second order search - domain root lookup by Country Code Top Level Domain list ..." if @verbose + root_domain = nil + dn = host.split(".") + # Country code top-level domain list - loading once + @cctld=file_2_hash(File_cctld) if @cctld.nil? + # Generic Top Level Domain List - loading once + @gtld=file_2_hash(File_gtld) if @gtld.nil? + # Country code second level domain - loading once + @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil? + # search the country code top level domain list + if @cctld.key?(dn.last) + # reverse search of general top level domain + if @gtld.key?(dn[dn.length-2]) + root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last + else + root_domain=dn[dn.length-2] + "." + dn.last + end + end + return root_domain + #rescue => ee + # puts "Exception on method #{__method__}: #{ee}" if @verbose + # return nil + end + + # get domain root by lookup Top Level Domain list + def get_domain_root_by_tlds(host) + puts "Third order search - domain root lookup by Top Level Domain list ..." if @verbose + root_domain = nil + dn = host.split(".") + # Comnplete Top Level Domain List - loading once + @tlds=file_2_hash(File_tld) if @tlds.nil? + # Country code top-level domain list - loading once + @cctld=file_2_hash(File_cctld) if @cctld.nil? + cc_found=false + if @tlds.key?(dn.last) + if @cctld.key?(dn[dn.length-2]) + cc_found=true + end + if cc_found + root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last + else + root_domain=dn[dn.length-2] + "." + dn.last + end + end + return root_domain + #rescue => ee + # puts "Exception on method #{__method__}: #{ee}" if @verbose + # return nil + end + # 'setter' to parse and load the known country code second level domain table from the file # data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]} def load_ccsld_from_file (file_ccsld) - begin - ccsld=Hash.new - puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose - f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit - f.each do |line| - next unless line =~ /^\s+\.\w/ - line=line.chomp.strip.downcase - entry=line.split(' ')[0].split('.') - if entry.length > 2 - key=entry.last - ccsld[key] = Array.new if not ccsld.key?(key) - val=entry[entry.length-2] - #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose - ccsld[key].push(val) unless key.nil? - end + ccsld=Hash.new + puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose + f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit + f.each do |line| + next unless line =~ /^\s+\.\w/ + line=line.chomp.strip.downcase + entry=line.split(' ')[0].split('.') + if entry.length > 2 + key=entry.last + ccsld[key] = Array.new if not ccsld.key?(key) + val=entry[entry.length-2] + #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose + ccsld[key].push(val) unless key.nil? end - f.close - # Sort the blocks once in descendant order once for better performance - return ccsld - rescue => ee - puts "Exception on method #{__method__}: #{ee}" if @verbose end + f.close + # Sort the blocks once in descendant order once for better performance + return ccsld + rescue => ee + puts "Exception on method #{__method__}: #{ee}" if @verbose end # Test a host string to see if it's a valid Internet root domain def is_domain_root? (domain) - puts "Validate the domain name is valid: #{domain}" if @verbose - begin - domain=domain.strip.downcase - return domain == get_domain_root(domain) - rescue => ee - puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose - return false - end + puts "Validate the domain name is valid: #{domain}" if @verbose + domain=domain.strip.downcase + return domain == get_domain_root(domain) + rescue => ee + puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose + return false end alias_method :is_root_domain?, :is_domain_root? alias_method :is_domain?, :is_domain_root? alias_method :is_root?, :is_domain_root? # Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk" def get_sub_domain (host) puts "Retrieve sub-domain from host: #{host}" if @verbose - begin - subdomain=String.new - host=host.strip.downcase - domain=get_domain_root(host) - record_h=host.split(".") - record_d=domain.split(".") - if (record_h.length - record_d.length) >= 2 - subdomain=record_h[record_h.length-record_d.length-1]+"."+domain - puts "Sub domain found: #{subdomain}" if @verbose - return subdomain - else - return nil - end - rescue Exception => ee - puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose + subdomain=String.new + host=host.strip.downcase + domain=get_domain_root(host) + record_h=host.split(".") + record_d=domain.split(".") + if (record_h.length - record_d.length) >= 2 + subdomain=record_h[record_h.length-record_d.length-1]+"."+domain + puts "Sub domain found: #{subdomain}" if @verbose + return subdomain + else return nil end + rescue Exception => ee + puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose + return nil end alias_method :get_subdomain, :get_sub_domain # Function to print instance variable - General top level domain list def print_gtld puts @gtld + return @gtld end # Function to print instance variable - Country code top-level domain list def print_cctld puts @cctld + return @cctld end # Function to print instance variable - Country code second-level domain list def print_ccsld puts @ccsld + return @ccsld end private :load_ccsld_from_file end