#! /usr/bin/env ruby
#
# Vendors the USA.gov-maintained list of US domains into domains.txt
# Source: https://github.com/GSA/govt-urls
#
# Normalizes and cleans inputs, validates domains, rejects academic domains, and
# sorts, ensures uniqueness, and merges into the existing lib/domains.txt list
#
# Usage: script/vendor-us
#
# Will automatically fetch latest version of the list and merge
# You can check for changes and commit via `git status`
#
# It's also probably a good idea to run `script/ci-build` for good measure

require 'rubygems'
require 'public_suffix'
require 'swot'
require 'yaml'
require 'open-uri'
require './lib/gman'
require './lib/gman/parser'

SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml".freeze

# Groups from the upstream list that are never merged into the local list
BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"].freeze

# Total number of domains across every group of a { group => [domains] } hash.
# Seeded with 0 so that an empty hash reports 0 rather than nil.
def domain_count(groups)
  groups.values.map(&:size).inject(0, :+)
end

# Fetch via URI#read (provided by open-uri) instead of Kernel#open: passing a
# URL string to Kernel#open is deprecated since Ruby 2.7 and no longer performs
# HTTP requests on Ruby 3.0+.
#
# NOTE(review): the payload is remote data, so it is parsed with YAML.safe_load,
# which refuses to deserialize arbitrary objects. This assumes the upstream file
# is a plain mapping of group name => array of strings -- confirm if the
# upstream format ever changes.
domain_hash = YAML.safe_load(URI.parse(SOURCE).read)

# A group key with no entries parses as nil; treat it as an empty list
domain_hash.keys.each { |group| domain_hash[group] ||= [] }

puts "found #{domain_count(domain_hash)} domains..."

# Normalize ALL THE THINGS
domain_hash.each_value do |domains|
  # Strip surrounding whitespace, drop one trailing slash, make lower case
  domains.map! { |domain| domain.strip.chomp('/').downcase }
  domains.reject!(&:empty?) # Reject empty strings
end

# Filter
domain_hash.reject! { |group, _domains| BLACKLIST.include?(group) } # Group blacklist

domain_hash.each do |group, domains|
  puts "Filtering #{group}..."
  domains.reject! { |domain| domain.include?('/') }        # Reject URLs (anything with a path)
  domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain
  domains.reject! { |domain| Swot::is_academic?(domain) }  # Reject academic domains
end

puts "Filtered down to #{domain_count(domain_hash)} domains"

# Grab existing list
current = Gman::Parser.file_to_array(Gman.list_path)
current_hash = Gman::Parser.array_to_hash(current)

puts "Current list contains #{current.size} domains... \nmerging"

# Lazy deep merge: union the fetched domains into each existing group
# (Array#| drops duplicates), then alphabetize
domain_hash.each do |group, domains|
  current_hash[group] = (Array(current_hash[group]) | domains).sort
end

# Sort by group (case-insensitive); yields an array of [group, domains] pairs
sorted_groups = current_hash.sort_by { |group, _domains| group.downcase }

# PublicSuffix formatted output: a "// group" header line followed by one
# domain per line, with a blank line between groups
output = sorted_groups.map do |group, domains|
  "// #{group}\n#{domains.join("\n")}"
end.join("\n\n")

puts "merged. Writing..."

File.write(Gman.list_path, output)

# Re-read the file that was just written to report the final size
result = Gman::Parser.file_to_array(Gman.list_path)
puts "New list contains #{result.size} domains. Fin."