#! /usr/bin/env ruby
#
# Vendors the USA.gov-maintained list of US domains into domains.txt
# Source: https://github.com/GSA-OCSIT/govt-urls
#
# Normalizes and cleans inputs, validates domains, rejects academic domains, then
# merges the result into the existing lib/domains.txt list, sorted and de-duplicated
#
# Usage: script/vendor-us
#
# Will automatically fetch latest version of the list and merge
# You can check for changes and commit via `git status`
#
# It's also probably a good idea to run `script/ci-build` for good measure

require 'rubygems'
require 'public_suffix'
require 'swot'
require 'yaml'
require 'open-uri'
require './lib/gman'
require './lib/gman/parser'

# Raw YAML export of the government URL list; the rest of the script treats
# the parsed document as a hash of group name => array of domains
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"

# Groups from the source list that are dropped entirely before merging
BLACKLIST = %w[usagovQUASI usagovFED usagovPW].freeze

# Fetch through the URI object rather than Kernel#open: open-uri's patch to
# Kernel#open was deprecated in Ruby 2.7 and removed in 3.0, whereas
# OpenURI::OpenRead#read works on old and new Rubies alike.
#
# NOTE(review): YAML.load on remotely fetched content can instantiate arbitrary
# objects on older Psych versions; consider YAML.safe_load once the source
# document is confirmed to contain only plain strings, arrays and hashes.
domain_hash = YAML.load(URI.parse(SOURCE).read)

# Seed the sum with 0 so an empty document reports "0" instead of a blank
found = domain_hash.map { |_group, domains| domains.count }.inject(0, :+)
puts "found #{found} domains..."

# Normalize ALL THE THINGS
# Every group's list is rewritten in place, one entry at a time
domain_hash.each do |_group, domains|
  domains.map! do |domain|
    cleaned = domain.strip             # Drop surrounding whitespace
    cleaned = cleaned.gsub(/\/$/, "")  # Strip trailing slashes
    cleaned.downcase                   # make lower case
  end
  domains.delete_if(&:empty?)          # Reject empty strings
end

# filter
# Blacklisted groups are removed wholesale before individual domains are checked
domain_hash.reject! { |group, _domains| BLACKLIST.include?(group) } # Group blacklist
domain_hash.each do |group, domains|
  puts "Filtering #{group}..."
  # The checks run in this order so the validators only ever see entries
  # that survived the previous pass
  domains.reject! { |domain| domain.include?("/") }        # Reject URLs
  domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain
  domains.reject! { |domain| Swot::is_academic?(domain) }  # Reject academic domains
end

# Seed the sum with 0 so the count still prints when every group was dropped
remaining = domain_hash.map { |_group, domains| domains.count }.inject(0, :+)
puts "Filtered down to #{remaining} domains"

# Grab existing list
existing = Gman::Parser.file_to_array(Gman.list_path)
current_hash = Gman::Parser.array_to_hash(existing)
puts "Current list contains #{existing.size} domains... merging"

# Lazy deep merge
# Each fetched group is folded into the matching group of the existing list;
# groups not present yet start out as an empty array
domain_hash.each do |group, domains|
  bucket = (current_hash[group] ||= [])
  bucket.concat(domains)
  bucket.sort! # Alphabetize
  bucket.uniq! # Ensure uniqueness
end

# Sort by group
# sort_by turns the hash into an array of [group, domains] pairs, ordered
# case-insensitively by group name
current_hash = current_hash.sort_by { |group, _domains| group.downcase }

# PublicSuffix Formatted Output
# Each group is written as a "// name" header line followed by one domain per
# line, with a blank line between consecutive groups.
output = ""
current_hash.each do |group, domains|
  # Decide on the separator from what has been written so far rather than from
  # the previous group's name: an empty group name (which sorts first) would
  # otherwise suppress the separator and glue the next header onto the
  # preceding domain line.
  output << "\n\n" unless output.empty?
  output << "// #{group}\n" unless group.empty? # no header for unnamed group
  output << domains.join("\n")
end

puts "merged. Writing..."

# Replace the contents of the list file with the freshly formatted output
File.write(Gman.list_path, output)

# Parse the file again so the reported count comes from what is now on disk
written = Gman::Parser.file_to_array(Gman.list_path)
puts "New list contains #{written.size} domains. Fin."