Sha256: 5641c09bdd0b07b00f1627ccc660b725121cb08281245eb10db87d6fa982de17

Contents?: true

Size: 1.94 KB

Versions: 3

Compression:

Stored size: 1.94 KB

Contents

#!/usr/bin/env ruby
#
# Reconciles the USA.gov-maintained list of US domains with domains.txt
# to show domains listed in the USA.gov-maintained list that we reject and why
#
# Usage: script/reconcile-us

require './lib/gman/importer'
require 'open-uri'
require 'yaml'

# Flag this process as a reconciliation run.
# NOTE(review): presumably consulted by the gman library; confirm in lib/gman.
ENV["RECONCILING"] = "true"

# Names matched against the group headings of the downloaded list further down
# in this script; matching groups are dropped before reconciling.
blacklist = ["usagovQUASI"]
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"

# Download the raw list. URI.open (from open-uri) is used because Kernel#open
# no longer fetches URLs on Ruby 3.0+; the block form closes the handle as soon
# as the body has been read.
data = URI.open(source, &:read)

# Keep only the text after the last underscore rule, then break it into
# non-blank rows (tolerating both LF and CRLF line endings).
data = data.split("__________________________________________________________________________").last.strip
data = data.split(/\r?\n/).reject(&:empty?)

# Build a hash of group heading => array of entries from the flat list of rows.
# A row that begins with a word character opens a new group; every other row
# is appended to the most recently opened group.
domains = {}
current_group = ""
data.each do |line|
  if line =~ /^\w/
    current_group = line
    domains[current_group] = []
    next
  end

  # Strip the first ".<TAB>" sequence and any surrounding whitespace. Inside
  # double quotes "\." is just ".", and a String pattern is matched literally.
  domains[current_group] << line.sub("\.\t", "").strip
end

# Discard blacklisted groups, then hand the remaining groups to the importer.
domains.delete_if { |name, _entries| blacklist.include?(name) }
importer = Gman::Importer.new(domains)

importer.logger.info "Starting with #{importer.domains.count} domains"

# Rewrite every group's entries in place: first through Gman's string form,
# then through the importer's own normalization.
importer.domains.list.each do |_name, entries|
  entries.map! { |entry| Gman.new(entry).to_s }
  entries.map! { |entry| importer.normalize_domain(entry) }
end

importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"

# For every group on the USA.gov list that also exists in the importer's
# current list, collect the current domains the USA.gov list does not include.
missing = {}
importer.domains.list.each do |group, usagovdomains|
  current = importer.current.list[group]
  next unless current

  missing[group] = current - usagovdomains
end

# Drop groups for which nothing is missing.
missing.reject! { |_group, absent| absent.empty? }

# Count individual domains across all groups (not the number of groups) so the
# figure matches what the message says.
importer.logger.info "Found #{missing.values.flatten.count} domains not on the USA.gov list"
puts "Here's the list of missing domains:"
puts YAML.dump(missing)

# Bucket every normalized domain by the result of the importer's validity
# check, keyed by whatever valid_domain? returns for it.
rejected = importer.domains.domains.group_by do |domain|
  importer.valid_domain?(domain, :skip_dupe => true)
end

# Remove the true, false and "locality" buckets; only the remaining buckets
# are reported below.
[true, false, "locality"].each { |verdict| rejected.delete(verdict) }

importer.logger.info "Calling out #{rejected.values.flatten.count} rejected domains"

puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
puts YAML.dump(rejected)

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
gman-5.0.9 script/reconcile-us
gman-5.0.8 script/reconcile-us
gman-5.0.7 script/reconcile-us