Sha256: 3d2b814e5bf58ac44bb3f9508a8e486cbbc94bd54d12bf50959882765569052a

Contents?: true

Size: 1.84 KB

Versions: 5

Compression:

Stored size: 1.84 KB

Contents

#!/usr/bin/env ruby
#
# Reconciles the USA.gov-maintained list of US domains with domains.txt
# to show domains listed in the USA.gov-maintained list that we reject and why
#
# Usage: script/reconcile-us

require 'open-uri'
require 'yaml'
require './lib/gman/importer'

# Mark this process as a reconciliation run.
# NOTE(review): presumably consulted by the Gman importer -- confirm.
ENV['RECONCILING'] = 'true'
# Group headings from the USA.gov list that are dropped before importing.
blacklist = ['usagovQUASI']
source = 'https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt'

# Kernel#open no longer opens URLs on Ruby 3.0+, so fetch through
# URI.open (open-uri). The block form closes the handle once it is read.
data = URI.open(source, &:read)
# Keep only what follows the last 74-underscore divider, then break it
# into non-empty rows (tolerating both LF and CRLF line endings).
data = data.split('_' * 74)
data = data.last.strip
data = data.split(/\r?\n/).reject(&:empty?)

# Fold the flat row list into { heading => [entries] }.
# A row that starts with a word character opens a new group; every other
# row is an entry belonging to the most recently opened group.
domains = {}
heading = ''
data.each do |line|
  unless line =~ /^\w/
    # In a double-quoted string "\." is just ".", so this removes the
    # first literal dot-plus-tab sequence before trimming whitespace.
    domains[heading].push line.sub("\.\t", '').strip
    next
  end

  heading = line
  domains[heading] = []
end

# Drop blacklisted groups, then hand the remaining groups to the importer.
domains.delete_if { |heading, _entries| blacklist.include?(heading) }
importer = Gman::Importer.new(domains)

importer.logger.info "Starting with #{importer.domains.count} domains"

# Two in-place passes per group: first round-trip every entry through
# Gman#to_s, then apply the importer's own normalization to the result.
importer.domains.list.each do |_heading, entries|
  entries.map! { |entry| Gman.new(entry).to_s }
  entries.map! { |entry| importer.normalize_domain(entry) }
end

normalized_total = importer.domains.domains.count
importer.logger.info "Filtered down to #{normalized_total} normalized domains"

# For every group that exists in both lists, collect the domains that are
# in the importer's current list but absent from the USA.gov list.
missing = {}
importer.domains.list.each do |g, usagovdomains|
  current = importer.current.list[g]
  next unless current

  missing[g] = current - usagovdomains
end

# Groups with nothing missing are noise in the report.
missing.reject! { |_key, value| value.empty? }

# Count the domains themselves; `missing.values.count` would only report
# how many groups have at least one missing domain.
count = missing.values.flatten.count
importer.logger.info "Found #{count} domains not on the USA.gov list"
puts "Here's the list of missing domains:"
puts YAML.dump(missing)

# Bucket every imported domain by whatever valid_domain? returns for it
# (duplicate checking is skipped via skip_dupe: true).
rejected = importer.domains.domains.group_by do |candidate|
  importer.valid_domain?(candidate, skip_dupe: true)
end

# Discard the buckets that are not reported: true, false and 'locality'.
[true, false, 'locality'].each { |verdict| rejected.delete(verdict) }

rejected_total = rejected.values.flatten.count
importer.logger.info "Calling out #{rejected_total} rejected domains"

puts 'Here are the rejected domains and why they were rejected:'
puts YAML.dump(rejected)

Version data entries

5 entries across 5 versions & 1 rubygem

Version Path
gman-7.0.2 script/reconcile-us
gman-7.0.1 script/reconcile-us
gman-7.0.0 script/reconcile-us
gman-6.0.1 script/reconcile-us
gman-6.0.0 script/reconcile-us