Sha256: 5641c09bdd0b07b00f1627ccc660b725121cb08281245eb10db87d6fa982de17
Contents?: true
Size: 1.94 KB
Versions: 3
Compression:
Stored size: 1.94 KB
Contents
#!/usr/bin/env ruby
#
# Reconciles the USA.gov-maintained list of US domains with domains.txt
# to show domains listed in the USA.gov-maintained list that we reject and why
#
# Usage: script/reconcile-us

require './lib/gman/importer'
require 'yaml'
# Loaded explicitly: the remote list is fetched over HTTPS below, and plain
# Kernel#open only understands URLs when open-uri is loaded (and not at all
# on Ruby 3.0+).
require 'open-uri'

# NOTE(review): presumably read by the Gman importer/validation code to relax
# checks while reconciling -- confirm against lib/gman.
ENV["RECONCILING"] = "true"

# Groups from the USA.gov list that are dropped before importing.
blacklist = ["usagovQUASI"]
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"

# Fetch the list and keep only what follows the last separator rule.
data = URI.parse(source).read
data = data.split("__________________________________________________________________________")
data = data.last.strip
data = data.split(/\r?\n/).reject(&:empty?)

# Build { group name => [domains] }. A row starting with a word character is
# a group header; every other row is a domain belonging to the current group.
domains = {}
group = ""
data.each do |row|
  if row =~ /^\w/
    group = row
    domains[group] = []
  else
    # Strip the literal ".<TAB>" prefix that precedes each domain entry.
    domains[group].push row.sub(".\t", "").strip
  end
end

domains.reject! { |name, _list| blacklist.include?(name) }
importer = Gman::Importer.new(domains)

importer.logger.info "Starting with #{importer.domains.count} domains"

# Normalize every domain in place, group by group.
importer.domains.list.each do |_name, group_domains|
  group_domains.map! { |domain| Gman.new(domain).to_s }
  group_domains.map! { |domain| importer.normalize_domain(domain) }
end

importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"

# For each group present in both lists, collect the domains we currently
# carry that the USA.gov list does not.
missing = {}
importer.domains.list.each do |name, usagovdomains|
  current = importer.current.list[name]
  next unless current
  missing[name] = current - usagovdomains
end

missing.reject! { |_name, list| list.empty? }

# Count the domains themselves, not the number of groups that contain them.
importer.logger.info "Found #{missing.values.flatten.count} domains not on the USA.gov list"
puts "Here's the list of missing domains:"
puts YAML.dump(missing)

# Group the USA.gov domains by the result of validation, then drop the
# true/false results and the "locality" bucket so only the remaining
# rejection reasons are reported.
rejected = importer.domains.domains
rejected = rejected.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
rejected.delete(true)
rejected.delete(false)
rejected.delete("locality")

importer.logger.info "Calling out #{rejected.values.flatten.count} rejected domains"

puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
puts YAML.dump(rejected)
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
gman-5.0.9 | script/reconcile-us |
gman-5.0.8 | script/reconcile-us |
gman-5.0.7 | script/reconcile-us |