Sha256: 80793a48cb2989d0917986150fbfdbd6d073252db760b3526d336e89765762e0
Contents?: true
Size: 1.54 KB
Versions: 5
Compression:
Stored size: 1.54 KB
Contents
#! /usr/bin/env ruby
#
# Vendors Swedish government domains into Gman's domain list.
#
# Steps:
#   1. Submit the export form on the SCB registry page and download the result.
#   2. Parse the response as tab-separated data with a header row.
#   3. Drop rows that Swot recognises or whose name contains "UNIVERSITET",
#      then normalise each remaining web address to a bare domain.
#   4. Store the result under the "Swedish Administrative Authorities" group
#      and rewrite the list file at Gman.list_path.

require 'mechanize'
require 'csv'
require 'swot'
require './lib/gman'
require './lib/gman/parser'

url = "http://www.myndighetsregistret.scb.se/Myndighet.aspx"
agent = Mechanize.new
page = agent.get(url)
form = page.form("form1")

# Pick the radio button whose value is "Textfil" (presumably the text-file
# export option, since the response is parsed as tab-separated text below).
# The comparison must be `==`: an assignment here is always truthy, so `find`
# would return the first radio button and overwrite its value.
text_option = form.radiobuttons.find { |r| r.value == "Textfil" }
raise "No \"Textfil\" radio button found on #{url}" unless text_option
text_option.check

submit_button = form.buttons.find { |b| b.type == "submit" }
response = agent.submit(form, submit_button)

domains = []
rows = CSV.parse(response.content, :headers => true, :col_sep => "\t")
puts "Starting with #{rows.count} domains..."

rows.each do |row|
  next if Swot.valid?(row["Webbadress"]) # Filter out Swot'd domains
  next if row["Namn"] =~ /UNIVERSITET/   # Filter out domains that are clearly edu
  domains.push Gman.new(row["Webbadress"]).domain.to_s.gsub(/^www\./, "")
end

# Clean up: drop blanks and duplicates, then keep only names that
# PublicSuffix accepts.
domains.reject! { |domain| domain.empty? }
domains.compact!
domains.uniq!
domains.select! { |domain| PublicSuffix.valid?(".#{domain}") }
puts "Ended up with #{domains.count} domains."

# Merge into the existing list, replacing any previous Swedish group.
current = Gman::Parser.file_to_array(Gman.list_path)
current_hash = Gman::Parser.array_to_hash(current)
current_hash["Swedish Administrative Authorities"] = domains

# sort_by yields [group, domains] pairs ordered case-insensitively by group.
# The block parameter is named so it does not shadow the outer `domains`.
current_hash = current_hash.sort_by { |group, _group_domains| group.downcase }

# PublicSuffix Formatted Output
current_group = ""
output = ""
current_hash.each do |group, group_domains|
  if group != current_group
    output << "\n\n" unless current_group.empty? # first entry
    output << "// #{group}\n"
    current_group = group
  end
  output << group_domains.join("\n")
end

File.open(Gman.list_path, "w") { |file| file.write output }
Version data entries
5 entries across 5 versions & 1 rubygems
Version | Path |
---|---|
gman-4.6.5 | script/vendor-se |
gman-4.6.4 | script/vendor-se |
gman-4.6.3 | script/vendor-se |
gman-4.6.2 | script/vendor-se |
gman-4.6.1 | script/vendor-se |