Sha256: e9b7c34b311c6e78feeee15b4f4cf19e2da146b5f565d1b910295bc6be41e87b

Contents?: true

Size: 1.46 KB

Versions: 58

Compression:

Stored size: 1.46 KB

Contents

require 'net/http'
require 'open-uri'




namespace :load_maps do

  # Rebuilds the MARC geographic-area-code translation map by scraping the
  # Library of Congress code list page and writing one `'code': 'label'` YAML
  # line per table row that carries a code.
  #
  # Output goes to the path in ENV["OUTPUT_TO"] if set, otherwise to
  # ../../translation_maps/marc_geographic.yaml relative to this file.
  # Requires the nokogiri gem; exits with status 1 if it cannot be loaded.
  desc "Load MARC geo codes by screen-scraping LC"
  task :marc_geographic do
    begin
      require 'nokogiri'
    rescue LoadError
      $stderr.puts "\n  load_maps:marc_geographic task requires nokogiri"
      $stderr.puts "  Try `gem install nokogiri` and try again. Exiting...\n\n"
      exit 1
    end

    source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"

    filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)

    $stderr.puts "Writing to `#{filename}` ..."

    # Fetch and parse BEFORE opening the output file: opening with "w"
    # truncates it, so a failed download must not wipe an existing map.
    #
    # Use OpenURI::OpenRead#read on the parsed URI rather than Kernel#open:
    # open-uri stopped patching Kernel#open for URLs in Ruby 3.0, where
    # `open("http://...")` would try to open a local file instead.
    html = Nokogiri::HTML(URI.parse(source_url).read)

    # Block form guarantees the file is flushed and closed, even on error.
    File.open(filename, "w:utf-8") do |file|
      file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
      file.puts "# Scraped from #{source_url} at #{Time.now}"
      file.puts "# Intentionally includes discontinued codes."

      file.puts "\n"
      html.css("tr").each do |line|
        code = line.css("td.code").inner_text.strip
        # Rows without a td.code cell (e.g. header rows) yield an empty string.
        next if code.empty?

        code = code.gsub(/^\-/, '') # treat discontinued code like any other

        label = line.css("td[2]").inner_text.strip
        label = label.gsub(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
        label = label.gsub("'", "''")   # yaml escapes single-quotes by doubling them, weird but true.

        file.puts "'#{code}': '#{label}'"
      end
    end
    $stderr.puts "Done."
  end
end

Version data entries

58 entries across 58 versions & 1 rubygems

Version Path
traject-3.8.1 lib/tasks/load_maps.rake
traject-3.8.0 lib/tasks/load_maps.rake
traject-3.7.0 lib/tasks/load_maps.rake
traject-3.6.0 lib/tasks/load_maps.rake
traject-3.5.0 lib/tasks/load_maps.rake
traject-3.4.0 lib/tasks/load_maps.rake
traject-3.3.0 lib/tasks/load_maps.rake
traject-3.2.0 lib/tasks/load_maps.rake
traject-3.1.0 lib/tasks/load_maps.rake
traject-3.1.0.rc1 lib/tasks/load_maps.rake
traject-3.0.0 lib/tasks/load_maps.rake
traject-3.0.0.alpha.2 lib/tasks/load_maps.rake
traject-3.0.0.alpha.1 lib/tasks/load_maps.rake
traject-2.3.4-java lib/tasks/load_maps.rake
traject-2.3.4 lib/tasks/load_maps.rake
traject-2.3.3 lib/tasks/load_maps.rake
traject-2.3.3-java lib/tasks/load_maps.rake
traject-2.3.2-java lib/tasks/load_maps.rake
traject-2.3.2 lib/tasks/load_maps.rake
traject-2.3.1-java lib/tasks/load_maps.rake