Sha256: e9b7c34b311c6e78feeee15b4f4cf19e2da146b5f565d1b910295bc6be41e87b
Contents?: true
Size: 1.46 KB
Versions: 58
Compression:
Stored size: 1.46 KB
Contents
require 'net/http'
require 'open-uri'

namespace :load_maps do
  # Scrapes the Library of Congress MARC geographic area code list and writes
  # it out as a YAML translation map, one `'code': 'label'` entry per line.
  #
  # The output path is taken from ENV["OUTPUT_TO"]; when that is unset it
  # defaults to ../../translation_maps/marc_geographic.yaml relative to this
  # file. Requires the nokogiri gem; exits with status 1 if it is missing.
  desc "Load MARC geo codes by screen-scraping LC"
  task :marc_geographic do
    begin
      # Lazy-loaded on purpose: nokogiri is only needed by this task.
      require 'nokogiri'
    rescue LoadError
      $stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
      $stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
      exit 1
    end

    source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
    filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)

    # Fetch and parse BEFORE opening the output file, so a network failure
    # does not truncate an existing translation map.
    # URI.open (not Kernel#open): open-uri no longer patches Kernel#open as of
    # Ruby 3.0, so a bare `open(url)` would look for a local file.
    html = Nokogiri::HTML(URI.open(source_url, &:read))

    $stderr.puts "Writing to `#{filename}` ..."

    # Block form guarantees the file is flushed and closed, even on error.
    File.open(filename, "w:utf-8") do |file|
      file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
      file.puts "# Scraped from #{source_url} at #{Time.now}"
      file.puts "# Intentionally includes discontinued codes."
      file.puts "\n"

      html.css("tr").each do |row|
        code = row.css("td.code").inner_text.strip
        # Rows without a code cell (headers, spacers) are skipped.
        next if code.empty?

        # A leading hyphen marks a discontinued code; treat it like any other.
        code = code.sub(/\A-/, '')

        label = row.css("td[2]").inner_text.strip
        # The source page sometimes wraps labels across lines; collapse each
        # newline plus following indentation into a single space.
        label = label.gsub(/\n */, ' ')
        # YAML escapes a single quote inside a single-quoted scalar by doubling it.
        label = label.gsub("'", "''")

        file.puts "'#{code}': '#{label}'"
      end
    end

    $stderr.puts "Done."
  end
end
Version data entries
58 entries across 58 versions & 1 rubygems