=data_miner Programmatically import useful data into your ActiveRecord models. (see http://wiki.github.com/seamusabshere/data_miner for more examples) ==Quick start You define data_miner blocks in your ActiveRecord models. For example, in app/models/country.rb: class Country < ActiveRecord::Base set_primary_key :iso_3166_code data_miner do import 'the official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';', :encoding => 'ISO-8859-1' do key :iso_3166_code, :field_number => 1 store :name, :field_number => 0 end end end Now you can run: irb(main):001:0> Country.run_data_miner! => nil == Creating tables from scratch (changed in 1.2) We recommend using the mini_record-compat gem (https://github.com/seamusabshere/mini_record). This replaces the schema method that was available before. It didn't make sense for data_miner to provide this natively. class Car < ActiveRecord::Base # the mini_record way col :make col :model data_miner do # DEPRECATED - see above # schema do # string :make # string :model # end # the mini_record way process :auto_upgrade! # [... other data mining steps] end end ==Advanced usage This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportation's list of aircraft: class Aircraft < ActiveRecord::Base # Tell ActiveRecord that we want to use a string primary key. # This makes it easier to repeatedly truncate and re-import this # table without breaking associations. set_primary_key :icao_code # Use the mini_record-compat gem to define the database schema in-line. # It will destructively and automatically add/remove columns. # This is "OK" because you can always just re-run the import script to get the data back. # PS. 
If you're using DataMapper, you don't need this col :icao_code col :manufacturer_name col :name col :bts_name col :bts_aircraft_type_code col :brighter_planet_aircraft_class_code col :fuel_use_aircraft_name col :m3, :type => :float col :m3_units col :m2, :type => :float col :m2_units col :m1, :type => :float col :m1_units col :endpoint_fuel, :type => :float col :endpoint_fuel_units col :seats, :type => :float col :distance, :type => :float col :distance_units col :load_factor, :type => :float col :freight_share, :type => :float col :payload, :type => :float col :weighting, :type => :float col :bts_aircraft_type_code, :type => :index # A dictionary between BTS aircraft type codes and ICAO aircraft # codes that uses string similarity instead of exact matching. # This is preferable to typing everything out. def self.bts_name_dictionary # Sorry for documenting the LooseTightDictionary gem here, but it's useful @_bts_dictionary ||= LooseTightDictionary.new( # The first argument is the source... the possible matches. Most Enumerables will do. 
RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }), # Tightenings optionally pull out what is important on both sides of a potential match :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false), # Identities optionally require a particular capture from both sides of a match to be equal :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false), # Blockings restrict comparisons to a subset where everything matches the blocking :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false), # This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking. # This is useful because we say /boeing/ and only boeings are matched against other boeings. :blocking_only => true, # Tell the dictionary how to read things from the source. :right_reader => lambda { |record| record['Description'] } ) end # A dictionary between what appear to be ICAO aircraft names and # objects of this class itself. # Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step. 
def self.icao_name_dictionary @_icao_dictionary ||= LooseTightDictionary.new Aircraft.all, :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false), :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false), :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false), :right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s } end # This responds to the "Matcher" interface as defined by DataMiner. # In other words, it takes Matcher#match(*args) and returns something. class BtsMatcher attr_reader :wants def initialize(wants) @wants = wants end def match(raw_faa_icao_record) @_match ||= Hash.new return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record) faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ] bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record retval = case wants when :bts_aircraft_type_code bts_record['Code'] when :bts_name bts_record['Description'] end if bts_record @_match[raw_faa_icao_record] = retval end end # Another class that implements the "Matcher" interface as expected by DataMiner. class FuelUseMatcher def match(raw_fuel_use_record) @_match ||= Hash.new return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record) # First try assuming we have an ICAO code aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/ Aircraft.find_by_icao_code raw_fuel_use_record['ICAO'] end # No luck? then try a fuzzy match aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present? 
Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ] end if aircraft_record @_match[raw_fuel_use_record] = aircraft_record.icao_code else # While we're developing the dictionary, we want it to blow up until we have 100% matchability raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet" end end end # This responds to the "Responder" interface as expected by Errata. # Basically it lets you say "Is a DC plane" in the errata file and # have it map to a Ruby method. class Guru def is_a_dc_plane?(row) row['Designator'] =~ /^DC\d/i end def is_a_g159?(row) row['Designator'] =~ /^G159$/ end def is_a_galx?(row) row['Designator'] =~ /^GALX$/ end def method_missing(method_id, *args, &block) if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/ manufacturer_name = $1 manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE) matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil? else super end end end data_miner do # In our app, we defined DataMiner::Run.allowed? to return false if a run # has taken place in the last hour (among other things). # By raising DataMiner::Skip, we skip this run but call it a success. process "Don't re-import too often" do raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft end # The FAA publishes a document to help people identify aircraft by different names. ('A'..'Z').each do |letter| import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA", # The master URL of the source file (one for every letter) :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm", # The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details. 
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new), # If it's not UTF-8, you should say what it is so that we can iconv it! :encoding => 'windows-1252', # Nokogiri is being used to grab each row starting from the second :row_xpath => '//table/tr[2]/td/table/tr', # ditto... XPath for Nokogiri :column_xpath => 'td' ) do # The code that they use is in fact the ICAO code! key 'icao_code', :field_name => 'Designator' # We get this for free store 'manufacturer_name', :field_name => 'Manufacturer' # ditto store 'name', :field_name => 'Model' # Use the loose-tight dictionary. # It gets the entire input row to play with before deciding on an output. store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code) store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name) end end # Pull in some data that might only be important to Brighter Planet import "Brighter Planet's aircraft class codes", :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type' store 'brighter_planet_aircraft_class_code' end # Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel). # This data comes from the EEA. import "pre-calculated fuel use equation coefficients", :url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods', :select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do # We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above. key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new # Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft. 
store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name' store 'm3' store 'm2' store 'm1' store 'endpoint_fuel', :field_name => 'b' end # Use arel and the weighted_average gem to do some crazy averaging. # This assumes that you're dealing with the BTS T-100 flight segment data. # See http://data.brighterplanet.com/flight_segments for a pre-sanitized version. process "Derive some average flight characteristics from flight segments" do FlightSegment.run_data_miner! aircraft = Aircraft.arel_table segments = FlightSegment.arel_table conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code]) update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})" update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})" update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})" update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})" update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})" update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})" end # And finally re-run the import of resources that depend on this resource. # Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block; # that's the right way to do dependencies. It won't get called twice in the same run. [ AircraftManufacturer ].each do |synthetic_resource| process "Synthesize #{synthetic_resource}" do synthetic_resource.run_data_miner! 
end end end end ==Authors * Seamus Abshere * Andy Rossmeissl ==Copyright Copyright (c) 2010 Brighter Planet. See LICENSE for details.