=data_miner
Programmatically import useful data into your ActiveRecord models.
(see http://wiki.github.com/seamusabshere/data_miner for more examples)
==Quick start
You define data_miner blocks in your ActiveRecord models. For example, in app/models/country.rb:
class Country < ActiveRecord::Base
set_primary_key :iso_3166_code
data_miner do
import 'the official ISO country list',
:url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
:skip => 2,
:headers => false,
:delimiter => ';',
:encoding => 'ISO-8859-1' do
key :iso_3166_code, :field_number => 1
store :name, :field_number => 0
end
end
end
Now you can run:
irb(main):001:0> Country.run_data_miner!
=> nil
== Creating tables from scratch (changed in 1.2)
We recommend using the mini_record-compat gem (https://github.com/seamusabshere/mini_record)
This replaces the schema method that was available before. It didn't make sense for data_miner to provide this natively.
class Car < ActiveRecord::Base
# the mini_record way
col :make
col :model
data_miner do
# DEPRECATED - see above
# schema do
# string :make
# string :model
# end
# the mini_record way
process :auto_upgrade!
# [... other data mining steps]
end
end
==Advanced usage
This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportations list of aircraft:
class Aircraft < ActiveRecord::Base
# Tell ActiveRecord that we want to use a string primary key.
# This makes it easier to repeatedly truncate and re-import this
# table without breaking associations.
set_primary_key :icao_code
# Use the mini_record-compat gem to define the database schema in-line.
# It will destructively and automatically add/remove columns.
# This is "OK" because you can always just re-run the import script to get the data back.
# PS. If you're using DataMapper, you don't need this
col :icao_code
col :manufacturer_name
col :name
col :bts_name
col :bts_aircraft_type_code
col :brighter_planet_aircraft_class_code
col :fuel_use_aircraft_name
col :m3, :type => :float
col :m3_units
col :m2, :type => :float
col :m2_units
col :m1, :type => :float
col :m1_units
col :endpoint_fuel, :type => :float
col :endpoint_fuel_units
col :seats, :type => :float
col :distance, :type => :float
col :distance_units
col :load_factor, :type => :float
col :freight_share, :type => :float
col :payload, :type => :float
col :weighting, :type => :float
col :bts_aircraft_type_code, :type => :index
# A dictionary between BTS aircraft type codes and ICAO aircraft
# codes that uses string similarity instead of exact matching.
# This is preferable to typing everything out.
def self.bts_name_dictionary
# Sorry for documenting the LooseTightDictionary gem here, but it's useful
@_bts_dictionary ||= LooseTightDictionary.new(
# The first argument is the source... the possible matches. Most Enumerables will do.
RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
# Tightenings optionally pull out what is important on both sides of a potential match
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
# Identities optionally require a particular capture from both sides of a match to be equal
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
# Blockings restrict comparisons to a subset where everything matches the blocking
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
# This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
# This is useful because we say /boeing/ and only boeings are matched against other boeings.
:blocking_only => true,
# Tell the dictionary how read things from the source.
:right_reader => lambda { |record| record['Description'] }
)
end
# A dictionary between what appear to be ICAO aircraft names and
# objects of this class itself.
# Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
def self.icao_name_dictionary
@_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
:tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
:identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
:blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
:right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
end
# This responds to the "Matcher" interface as defined by DataMiner.
# In other words, it takes Matcher#match(*args) and returns something.
class BtsMatcher
attr_reader :wants
def initialize(wants)
@wants = wants
end
def match(raw_faa_icao_record)
@_match ||= Hash.new
return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
retval = case wants
when :bts_aircraft_type_code
bts_record['Code']
when :bts_name
bts_record['Description']
end if bts_record
@_match[raw_faa_icao_record] = retval
end
end
# Another class that implements the "Matcher" interface as expected by DataMiner.
class FuelUseMatcher
def match(raw_fuel_use_record)
@_match ||= Hash.new
return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
# First try assuming we have an ICAO code
aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
end
# No luck? then try a fuzzy match
aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
end
if aircraft_record
@_match[raw_fuel_use_record] = aircraft_record.icao_code
else
# While we're developing the dictionary, we want it to blow up until we have 100% matchability
raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
end
end
end
# This responds to the "Responder" interface as expected by Errata.
# Basically it lets you say "Is a DC plane" in the errata file and
# have it map to a Ruby method.
class Guru
def is_a_dc_plane?(row)
row['Designator'] =~ /^DC\d/i
end
def is_a_g159?(row)
row['Designator'] =~ /^G159$/
end
def is_a_galx?(row)
row['Designator'] =~ /^GALX$/
end
def method_missing(method_id, *args, &block)
if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
manufacturer_name = $1
manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
else
super
end
end
end
data_miner do
# In our app, we defined DataMiner::Run.allowed? to return false if a run
# has taken place in the last hour (among other things).
# By raising DataMiner::Skip, we skip this run but call it a success.
process "Don't re-import too often" do
raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
end
# The FAA publishes a document to help people identify aircraft by different names.
('A'..'Z').each do |letter|
import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
# The master URL of the source file (one for every letter)
:url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
# The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
:errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
# If it's not UTF-8, you should say what it is so that we can iconv it!
:encoding => 'windows-1252',
# Nokogiri is being used to grab each row starting from the second
:row_xpath => '//table/tr[2]/td/table/tr',
# ditto... XPath for Nokogiri
:column_xpath => 'td' ) do
# The code that they use is in fact the ICAO code!
key 'icao_code', :field_name => 'Designator'
# We get this for free
store 'manufacturer_name', :field_name => 'Manufacturer'
# ditto
store 'name', :field_name => 'Model'
# Use the loose-tight dictionary.
# It gets the entire input row to play with before deciding on an output.
store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
end
end
# Pull in some data that might only be important to Brighter Planet
import "Brighter Planet's aircraft class codes",
:url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
store 'brighter_planet_aircraft_class_code'
end
# Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
# This data comes from the EEA.
import "pre-calculated fuel use equation coefficients",
:url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
:select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
# We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new
# Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
store 'm3'
store 'm2'
store 'm1'
store 'endpoint_fuel', :field_name => 'b'
end
# Use arel and the weighted_average gem to do some crazy averaging.
# This assumes that you're dealing with the BTS T-100 flight segment data.
# See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
process "Derive some average flight characteristics from flight segments" do
FlightSegment.run_data_miner!
aircraft = Aircraft.arel_table
segments = FlightSegment.arel_table
conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})"
update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"
update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
end
# And finally re-run the import of resources that depend on this resource.
# Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
# that's the right way to do dependencies. It won't get called twice in the same run.
[ AircraftManufacturer ].each do |synthetic_resource|
process "Synthesize #{synthetic_resource}" do
synthetic_resource.run_data_miner!
end
end
end
end
==Authors
* Seamus Abshere
* Andy Rossmeissl
==Copyright
Copyright (c) 2010 Brighter Planet. See LICENSE for details.