README.rdoc in data_miner-0.4.42 vs README.rdoc in data_miner-0.4.43

- old
+ new

@@ -1,91 +1,266 @@ =data_miner -Mine remote data into your ActiveRecord models. +Programmatically import useful data into your ActiveRecord models. +(see http://wiki.github.com/seamusabshere/data_miner for more examples) + ==Quick start -Put this in <tt>config/environment.rb</tt>: +You define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>: - config.gem 'data_miner' - -You need to define <tt>data_miner</tt> blocks in your ActiveRecord models. For example, in <tt>app/models/country.rb</tt>: - class Country < ActiveRecord::Base - set_primary_key :iso_3166 + set_primary_key :iso_3166_code data_miner do - import 'The official ISO country list', :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';' do - key 'iso_3166' - store 'iso_3166', :field_number => 1 - store 'name', :field_number => 0 + schema do + string 'iso_3166_code' + string 'name' end - import 'A Princeton dataset with better capitalization for some countries', :url => 'http://www.cs.princeton.edu/introcs/data/iso3166.csv' do - key 'iso_3166' - store 'iso_3166', :field_name => 'country code' - store 'name', :field_name => 'country' + import 'the official ISO country list', + :url => 'http://www.iso.org/iso/list-en1-semic-3.txt', + :skip => 2, + :headers => false, + :delimiter => ';', + :encoding => 'ISO-8859-1' do + key 'iso_3166_code', :field_number => 1 + store 'name', :field_number => 0 end end end -...and in <tt>app/models/airport.rb</tt>: +Now you can run: - class Airport < ActiveRecord::Base - set_primary_key :iata_code + irb(main):001:0> Country.run_data_miner! + => nil + +==Advanced usage + +This is how we linked together (http://data.brighterplanet.com/aircraft) the FAA's list of aircraft with the US Department of Transportations list of aircraft: + + class Aircraft < ActiveRecord::Base + # Tell ActiveRecord that we want to use a string primary key. + # This makes it easier to repeatedly truncate and re-import this + # table without breaking associations. + set_primary_key :icao_code - data_miner do - import :url => 'http://openflights.svn.sourceforge.net/viewvc/openflights/openflights/data/airports.dat', :headers => false, :select => lambda { |row| row[4].present? } do - key 'iata_code' - store 'name', :field_number => 1 - store 'city', :field_number => 2 - store 'country_name', :field_number => 3 - store 'iata_code', :field_number => 4 - store 'latitude', :field_number => 6 - store 'longitude', :field_number => 7 + # A dictionary between BTS aircraft type codes and ICAO aircraft + # codes that uses string similarity instead of exact matching. + # This is preferable to typing everything out. + def self.bts_name_dictionary + # Sorry for documenting the LooseTightDictionary gem here, but it's useful + @_bts_dictionary ||= LooseTightDictionary.new( + # The first argument is the source... the possible matches. Most Enumerables will do. + RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }), + # Tightenings optionally pull out what is important on both sides of a potential match + :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false), + # Identities optionally require a particular capture from both sides of a match to be equal + :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false), + # Blockings restrict comparisons to a subset where everything matches the blocking + :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false), + # This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking. + # This is useful because we say /boeing/ and only boeings are matched against other boeings. + :blocking_only => true, + # Tell the dictionary how read things from the source. + :right_reader => lambda { |record| record['Description'] } + ) + end + + # A dictionary between what appear to be ICAO aircraft names and + # objects of this class itself. + # Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step. + def self.icao_name_dictionary + @_icao_dictionary ||= LooseTightDictionary.new Aircraft.all, + :tightenings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false), + :identities => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false), + :blockings => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false), + :right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s } + end + + # This responds to the "Matcher" interface as defined by DataMiner. + # In other words, it takes Matcher#match(*args) and returns something. + class BtsMatcher + attr_reader :wants + def initialize(wants) + @wants = wants end + def match(raw_faa_icao_record) + @_match ||= Hash.new + return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record) + faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ] + bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record + retval = case wants + when :bts_aircraft_type_code + bts_record['Code'] + when :bts_name + bts_record['Description'] + end if bts_record + @_match[raw_faa_icao_record] = retval + end end - end -Put this in <tt>lib/tasks/data_miner_tasks.rake</tt>: (unfortunately I don't know a way to automatically include gem tasks, so you have to do this manually for now) - - namespace :data_miner do - task :run => :environment do - resource_names = %w{R RESOURCES RESOURCE RESOURCE_NAMES}.map { |possible_key| ENV[possible_key].to_s }.join.split(/\s*,\s*/).flatten.compact - DataMiner.run :resource_names => resource_names + # Another class that implements the "Matcher" interface as expected by DataMiner. + class FuelUseMatcher + def match(raw_fuel_use_record) + @_match ||= Hash.new + return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record) + # First try assuming we have an ICAO code + aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/ + Aircraft.find_by_icao_code raw_fuel_use_record['ICAO'] + end + # No luck? then try a fuzzy match + aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present? + Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ] + end + if aircraft_record + @_match[raw_fuel_use_record] = aircraft_record.icao_code + else + # While we're developing the dictionary, we want it to blow up until we have 100% matchability + raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet" + end + end end - end + + # This responds to the "Responder" interface as expected by Errata. + # Basically it lets you say "Is a DC plane" in the errata file and + # have it map to a Ruby method. + class Guru + def is_a_dc_plane?(row) + row['Designator'] =~ /^DC\d/i + end + def is_a_g159?(row) + row['Designator'] =~ /^G159$/ + end + def is_a_galx?(row) + row['Designator'] =~ /^GALX$/ + end + def method_missing(method_id, *args, &block) + if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/ + manufacturer_name = $1 + manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE) + matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i + method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil? + else + super + end + end + end + + data_miner do + # In our app, we defined DataMiner::Run.allowed? to return false if a run + # has taken place in the last hour (among other things). + # By raising DataMiner::Skip, we skip this run but call it a success. + process "Don't re-import too often" do + raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft + end + + # Define the database schema in-line. + # It will destructively and automatically add/remove columns. + # This is "OK" because you can always just re-run the import script to get the data back. + # PS. if we were using DataMapper, we wouldn't need this. + schema :options => 'ENGINE=InnoDB default charset=utf8' do + string 'icao_code' + string 'manufacturer_name' + string 'name' + string 'bts_name' + string 'bts_aircraft_type_code' + string 'brighter_planet_aircraft_class_code' + string 'fuel_use_aircraft_name' + float 'm3' + string 'm3_units' + float 'm2' + string 'm2_units' + float 'm1' + string 'm1_units' + float 'endpoint_fuel' + string 'endpoint_fuel_units' + float 'seats' + float 'distance' + string 'distance_units' + float 'load_factor' + float 'freight_share' + float 'payload' + float 'weighting' + index 'bts_aircraft_type_code' + end -Once you have (1) set up the order of data mining and (2) defined <tt>data_miner</tt> blocks in your classes, you can: + # The FAA publishes a document to help people identify aircraft by different names. + ('A'..'Z').each do |letter| + import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA", + # The master URL of the source file (one for every letter) + :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm", + # The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details. + :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new), + # If it's not UTF-8, you should say what it is so that we can iconv it! + :encoding => 'windows-1252', + # Nokogiri is being used to grab each row starting from the second + :row_xpath => '//table/tr[2]/td/table/tr', + # ditto... XPath for Nokogiri + :column_xpath => 'td' ) do + # The code that they use is in fact the ICAO code! + key 'icao_code', :field_name => 'Designator' + # We get this for free + store 'manufacturer_name', :field_name => 'Manufacturer' + # ditto + store 'name', :field_name => 'Model' + # Use the loose-tight dictionary. + # It gets the entire input row to play with before deciding on an output. + store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code) + store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name) + end + end - $ rake data_miner:run RESOURCES=Airport,Country + # Pull in some data that might only be important to Brighter Planet + import "Brighter Planet's aircraft class codes", + :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do + key 'bts_aircraft_type_code', :field_name => 'bts_aircraft_type' + store 'brighter_planet_aircraft_class_code' + end -==Complete example + # Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel). + # This data comes from the EEA. + import "pre-calculated fuel use equation coefficients", + :url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods', + :select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do + # We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above. + key 'icao_code', :matcher => Aircraft::FuelUseMatcher.new + # Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft. + store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name' + store 'm3' + store 'm2' + store 'm1' + store 'endpoint_fuel', :field_name => 'b' + end + + # Use arel and the weighted_average gem to do some crazy averaging. + # This assumes that you're dealing with the BTS T-100 flight segment data. + # See http://data.brighterplanet.com/flight_segments for a pre-sanitized version. + process "Derive some average flight characteristics from flight segments" do + FlightSegment.run_data_miner! + aircraft = Aircraft.arel_table + segments = FlightSegment.arel_table - ~ $ rails testapp - ~ $ cd testapp/ - ~/testapp $ ./script/generate model Airport iata_code:string name:string city:string country_name:string latitude:float longitude:float - [...edit migration to make iata_code the primary key...] - ~/testapp $ ./script/generate model Country iso_3166:string name:string - [...edit migration to make iso_3166 the primary key...] - ~/testapp $ rake db:migrate - ~/testapp $ touch lib/tasks/data_miner_tasks.rb - [...edit per quick start...] - ~/testapp $ rake data_miner:run RESOURCES=Airport,Country - -Now you should have - - ~/testapp $ ./script/console - Loading development environment (Rails 2.3.3) - >> Airport.first.iata_code - => "GKA" - >> Airport.first.country_name - => "Papua New Guinea" - -==Wishlist - -* each_record do |record| ... which would use find_in_batches -* when proxying add_column, rename_column, etc. automatically include the table name + conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code]) + update_all "seats = (#{FlightSegment.weighted_average_relation(:seats, :weighted_by => :passengers ).where(conditional_relation).to_sql})" + update_all "distance = (#{FlightSegment.weighted_average_relation(:distance, :weighted_by => :passengers ).where(conditional_relation).to_sql})" + update_all "load_factor = (#{FlightSegment.weighted_average_relation(:load_factor, :weighted_by => :passengers ).where(conditional_relation).to_sql})" + update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers ).where(conditional_relation).to_sql})" + update_all "payload = (#{FlightSegment.weighted_average_relation(:payload, :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})" + + update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})" + end + + # And finally re-run the import of resources that depend on this resource. + # Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block; + # that's the right way to do dependencies. It won't get called twice in the same run. + [ AircraftManufacturer ].each do |synthetic_resource| + process "Synthesize #{synthetic_resource}" do + synthetic_resource.run_data_miner! + end + end + end + end ==Authors * Seamus Abshere <seamus@abshere.net> * Andy Rossmeissl <andy@rossmeissl.net>