lib/earth/air/flight_segment/data_miner.rb in earth-0.5.0 vs lib/earth/air/flight_segment/data_miner.rb in earth-0.5.2
- old
+ new
@@ -1,9 +1,17 @@
-# need this to run flight_segment.cache_aircraft!
-require 'loose_tight_dictionary/cached_result'
-
FlightSegment.class_eval do
+ # For import errata
+ class FlightSegment::Guru
+ def in_may_2009?(row)
+ row ['MONTH'] == 5 and row['YEAR'] == 2009
+ end
+
+ def in_july_2009?(row)
+ row ['MONTH'] == 7 and row['YEAR'] == 2009
+ end
+ end
+
URL = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0'
FORM_DATA = %{
UserTableName=T_100_Segment__All_Carriers&
DBShortName=Air_Carriers&
RawDataTable=T_T100_SEGMENT_ALL_CARRIER&
@@ -160,55 +168,12 @@
VarDesc=DataSource&
VarType=Char
}.gsub /[\s]+/,''
data_miner do
- schema Earth.database_options do
- string 'row_hash' # auto-generated primary key
- string 'origin_airport_iata_code' # iata code
- string 'origin_airport_city' # city
- string 'origin_country_iso_3166_code' # iso code
- string 'destination_airport_iata_code' # iata code
- string 'destination_airport_city' # city
- string 'destination_country_iso_3166_code' # iso code
- string 'airline_bts_code' # bts code
- string 'airline_icao_code' # icao code
- string 'airline_name' # text description derived from bts or icao code
- string 'aircraft_bts_code' # bts code
- string 'aircraft_description' # text description derived from BTS T100 or ICAO TFS
- integer 'flights' # number of flights over month or year
- integer 'passengers' # total passengers on all flights
- integer 'seats' # total seats on all flights
- float 'seats_per_flight' # average seats per flight; make this a float
- float 'load_factor' # passengers / seats
- float 'freight_share' # (freight + mail) / (freight + mail + (passengers * average passenger weight))
- float 'distance' # flight distance
- string 'distance_units' # 'kilometres'
- float 'payload_capacity' # aircraft maximum payload capacity rating; float b/c unit conversion
- string 'payload_capacity_units' # 'kilograms'
- float 'freight' # total freight on all flights performed; float b/c unit conversion
- string 'freight_units' # 'kilograms'
- float 'mail' # total mail on all flights performed; float b/c unit conversion
- string 'mail_units' # 'kilograms'
- integer 'month' # month of flight
- integer 'year' # year of flight
- date 'approximate_date' # assumed 14th day of month
- string 'source' # 'BTS T100' or 'ICAO TFS'
- index 'origin_airport_iata_code' # index for faster lookup by origin airport
- index 'origin_airport_city' # index for faster lookup by origin city
- index 'destination_airport_iata_code' # index for faster lookup by destination airport
- index 'destination_airport_city' # index for faster lookup by destination city
- index 'airline_bts_code' # index for faster lookup by airline bts code
- index 'airline_icao_code' # index for faster lookup by airline icao code
- index 'airline_name' # index for faster lookup by airline name
- index 'aircraft_bts_code' # index for faster lookup by aircraft bts code
- index 'aircraft_description' # index for faster lookup by aircraft
- index 'year' # index for faster lookup by year
- end
-
months = Hash.new
- (2009..2010).each do |year|
+ (2009..2011).each do |year|
(1..12).each do |month|
time = Time.gm year, month
form_data = FORM_DATA.dup
form_data.gsub! '__YEAR__', time.year.to_s
form_data.gsub! '__MONTH_NUMBER__', time.month.to_s
@@ -221,10 +186,12 @@
import "T100 flight segment data for #{month.strftime('%B %Y')}",
:url => URL,
:form_data => form_data,
:compression => :zip,
:glob => '/*.csv',
+ # FIXME TODO 6/6/2011 the errata doesn't work - still doesn't fill in missing airline in May and July 2009
+ :errata => { :url => 'https://spreadsheets.google.com/spreadsheet/pub?key=0AoQJbWqPrREqdGxpYU1qWFR3d0syTVMyQVVOaDd0V3c&output=csv', :responder => FlightSegment::Guru.new },
:select => lambda { |record| record['DEPARTURES_PERFORMED'].to_i > 0 } do
key 'row_hash'
store 'origin_airport_iata_code', :field_name => 'ORIGIN'
store 'origin_country_iso_3166_code', :field_name => 'ORIGIN_COUNTRY'
store 'destination_airport_iata_code', :field_name => 'DEST'
@@ -242,37 +209,28 @@
store 'year', :field_name => 'YEAR'
store 'source', :static => 'BTS T100'
end
end
- # verify origin_airport_iata_code is in airports
- # verify destination_airport_iata_code is in airports
- # verify origin_country_iso_3166_code is in countries
- # verify destination_country_iso_3166_code is in countries
- # verify airline_bts_code appears in airlines
- # verify aircraft_description is never missing
- # verify year is never missing
-
- process "Ensure Airline is populated" do
+ process "Ensure Airline and BtsAircraft are populated" do
Airline.run_data_miner!
+ BtsAircraft.run_data_miner!
end
process "Look up airline name based on BTS code" do
connection.select_values("SELECT DISTINCT airline_bts_code FROM flight_segments WHERE airline_bts_code IS NOT NULL").each do |bts_code|
- name = Airline.find_by_bts_code(bts_code).name
- update_all %{ airline_name = "#{name}" }, %{ airline_bts_code = "#{bts_code}" }
+ if airline = Airline.find_by_bts_code(bts_code)
+ update_all %{ airline_name = "#{airline.name}" }, %{ airline_bts_code = "#{bts_code}" }
+ end
end
end
- process "Ensure BtsAircraft is populated" do
- BtsAircraft.run_data_miner!
- end
-
process "Look up aircraft description based on BTS code" do
connection.select_values("SELECT DISTINCT aircraft_bts_code FROM flight_segments WHERE aircraft_bts_code IS NOT NULL").each do |bts_code|
- description = BtsAircraft.find_by_bts_code(bts_code).description.downcase
- update_all %{ aircraft_description = "#{description}" }, %{ aircraft_bts_code = "#{bts_code}" }
+ if aircraft = BtsAircraft.find_by_bts_code(bts_code)
+ update_all %{ aircraft_description = "#{aircraft.description.downcase}" }, %{ aircraft_bts_code = "#{bts_code}" }
+ end
end
end
%w{ payload_capacity freight mail }.each do |field|
process "Convert #{field} from pounds to kilograms" do
@@ -314,58 +272,18 @@
process "Add a useful date field" do
update_all 'approximate_date = DATE(CONCAT_WS("-", year, month, "14"))', 'month IS NOT NULL'
end
- process "Ensure Aircraft is populated" do
+ process "Data mine Aircraft to cache fuzzy matches" do
Aircraft.run_data_miner!
end
- process "Cache fuzzy matches between FlightSegment aircraft_description and Aircraft description" do
- LooseTightDictionary::CachedResult.setup
- FlightSegment.find_by_sql("SELECT DISTINCT aircraft_description FROM flight_segments WHERE aircraft_description IS NOT NULL").each do |flight_segment|
- original_description = flight_segment.aircraft_description
-
- # If the flight segment's aircraft_description contains '/' then it describes multiple aircraft.
- # We need to synthesize descriptions for those aircraft, find all Aircraft that fuzzily match the
- # synthesized descriptions, and associate those Aircraft with the original aircraft_description.
- # e.g. boeing 747-100/200
- if original_description.include?("/")
- # Pull out the complete first aircraft description
- # e.g. 'boeing 747-100'
- first_description = original_description.split('/')[0]
-
- # Pull out the root of the description - the text up to and including the last ' ' or '-'
- # e.g. 'boeing 747-'
- root_length = first_description.rindex('-')
- root = first_description.slice(0..root_length)
-
- # Pull out the suffixes - the text separated by forward slashes
- # e.g. ['100', '200']
- suffixes = original_description.split(root)[1].split('/')
-
- # Create an array of synthesized descriptions by appending each suffix to the root
- # e.g. ['boeing 747-100', 'boeing 747-200']
- suffixes.map{ |suffix| root + suffix }.each do |synthesized_description|
- # Look up the Aircraft that match each synthesized description and associate
- # them with the original flight segment aircraft_description
- Aircraft.loose_tight_dictionary.find_all(synthesized_description).each do |aircraft|
- attrs = {
- :a_class => "Aircraft",
- :a => aircraft.description,
- :b_class => "FlightSegment",
- :b => original_description
- }
- unless ::LooseTightDictionary::CachedResult.exists? attrs
- ::LooseTightDictionary::CachedResult.create! attrs
- end
- end
- end
- # If the flight segment's aircraft_description doesn't contain '/' we can use
- # a method provided by loose_tight_dictionary to associate it with Aircraft
- else
- flight_segment.cache_aircraft!
- end
- end
- end
+ # verify origin_airport_iata_code is in airports
+ # verify destination_airport_iata_code is in airports
+ # verify origin_country_iso_3166_code is in countries
+ # verify destination_country_iso_3166_code is in countries
+ # verify airline_name is never missing
+ # verify aircraft_description is never missing
+ # verify year is never missing
end
end