lib/earth/air/flight_segment/data_miner.rb in earth-0.5.0 vs lib/earth/air/flight_segment/data_miner.rb in earth-0.5.2

- old
+ new

@@ -1,9 +1,17 @@ -# need this to run flight_segment.cache_aircraft! -require 'loose_tight_dictionary/cached_result' - FlightSegment.class_eval do + # For import errata + class FlightSegment::Guru + def in_may_2009?(row) + row ['MONTH'] == 5 and row['YEAR'] == 2009 + end + + def in_july_2009?(row) + row ['MONTH'] == 7 and row['YEAR'] == 2009 + end + end + URL = 'http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=293&Has_Group=3&Is_Zipped=0' FORM_DATA = %{ UserTableName=T_100_Segment__All_Carriers& DBShortName=Air_Carriers& RawDataTable=T_T100_SEGMENT_ALL_CARRIER& @@ -160,55 +168,12 @@ VarDesc=DataSource& VarType=Char }.gsub /[\s]+/,'' data_miner do - schema Earth.database_options do - string 'row_hash' # auto-generated primary key - string 'origin_airport_iata_code' # iata code - string 'origin_airport_city' # city - string 'origin_country_iso_3166_code' # iso code - string 'destination_airport_iata_code' # iata code - string 'destination_airport_city' # city - string 'destination_country_iso_3166_code' # iso code - string 'airline_bts_code' # bts code - string 'airline_icao_code' # icao code - string 'airline_name' # text description derived from bts or icao code - string 'aircraft_bts_code' # bts code - string 'aircraft_description' # text description derived from BTS T100 or ICAO TFS - integer 'flights' # number of flights over month or year - integer 'passengers' # total passengers on all flights - integer 'seats' # total seats on all flights - float 'seats_per_flight' # average seats per flight; make this a float - float 'load_factor' # passengers / seats - float 'freight_share' # (freight + mail) / (freight + mail + (passengers * average passenger weight)) - float 'distance' # flight distance - string 'distance_units' # 'kilometres' - float 'payload_capacity' # aircraft maximum payload capacity rating; float b/c unit conversion - string 'payload_capacity_units' # 'kilograms' - float 'freight' # total freight on all flights performed; float b/c unit conversion - string 'freight_units' # 'kilograms' - float 'mail' # total mail on all flights performed; float b/c unit conversion - string 'mail_units' # 'kilograms' - integer 'month' # month of flight - integer 'year' # year of flight - date 'approximate_date' # assumed 14th day of month - string 'source' # 'BTS T100' or 'ICAO TFS' - index 'origin_airport_iata_code' # index for faster lookup by origin airport - index 'origin_airport_city' # index for faster lookup by origin city - index 'destination_airport_iata_code' # index for faster lookup by destination airport - index 'destination_airport_city' # index for faster lookup by destination city - index 'airline_bts_code' # index for faster lookup by airline bts code - index 'airline_icao_code' # index for faster lookup by airline icao code - index 'airline_name' # index for faster lookup by airline name - index 'aircraft_bts_code' # index for faster lookup by aircraft bts code - index 'aircraft_description' # index for faster lookup by aircraft - index 'year' # index for faster lookup by year - end - months = Hash.new - (2009..2010).each do |year| + (2009..2011).each do |year| (1..12).each do |month| time = Time.gm year, month form_data = FORM_DATA.dup form_data.gsub! '__YEAR__', time.year.to_s form_data.gsub! '__MONTH_NUMBER__', time.month.to_s @@ -221,10 +186,12 @@ import "T100 flight segment data for #{month.strftime('%B %Y')}", :url => URL, :form_data => form_data, :compression => :zip, :glob => '/*.csv', + # FIXME TODO 6/6/2011 the errata doesn't work - still doesn't fill in missing airline in May and July 2009 + :errata => { :url => 'https://spreadsheets.google.com/spreadsheet/pub?key=0AoQJbWqPrREqdGxpYU1qWFR3d0syTVMyQVVOaDd0V3c&output=csv', :responder => FlightSegment::Guru.new }, :select => lambda { |record| record['DEPARTURES_PERFORMED'].to_i > 0 } do key 'row_hash' store 'origin_airport_iata_code', :field_name => 'ORIGIN' store 'origin_country_iso_3166_code', :field_name => 'ORIGIN_COUNTRY' store 'destination_airport_iata_code', :field_name => 'DEST' @@ -242,37 +209,28 @@ store 'year', :field_name => 'YEAR' store 'source', :static => 'BTS T100' end end - # verify origin_airport_iata_code is in airports - # verify destination_airport_iata_code is in airports - # verify origin_country_iso_3166_code is in countries - # verify destination_country_iso_3166_code is in countries - # verify airline_bts_code appears in airlines - # verify aircraft_description is never missing - # verify year is never missing - - process "Ensure Airline is populated" do + process "Ensure Airline and BtsAircraft are populated" do Airline.run_data_miner! + BtsAircraft.run_data_miner! end process "Look up airline name based on BTS code" do connection.select_values("SELECT DISTINCT airline_bts_code FROM flight_segments WHERE airline_bts_code IS NOT NULL").each do |bts_code| - name = Airline.find_by_bts_code(bts_code).name - update_all %{ airline_name = "#{name}" }, %{ airline_bts_code = "#{bts_code}" } + if airline = Airline.find_by_bts_code(bts_code) + update_all %{ airline_name = "#{airline.name}" }, %{ airline_bts_code = "#{bts_code}" } + end end end - process "Ensure BtsAircraft is populated" do - BtsAircraft.run_data_miner! - end - process "Look up aircraft description based on BTS code" do connection.select_values("SELECT DISTINCT aircraft_bts_code FROM flight_segments WHERE aircraft_bts_code IS NOT NULL").each do |bts_code| - description = BtsAircraft.find_by_bts_code(bts_code).description.downcase - update_all %{ aircraft_description = "#{description}" }, %{ aircraft_bts_code = "#{bts_code}" } + if aircraft = BtsAircraft.find_by_bts_code(bts_code) + update_all %{ aircraft_description = "#{aircraft.description.downcase}" }, %{ aircraft_bts_code = "#{bts_code}" } + end end end %w{ payload_capacity freight mail }.each do |field| process "Convert #{field} from pounds to kilograms" do @@ -314,58 +272,18 @@ process "Add a useful date field" do update_all 'approximate_date = DATE(CONCAT_WS("-", year, month, "14"))', 'month IS NOT NULL' end - process "Ensure Aircraft is populated" do + process "Data mine Aircraft to cache fuzzy matches" do Aircraft.run_data_miner! end - process "Cache fuzzy matches between FlightSegment aircraft_description and Aircraft description" do - LooseTightDictionary::CachedResult.setup - FlightSegment.find_by_sql("SELECT DISTINCT aircraft_description FROM flight_segments WHERE aircraft_description IS NOT NULL").each do |flight_segment| - original_description = flight_segment.aircraft_description - - # If the flight segment's aircraft_description contains '/' then it describes multiple aircraft. - # We need to synthesize descriptions for those aircraft, find all Aircraft that fuzzily match the - # synthesized descriptions, and associate those Aircraft with the original aircraft_description. - # e.g. boeing 747-100/200 - if original_description.include?("/") - # Pull out the complete first aircraft description - # e.g. 'boeing 747-100' - first_description = original_description.split('/')[0] - - # Pull out the root of the description - the text up to and including the last ' ' or '-' - # e.g. 'boeing 747-' - root_length = first_description.rindex('-') - root = first_description.slice(0..root_length) - - # Pull out the suffixes - the text separated by forward slashes - # e.g. ['100', '200'] - suffixes = original_description.split(root)[1].split('/') - - # Create an array of synthesized descriptions by appending each suffix to the root - # e.g. ['boeing 747-100', 'boeing 747-200'] - suffixes.map{ |suffix| root + suffix }.each do |synthesized_description| - # Look up the Aircraft that match each synthesized description and associate - # them with the original flight segment aircraft_description - Aircraft.loose_tight_dictionary.find_all(synthesized_description).each do |aircraft| - attrs = { - :a_class => "Aircraft", - :a => aircraft.description, - :b_class => "FlightSegment", - :b => original_description - } - unless ::LooseTightDictionary::CachedResult.exists? attrs - ::LooseTightDictionary::CachedResult.create! attrs - end - end - end - # If the flight segment's aircraft_description doesn't contain '/' we can use - # a method provided by loose_tight_dictionary to associate it with Aircraft - else - flight_segment.cache_aircraft! - end - end - end + # verify origin_airport_iata_code is in airports + # verify destination_airport_iata_code is in airports + # verify origin_country_iso_3166_code is in countries + # verify destination_country_iso_3166_code is in countries + # verify airline_name is never missing + # verify aircraft_description is never missing + # verify year is never missing end end