# frozen_string_literal: true

# Performs sampling technique and generates CSV file with parameter options for each building.

# The file has to follow general Ruby conventions.
# File name must be the snake case (underscore case) of the class name. For example: WorkerInit = worker_init

require 'csv'
require_relative 'buildstock'

# Generates a CSV file assigning one option per parameter to every building,
# using quota sampling over the probability distributions found in the
# housing characteristics TSV files.
class RunSampling
  # Entry point: samples every parameter that has a TSV file and writes the CSV.
  #
  # project_dir_name            - project directory name; used to locate the
  #                               characteristics dir when run locally, and in
  #                               progress messages
  # num_samples                 - integer number of buildings to sample
  # outfile                     - output CSV path, joined onto this file's directory
  # housing_characteristics_dir - name of the directory holding the parameter TSV files
  # lookup_file                 - path to the options lookup TSV; defaults to
  #                               resources/options_lookup.tsv when nil
  #
  # Returns the absolute path of the CSV file that was written.
  def run(project_dir_name, num_samples, outfile, housing_characteristics_dir = 'housing_characteristics', lookup_file = nil)
    resources_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', 'resources')) # Should have been uploaded per 'Additional Analysis Files' in PAT
    characteristics_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', housing_characteristics_dir)) # Should have been uploaded per 'Additional Analysis Files' in PAT
    unless File.exist?(characteristics_dir)
      characteristics_dir = File.absolute_path(File.join(File.dirname(__FILE__), '..', project_dir_name, housing_characteristics_dir)) # Being run locally?
    end

    lookup_file = File.join(resources_dir, 'options_lookup.tsv') if lookup_file.nil?

    # CSV.read closes the file once it has been parsed (CSV.open without a
    # block would leave the handle open).
    lookup_csv_data = CSV.read(lookup_file, col_sep: "\t")

    params = get_parameters_ordered_from_options_lookup_tsv(lookup_csv_data)

    tsvfiles = {}
    params.each do |param|
      tsvpath = File.join(characteristics_dir, param + '.tsv')
      next unless File.exist?(tsvpath) # Not every parameter used by every mode

      tsvfiles[param] = TsvFile.new(tsvpath, nil)
    end

    # Only parameters that actually have a TSV file are sampled
    params = tsvfiles.keys
    register_error('No parameters found, aborting...', nil) if params.empty?

    params = update_parameter_dependencies(params, tsvfiles)

    # perform_sampling returns one array per column; transpose to get CSV rows
    sample_results = perform_sampling(params, num_samples, tsvfiles, project_dir_name).transpose
    write_csv(sample_results, outfile)
  end

  # Returns a hash with key:param, value:Array of the names of the parameters
  # it depends on (the keys of the TSV file's dependency_cols).
  def update_parameter_dependencies(params, tsvfiles)
    params_with_deps = {}
    params.each do |param|
      params_with_deps[param] = tsvfiles[param].dependency_cols.keys
    end
    params_with_deps
  end

  # Samples every parameter, processing a parameter only after all of its
  # dependencies have been sampled.
  #
  # params           - hash with key:param, value:Array of dependency names
  # num_samples      - integer number of buildings
  # tsvfiles         - hash with key:param, value:TsvFile
  # project_dir_name - only used in progress messages
  #
  # Returns an array of columns. The first column is
  # ['Building', 1, 2, ..., num_samples]; every other column is
  # [param, option for bldg 1, option for bldg 2, ...].
  def perform_sampling(params, num_samples, tsvfiles, project_dir_name)
    results_data = []
    results_data_cols = {} # column name => index into results_data

    # Add building numbers
    results_data_bldgs = ['Building'] + (1..num_samples).to_a
    results_data << results_data_bldgs
    results_data_cols[results_data_bldgs[0]] = results_data.size - 1

    random_seed = 57 # Using a hard-coded seed so that we get repeatable results
    processed_params = []
    bldgs_hash = {} # dependency names => { dependency values => Array[bldgs] }

    while processed_params.size != params.size
      made_progress = false

      params.each do |param, param_deps|
        # Already processed? Skip
        next if processed_params.include?(param)

        # Dependencies not yet processed? Skip until a subsequent pass
        next unless param_deps.all? { |param_dep| processed_params.include?(param_dep) }

        puts "Sampling #{project_dir_name}/#{param}..."
        results_data_param = [param] + [nil] * num_samples
        tsvfile = tsvfiles[param]

        if param_deps.empty?
          # No dependencies, perform 'global' sampling
          sample_results = sample_probability_distribution(nil, tsvfile, num_samples)
          random_seed = distribute_samples(random_seed, results_data_param, sample_results, (1..num_samples).to_a)
        else
          # For each combination of dependency values, perform sampling
          dep_hashes = get_combination_hashes(tsvfiles, param_deps)
          bldgs_processed = 0

          # The grouping of buildings only depends on the dependency set, so
          # it is computed once and shared by parameters with the same set.
          unless bldgs_hash.key?(param_deps)
            bldgs_hash[param_deps] = get_bldgs_by_dependency_values(results_data, dep_hashes, results_data_cols)
          end

          dep_hashes.each do |dep_hash|
            # Determine buildings this combo applies to
            bldgs = bldgs_hash[param_deps][dep_hash.values]
            next if bldgs.nil?

            sample_results = sample_probability_distribution(dep_hash, tsvfile, bldgs.size)
            random_seed = distribute_samples(random_seed, results_data_param, sample_results, bldgs)
            bldgs_processed += bldgs.size
          end

          # Ensure correct number of buildings were processed
          if bldgs_processed != num_samples
            register_error('Sampling algorithm unexpectedly failed.', nil)
          end
        end

        # Ensure no missing values
        if results_data_param.include?(nil)
          register_error('Sampling algorithm unexpectedly failed.', nil)
        end

        # Add results for this parameter
        results_data << results_data_param
        results_data_cols[results_data_param[0]] = results_data.size - 1
        processed_params << param
        made_progress = true

        # We just processed a parameter; start back at the beginning to try
        # to keep the parameters better ordered.
        break
      end

      next if made_progress

      # A full pass processed nothing: the remaining parameters depend on a
      # parameter that has no TSV file, or on each other. Without this check
      # the loop would never terminate.
      unresolved = params.keys - processed_params
      register_error("Could not resolve dependencies for parameters: #{unresolved.join(', ')}.", nil)
      break # Never loop forever, even if register_error does not raise
    end

    results_data
  end

  # Groups building numbers by the values already sampled for the
  # dependency parameters.
  #
  # results_data      - array of columns as built by perform_sampling
  # dep_hashes        - array of hashes; only the keys (dependency names) of
  #                     the first one are used here
  # results_data_cols - hash with key:column name, value:index into results_data
  #
  # Returns a hash with key:Array of dependency values, value:Array[bldgs]
  def get_bldgs_by_dependency_values(results_data, dep_hashes, results_data_cols)
    dep_columns = dep_hashes[0].keys.map do |dep_name|
      results_data[results_data_cols[dep_name]]
    end

    # After transposing, row 0 holds the column names and row i the
    # dependency values of building i.
    rows = dep_columns.transpose

    bldgs_by_values = {}
    rows[1..-1].each_with_index do |dep_values, idx|
      (bldgs_by_values[dep_values] ||= []) << idx + 1
    end
    bldgs_by_values
  end

  # Returns the row of data in the tsvfile with the given dependency values.
  # When dep_hash is nil (parameter without dependencies) the first row is returned.
  def get_tsvrow_with_dependency_values(tsvfile, dep_hash)
    return tsvfile.rows[0] if dep_hash.nil?

    # Row keys are looked up in lower case
    key_s_downcase = hash_to_string(dep_hash).downcase

    rownum = tsvfile.rows_keys_s[key_s_downcase]
    if rownum.nil?
      register_error("Could not find row in #{tsvfile.filename} with dependency values: #{dep_hash}.", nil)
    end

    tsvfile.rows[rownum]
  end

  # Binary search over an array sorted in DESCENDING order.
  #
  # Returns the index at which value can be inserted while keeping arr in
  # descending order, i.e. the index of the first element that is not
  # greater than value (arr.size if every element is greater).
  # Returns 0 for a nil or empty array.
  def binary_search(arr, value)
    return 0 if arr.nil? || arr.empty?

    lo = 0
    hi = arr.size - 1
    while lo < hi
      m = (hi + lo) / 2
      if arr[m] > value
        lo = m + 1
      else
        hi = m - 1
      end
    end

    # When the loop ends with lo == hi, that element has not been compared yet
    lo += 1 if arr[lo] > value
    lo
  end

  # Determines how many samples each option of a parameter receives.
  #
  # dep_hash    - hash of dependency name => value selecting the TSV row, or
  #               nil for a parameter without dependencies
  # tsvfile     - TsvFile of the parameter
  # num_samples - integer number of samples to allocate
  #
  # Returns a dictionary with key:option_name, value:num_samples.
  def sample_probability_distribution(dep_hash, tsvfile, num_samples)
    # Create prob_dist array needed by _sample_probability_distribution method.
    tsvrow = get_tsvrow_with_dependency_values(tsvfile, dep_hash)
    prob_dist = []
    tsvfile.option_cols.each do |option_name, option_col|
      prob_val = tsvrow[option_col].to_f
      next if prob_val <= 0 # Options without a positive probability are never sampled

      prob_dist << [option_name, prob_val]
    end

    # Without this check, an all-zero row fails further down with a
    # NoMethodError on nil that does not point at the offending file.
    if prob_dist.empty?
      register_error("No positive probabilities found in #{tsvfile.filename} for dependency values: #{dep_hash}.", nil)
    end

    _sample_probability_distribution(prob_dist, num_samples)
  end

  # Instead of using Monte Carlo, which for small sample sizes can randomly choose
  # a low probability item, we use a quota sampling algorithm to more strategically
  # (non-randomly) choose the number of samples for each item in order to best
  # represent the input distribution.
  #
  # prob_dist   - array of arrays where the inner arrays are of the
  #               form [option_name, probability]
  # num_samples - integer for the total number of samples
  #
  # Returns a dictionary with key:option_name, value:num_samples.
  def _sample_probability_distribution(prob_dist, num_samples)
    if prob_dist.size == 1
      return { prob_dist[0][0] => num_samples } # Simply return num_samples for only item
    end

    return_samples = {}
    prob_dist.each do |item|
      return_samples[item[0]] = 0
    end

    # Sort array in descending order
    # Using stable sort algorithm from https://groups.google.com/g/comp.lang.ruby/c/JcDGbaFHifI/m/2gKpc9FQbCoJ
    n = 0
    prob_dist = prob_dist.sort_by { |x| n += 1; [x[1], n] }.reverse

    if num_samples == 1
      return { prob_dist[0][0] => 1 } # Simply return 1 sample for max item
    end

    # We'll never choose to sample an item beyond the first
    # num_samples number of items, so discard the rest.
    if prob_dist.size > num_samples
      prob_dist.slice!(num_samples..prob_dist.size - 1)
    end

    sum = prob_dist.transpose[1].inject(0, :+)

    remaining_samples = num_samples
    while remaining_samples > 0
      # Choose highest probability item (first in array)
      max_item = prob_dist[0]

      # Increment the number of samples for the highest probability
      return_samples[max_item[0]] += 1

      # Calculate new probability
      target_num_samples = remaining_samples * max_item[1] / sum
      new_probability = (target_num_samples - 1) / target_num_samples * max_item[1]

      # Remove item, insert back into the appropriate sorted
      # position based on its new probability value.
      # (transpose[1] is nil once the array is empty; binary_search then returns 0.)
      prob_dist.delete_at(0)
      index = binary_search(prob_dist.transpose[1], new_probability)
      prob_dist.insert(index, [max_item[0], new_probability])

      # Update sum and remaining_samples
      sum += (new_probability - max_item[1])
      remaining_samples -= 1

      # We'll never choose to sample an item beyond the first
      # remaining_samples number of items, so discard the rest.
      prob_dist.pop if prob_dist.size > remaining_samples
    end

    # Remove items with no samples
    return_samples.delete_if { |_k, v| v == 0 }

    if return_samples.values.reduce(:+) != num_samples
      register_error('Sampling algorithm unexpectedly failed.', nil)
    end

    return_samples
  end

  # Randomly distributes sample_results to the specified bldgs.
  #
  # random_seed        - integer seed for the shuffle, so results are repeatable
  # results_data_param - column array for the parameter; modified in place,
  #                      results_data_param[bldg] is set to the chosen option
  # sample_results     - hash with key:option_name, value:num_samples;
  #                      modified in place (counts are decremented as used)
  # bldgs              - Array of building numbers to assign
  #
  # Returns the seed to use for the next call (random_seed + 1).
  def distribute_samples(random_seed, results_data_param, sample_results, bldgs)
    bldgs.shuffle(random: Random.new(random_seed)).each do |bldg|
      # Give this building the first option that still has samples left
      sample_results.each do |option_name, option_num_samples|
        next if option_num_samples <= 0

        results_data_param[bldg] = option_name
        sample_results[option_name] -= 1 # one less sample to distribute
        break
      end
    end
    random_seed + 1
  end

  # Writes the csv output file.
  #
  # sample_results - array of rows (header row first)
  # outfile        - path joined onto the directory of this file
  #
  # Returns the absolute path of the file that was written.
  def write_csv(sample_results, outfile)
    out_file = File.absolute_path(File.join(File.dirname(__FILE__), outfile))
    CSV.open(out_file, 'w') do |csv_object|
      sample_results.each do |sample_result|
        csv_object << sample_result
      end
    end
    puts "Wrote output file #{File.basename(out_file)}."
    out_file
  end
end