require "smarter_csv" require "tempfile" require "pry" require_relative "helpers/datatype_validation" require_relative "analyzer/csv_check_bounds" require_relative "helpers/common_functions" require_relative "sql_query_builder" module CsvImportAnalyzer class CsvDatatypeAnalysis include CsvImportAnalyzer::Helper include CsvImportAnalyzer::DatatypeValidator attr_accessor :csv_column_datatypes, :nullable def initialize(options) @options = options @csv_column_datatypes = {} @nullable = [] end def options @options end def filename @options[:filename] end # Process a chunk of csv file for all possible datatypes towards each column in the row def datatype_analysis SmarterCSV.process(filename, {:col_sep => delimiter, :chunk_size => chunk_size, :remove_empty_values => false, :remove_zero_values => false}) do |chunk| chunk.each do |row| row.each do |key, value| unless null_like?(value) datatype = determine_dataype(value) add_to_datatype(key, datatype.to_sym) else nullable.push(key) unless nullable.include?(key) end end end break end options[:csv_datatype_analysis] = csv_column_datatypes.clone # To retain the current state of csv_column_datatypes since it's altered further finalize_datatypes_for_csv options[:csv_column_datatypes] = csv_column_datatypes options[:nullable] = nullable take_further_actions end private def delimiter return options[:delimiter] end def chunk_size return options[:chunk] end #Call DatatypeValidator in helper module to process the possible datatype for the value #Is this the right way to hide dependency on the external classes or objects #May be a static would do ? Should I create an object and call method on the object each time rather than instantiate a new object each time ?? def determine_dataype(value) return validate_field(value) end # Build the hash of hashes which hold the count of different possible datatypes for each row def add_to_datatype(key, datatype) if csv_column_datatypes[key].nil? csv_column_datatypes[key] = {datatype => 1} else if csv_column_datatypes[key][datatype].nil? csv_column_datatypes[key][datatype] = 1 else csv_column_datatypes[key][datatype] += 1 end end end #Finalize the datatype for each column, A column datatype would be set to varchar or string if atleast of it's values tend to be string #If the column doesn't have any possible strings then assign the datatype to column with maximum count of identified possibilites def finalize_datatypes_for_csv csv_column_datatypes.map { |column_name, possible_datatypes| #If there is string type even atleast 1 there is no other option but to set the datatype to string => varchar if possible_datatypes.has_key?(:string) csv_column_datatypes[column_name] = :string else #set the max occurance datatype as the datatype of column csv_column_datatypes[column_name] = possible_datatypes.key(possible_datatypes.values.max) end } end #Decide if simple datatype analysis is enough or proced further def take_further_actions if options[:check_bounds] min_max_bounds = CsvImportAnalyzer::CsvCheckBounds.new(options) res = min_max_bounds.get_min_max_values options[:min_max_bounds] = res[:min_max] options[:uniques] = res[:uniques] end query = CsvImportAnalyzer::SqlQueryBuilder.new(options) query.generate_query end end end