lib/statsample/codification.rb in statsample-0.5.1 vs lib/statsample/codification.rb in statsample-0.6.0

- old
+ new

@@ -25,96 +25,145 @@
 # File.open(recode_file,"r") {|fp|
 #   Statsample::Codification.recode_dataset_split!(ds,fp,"*")
 # }
 # end
 #
-  module Codification
-    class << self
-      # Create a yaml dump for a hash, based on vectors
-      # The keys will be vectors name on dataset and the values
-      # will be hashes, with keys = values, for recodification
-      #
-      #   v1=%w{a,b b,c d}.to_vector
-      #   ds={"v1"=>v1}.to_dataset
-      #   Statsample::Codification.create_yaml(ds,['v1'])
-      #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
-      def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
-        raise ArgumentError,"Array should't be empty" if vectors.size==0
-        pro_hash=vectors.inject({}){|h,v_name|
-          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
-          v=dataset[v_name]
-          split_data=v.splitted(sep)
-          factors=split_data.flatten.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac}
-          h[v_name]=factors
-          h
-        }
-        YAML.dump(pro_hash,io)
+  module Codification
+    class << self
+      # Create a hash, based on vectors, to create the dictionary.
+      # The keys will be vectors name on dataset and the values
+      # will be hashes, with keys = values, for recodification
+      def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
+        raise ArgumentError,"Array should't be empty" if vectors.size==0
+        pro_hash=vectors.inject({}){|h,v_name|
+          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
+          v=dataset[v_name]
+          split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
+
+          factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
+          h[v_name]=factors
+          h
+        }
+        pro_hash
+      end
+      # Create a yaml to create a dictionary, based on vectors
+      # The keys will be vectors name on dataset and the values
+      # will be hashes, with keys = values, for recodification
+      #
+      #   v1=%w{a,b b,c d}.to_vector
+      #   ds={"v1"=>v1}.to_dataset
+      #   Statsample::Codification.create_yaml(ds,['v1'])
+      #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
+      def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
+        pro_hash=create_hash(dataset, vectors, sep)
+        YAML.dump(pro_hash,io)
+      end
+      # Create a excel to create a dictionary, based on vectors.
+      # Raises an error if filename exists
+      # The rows will be:
+      # * field: name of vector
+      # * original: original name
+      # * recoded: new code
+
+      def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
+        require 'spreadsheet'
+        if File.exists?(filename)
+          raise "Exists a file named #{filename}. Delete ir before overwrite."
+        end
+        book = Spreadsheet::Workbook.new
+        sheet = book.create_worksheet
+        sheet.row(0).concat(%w{field original recoded})
+        i=1
+        create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
+          inner_hash.sort.each do |k,v|
+            sheet.row(i).concat([field.dup,k.dup,v.dup])
+            i+=1
+          end
+        end
+        book.write(filename)
+      end
+      # From a excel generates a dictionary hash
+      # to use on recode_dataset_simple!() or recode_dataset_split!().
+      #
+      def excel_to_recoded_hash(filename)
+        require 'spreadsheet'
+        h={}
+        book = Spreadsheet.open filename
+        sheet= book.worksheet 0
+        row_i=0
+        sheet.each do |row|
+          row_i+=1
+          next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
+          h[row[0]]={} if h[row[0]].nil?
+          h[row[0]][row[1]]=row[2]
+        end
+        h
+      end
+
+      def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
+        h.inject({}) do |a,v|
+          v[1].split(sep).each do |val|
+            a[val]||=[]
+            a[val].push(v[0])
+          end
+          a
+        end
+      end
+
+      def dictionary(h, sep=Statsample::SPLIT_TOKEN)
+        h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
+      end
+
+      def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
+        dict=dictionary(h,sep)
+        new_data=v.splitted(sep)
+        recoded=new_data.collect do |c|
+          if c.nil?
+            nil
+          else
+            c.collect{|value| dict[value] }.flatten.uniq
+          end
+        end
+      end
+      def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
+        _recode_dataset(dataset,dictionary_hash ,sep,false)
+      end
+      def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
+        _recode_dataset(dataset, dictionary_hash, sep,true)
+      end
+
+      def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
+        v_names||=h.keys
+        v_names.each do |v_name|
+          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
+          recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
+            if c.nil?
+              nil
+            else
+              c.join(sep)
+            end
+          }.to_vector
+          if(split)
+            recoded.split_by_separator(sep).each {|k,v|
+              dataset[v_name+"_"+k]=v
+            }
+          else
+            dataset[v_name+"_recoded"]=recoded
       end
-      def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
-        h.inject({}) {|a,v|
-          v[1].split(sep).each {|val|
-            a[val]||=[]
-            a[val].push(v[0])
-          }
-          a
-        }
-      end
-      def dictionary(h,sep=Statsample::SPLIT_TOKEN)
-        h.inject({}) {|a,v|
-          a[v[0]]=v[1].split(sep)
-          a
-        }
-      end
-      def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
-        dict=dictionary(h,sep)
-        new_data=v.splitted(sep)
-        recoded=new_data.collect{|c|
-          if c.nil?
-            nil
-          else
-            c.collect{|value|
-              dict[value]
-            }.flatten.uniq
-          end
-        }
-      end
-      def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
-        _recode_dataset(dataset,yaml,sep,false)
-      end
-      def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
-        _recode_dataset(dataset,yaml,sep,true)
-      end
-
-      def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
-        h=YAML::load(yaml)
-        v_names||=h.keys
-        v_names.each do |v_name|
-          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
-          recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
-            if c.nil?
-              nil
-            else
-              c.join(sep)
-            end
-          }.to_vector
-          if(split)
-            recoded.split_by_separator(sep).each {|k,v|
-              dataset[v_name+"_"+k]=v
-            }
-          else
-            dataset[v_name+"_recoded"]=recoded
-          end
-        end
-      end
-      def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
-        require 'pp'
-        h=YAML::load(yaml)
-        v_names||=h.keys
-        v_names.each{|v_name|
-          inverse=inverse_hash(h[v_name],sep)
-          io.puts "Vector: #{v_name}"
-          YAML.dump(inverse.sort,io)
-        }
-      end
+        end
      end
+
+
+      def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
+        require 'pp'
+        v_names||=h.keys
+        v_names.each{|v_name|
+          inverse=inverse_hash(h[v_name],sep)
+          io.puts "- Field: #{v_name}"
+          inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
+            io.puts "  - \"#{k}\" (#{v.count}) :\n    -'"+v.join("\n    -'")+"'"
+          }
+        }
+      end
    end
+  end
 end
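
What the diff amounts to: 0.6.0 extracts the hash-building logic of create_yaml into the new create_hash, and the recode/verify methods now take that dictionary Hash directly instead of a YAML string (in 0.5.1, _recode_dataset and verify called YAML::load internally). Note also that create_yaml's trailing parameters swapped order, from (sep, io) to (io, sep). A minimal migration sketch against the 0.6.0 signatures, reusing the ds/v1 names from the rdoc example; the "alpha" recoding and the dict.yaml path are illustrative only:

    require 'statsample'
    require 'yaml'

    v1 = %w{a,b b,c d}.to_vector
    ds = {"v1" => v1}.to_dataset

    # 0.6.0: build the dictionary as a plain Hash and edit it in place
    dict = Statsample::Codification.create_hash(ds, ['v1'])
    dict["v1"]["a"] = "alpha"                   # hypothetical manual recoding

    # the recoders now take the Hash itself, not a YAML string
    Statsample::Codification.recode_dataset_split!(ds, dict)

    # callers that kept YAML dictionaries must load them themselves now
    dict2 = YAML::load(File.read("dict.yaml"))  # "dict.yaml" is illustrative
    Statsample::Codification.recode_dataset_simple!(ds, dict2)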
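
The other addition is an Excel round trip built on the spreadsheet gem: create_excel writes the dictionary to a three-column sheet (field, original, recoded) and raises if the file already exists, while excel_to_recoded_hash reads such a sheet back into the Hash the recoders expect, skipping the header row and incomplete rows. A sketch of the intended workflow, continuing from the ds above; "codes.xls" is a hypothetical filename:

    require 'statsample'

    # dump field/original/recoded rows for hand editing
    Statsample::Codification.create_excel(ds, ['v1'], "codes.xls")

    # ...edit the "recoded" column in a spreadsheet program...

    dict = Statsample::Codification.excel_to_recoded_hash("codes.xls")
    Statsample::Codification.recode_dataset_simple!(ds, dict)

    # verify takes the Hash too; per field it lists each recoded value
    # with the original codes mapping to it, most frequent first
    Statsample::Codification.verify(dict)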