lib/statsample/codification.rb in statsample-0.5.1 vs lib/statsample/codification.rb in statsample-0.6.0

- old
+ new

@@ -25,96 +25,145 @@
 # File.open(recode_file,"r") {|fp|
 #   Statsample::Codification.recode_dataset_split!(ds,fp,"*")
 # }
 # end
 #
-  module Codification
-    class << self
-      # Create a yaml dump for a hash, based on vectors
-      # The keys will be vectors name on dataset and the values
-      # will be hashes, with keys = values, for recodification
-      #
-      #   v1=%w{a,b b,c d}.to_vector
-      #   ds={"v1"=>v1}.to_dataset
-      #   Statsample::Codification.create_yaml(ds,['v1'])
-      #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
-      def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
-        raise ArgumentError,"Array should't be empty" if vectors.size==0
-        pro_hash=vectors.inject({}){|h,v_name|
-          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
-          v=dataset[v_name]
-          split_data=v.splitted(sep)
-          factors=split_data.flatten.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac}
-          h[v_name]=factors
-          h
-        }
-        YAML.dump(pro_hash,io)
+  module Codification
+    class << self
+      # Create a hash, based on vectors, to create the dictionary.
+      # The keys will be vectors name on dataset and the values
+      # will be hashes, with keys = values, for recodification
+      def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
+        raise ArgumentError,"Array should't be empty" if vectors.size==0
+        pro_hash=vectors.inject({}){|h,v_name|
+          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
+          v=dataset[v_name]
+          split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
+
+          factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
+          h[v_name]=factors
+          h
+        }
+        pro_hash
+      end
+      # Create a yaml to create a dictionary, based on vectors
+      # The keys will be vectors name on dataset and the values
+      # will be hashes, with keys = values, for recodification
+      #
+      #   v1=%w{a,b b,c d}.to_vector
+      #   ds={"v1"=>v1}.to_dataset
+      #   Statsample::Codification.create_yaml(ds,['v1'])
+      #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
+      def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
+        pro_hash=create_hash(dataset, vectors, sep)
+        YAML.dump(pro_hash,io)
+      end
+      # Create a excel to create a dictionary, based on vectors.
+      # Raises an error if filename exists
+      # The rows will be:
+      # * field: name of vector
+      # * original: original name
+      # * recoded: new code
+
+      def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
+        require 'spreadsheet'
+        if File.exists?(filename)
+          raise "Exists a file named #{filename}. Delete ir before overwrite."
+        end
+        book = Spreadsheet::Workbook.new
+        sheet = book.create_worksheet
+        sheet.row(0).concat(%w{field original recoded})
+        i=1
+        create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
+          inner_hash.sort.each do |k,v|
+            sheet.row(i).concat([field.dup,k.dup,v.dup])
+            i+=1
+          end
+        end
+        book.write(filename)
+      end
+      # From a excel generates a dictionary hash
+      # to use on recode_dataset_simple!() or recode_dataset_split!().
+      #
+      def excel_to_recoded_hash(filename)
+        require 'spreadsheet'
+        h={}
+        book = Spreadsheet.open filename
+        sheet= book.worksheet 0
+        row_i=0
+        sheet.each do |row|
+          row_i+=1
+          next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
+          h[row[0]]={} if h[row[0]].nil?
+          h[row[0]][row[1]]=row[2]
+        end
+        h
+      end
+
+      def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
+        h.inject({}) do |a,v|
+          v[1].split(sep).each do |val|
+            a[val]||=[]
+            a[val].push(v[0])
+          end
+          a
+        end
+      end
+
+      def dictionary(h, sep=Statsample::SPLIT_TOKEN)
+        h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
+      end
+
+      def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
+        dict=dictionary(h,sep)
+        new_data=v.splitted(sep)
+        recoded=new_data.collect do |c|
+          if c.nil?
+            nil
+          else
+            c.collect{|value| dict[value] }.flatten.uniq
+          end
+        end
+      end
+      def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
+        _recode_dataset(dataset,dictionary_hash ,sep,false)
+      end
+      def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
+        _recode_dataset(dataset, dictionary_hash, sep,true)
+      end
+
+      def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
+        v_names||=h.keys
+        v_names.each do |v_name|
+          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
+          recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
+            if c.nil?
+              nil
+            else
+              c.join(sep)
+            end
+          }.to_vector
+          if(split)
+            recoded.split_by_separator(sep).each {|k,v|
+              dataset[v_name+"_"+k]=v
+            }
+          else
+            dataset[v_name+"_recoded"]=recoded
       end
-      def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
-        h.inject({}) {|a,v|
-          v[1].split(sep).each {|val|
-            a[val]||=[]
-            a[val].push(v[0])
-          }
-          a
-        }
-      end
-      def dictionary(h,sep=Statsample::SPLIT_TOKEN)
-        h.inject({}) {|a,v|
-          a[v[0]]=v[1].split(sep)
-          a
-        }
-      end
-      def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
-        dict=dictionary(h,sep)
-        new_data=v.splitted(sep)
-        recoded=new_data.collect{|c|
-          if c.nil?
-            nil
-          else
-            c.collect{|value|
-              dict[value]
-            }.flatten.uniq
-          end
-        }
-      end
-      def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
-        _recode_dataset(dataset,yaml,sep,false)
-      end
-      def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
-        _recode_dataset(dataset,yaml,sep,true)
-      end
-
-      def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
-        h=YAML::load(yaml)
-        v_names||=h.keys
-        v_names.each do |v_name|
-          raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
-          recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
-            if c.nil?
-              nil
-            else
-              c.join(sep)
-            end
-          }.to_vector
-          if(split)
-            recoded.split_by_separator(sep).each {|k,v|
-              dataset[v_name+"_"+k]=v
-            }
-          else
-            dataset[v_name+"_recoded"]=recoded
-          end
-        end
-      end
-      def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
-        require 'pp'
-        h=YAML::load(yaml)
-        v_names||=h.keys
-        v_names.each{|v_name|
-          inverse=inverse_hash(h[v_name],sep)
-          io.puts "Vector: #{v_name}"
-          YAML.dump(inverse.sort,io)
-        }
-      end
+        end
      end
+
+
+      def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
+        require 'pp'
+        v_names||=h.keys
+        v_names.each{|v_name|
+          inverse=inverse_hash(h[v_name],sep)
+          io.puts "- Field: #{v_name}"
+          inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
+            io.puts "  - \"#{k}\" (#{v.count}) :\n    -'"+v.join("\n    -'")+"'"
+          }
+        }
+      end
    end
+  end
 end
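
What the diff amounts to: 0.6.0 extracts the hash-building logic of create_yaml into the new create_hash, and the recode/verify methods now take that dictionary Hash directly instead of a YAML string (in 0.5.1, _recode_dataset and verify called YAML::load internally). Note also that create_yaml's trailing parameters swapped order, from (sep, io) to (io, sep). A minimal migration sketch against the 0.6.0 signatures, reusing the ds/v1 names from the rdoc example; the "alpha" recoding and the dict.yaml path are illustrative only:

    require 'statsample'
    require 'yaml'

    v1 = %w{a,b b,c d}.to_vector
    ds = {"v1" => v1}.to_dataset

    # 0.6.0: build the dictionary as a plain Hash and edit it in place
    dict = Statsample::Codification.create_hash(ds, ['v1'])
    dict["v1"]["a"] = "alpha"                   # hypothetical manual recoding

    # the recoders now take the Hash itself, not a YAML string
    Statsample::Codification.recode_dataset_split!(ds, dict)

    # callers that kept YAML dictionaries must load them themselves now
    dict2 = YAML::load(File.read("dict.yaml"))  # "dict.yaml" is illustrative
    Statsample::Codification.recode_dataset_simple!(ds, dict2)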
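
The other addition is an Excel round trip built on the spreadsheet gem: create_excel writes the dictionary to a three-column sheet (field, original, recoded) and raises if the file already exists, while excel_to_recoded_hash reads such a sheet back into the Hash the recoders expect, skipping the header row and incomplete rows. A sketch of the intended workflow, continuing from the ds above; "codes.xls" is a hypothetical filename:

    require 'statsample'

    # dump field/original/recoded rows for hand editing
    Statsample::Codification.create_excel(ds, ['v1'], "codes.xls")

    # ...edit the "recoded" column in a spreadsheet program...

    dict = Statsample::Codification.excel_to_recoded_hash("codes.xls")
    Statsample::Codification.recode_dataset_simple!(ds, dict)

    # verify takes the Hash too; per field it lists each recoded value
    # with the original codes mapping to it, most frequent first
    Statsample::Codification.verify(dict)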