lib/statsample/converters.rb in statsample-1.5.0 vs lib/statsample/converters.rb in statsample-2.0.0
- old
+ new
@@ -1,262 +1,102 @@
require 'statsample/converter/spss'
module Statsample
- # Create and dumps Datasets on a database
+ # Create and dumps Datasets on a database
+ #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame.from_sql and Daru::DataFrame#write_sql.
module Database
class << self
# Reads a database query and returns a Dataset
#
- # USE:
- #
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
- # Statsample.read(dbh, "SELECT * FROM test")
- #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame.from_sql instead.
def read(dbh,query)
- require 'dbi'
- sth=dbh.execute(query)
- vectors={}
- fields=[]
- sth.column_info.each {|c|
- vectors[c['name']]=Statsample::Vector.new([])
- vectors[c['name']].name=c['name']
- vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :numeric : :object
- fields.push(c['name'])
- }
- ds=Statsample::Dataset.new(vectors,fields)
- sth.fetch do |row|
- ds.add_case(row.to_a, false )
- end
- ds.update_valid_data
- ds
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_sql instead."
end
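For migration, the old USE example maps onto daru roughly as follows (a minimal sketch: the DBI connection string, credentials and query are placeholders, and it assumes Daru::DataFrame.from_sql accepts a DBI handle the way Statsample::Database.read did):

    require 'dbi'
    require 'daru'

    # Placeholder connection details, not a real configuration
    dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
    df  = Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")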
+
# Insert each case of the Dataset into the selected table
#
- # USE:
- #
- # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
- # Statsample::Database.insert(ds,dbh,"test")
- #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame#write_sql instead.
def insert(ds, dbh, table)
- require 'dbi'
- query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
- sth=dbh.prepare(query)
- ds.each_array{|c| sth.execute(*c) }
- return true
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_sql instead."
end
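The replacement for Statsample::Database.insert is an instance method on the data frame itself. A minimal sketch, with made-up data and placeholder connection details:

    require 'dbi'
    require 'daru'

    df  = Daru::DataFrame.new('id' => [1, 2, 3], 'name' => %w[a b c])
    dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
    df.write_sql(dbh, 'test')  # inserts one row per case into the 'test' table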
# Create a SQL statement based on a given Dataset
#
- # USE:
- #
- # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
- # Statsample::Database.create_sql(ds,'names')
- # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
- #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame#create_sql instead.
def create_sql(ds,table,charset="UTF8")
- sql="CREATE TABLE #{table} ("
- fields=ds.fields.collect{|f|
- v=ds[f]
- f+" "+v.db_type
- }
- sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#create_sql instead."
end
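The old create_sql example carries over to daru's instance method in the same shape (a sketch; the column types in the generated statement depend on daru's type detection):

    require 'daru'

    df = Daru::DataFrame.new('id'   => [1, 2, 3, 4, 5],
                             'name' => %w[Alex Peter Susan Mary John])
    df.create_sql('names')
    # expected to return a CREATE TABLE statement much like the old example above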
end
end
module Mondrian
class << self
def write(dataset,filename)
File.open(filename,"wb") do |fp|
- fp.puts dataset.fields.join("\t")
- dataset.each_array_with_nils do |row|
- row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
+ fp.puts dataset.vectors.to_a.join("\t")
+ dataset.each_row do |row|
+ row2 = row.map { |v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
fp.puts row2.join("\t")
end
end
end
end
end
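The Mondrian writer itself was kept, just ported to daru's iteration API (vectors / each_row). A usage sketch with made-up data, assuming the dataset passed in is now a Daru::DataFrame:

    require 'daru'

    df = Daru::DataFrame.new('id'   => [1, 2, nil],
                             'name' => ['Alex', 'Peter John', nil])
    Statsample::Mondrian.write(df, 'data.tab')
    # writes a tab-separated file; nils become "NA" and embedded whitespace becomes "_"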
- class SpreadsheetBase
+
+ class PlainText
class << self
- def extract_fields(row)
- i=0;
- fields=row.to_a.collect{|c|
- if c.nil?
- i+=1
- "var%05d" % i
- else
- c.to_s.downcase
- end
- }
- fields.recode_repeated
+ def read(filename, fields)
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_plaintext instead."
end
-
- def process_row(row,empty)
- row.to_a.map do |c|
- if empty.include?(c)
- nil
- else
- if c.is_a? String and c.is_number?
- if c=~/^\d+$/
- c.to_i
- else
- c.gsub(",",".").to_f
- end
- else
- c
- end
- end
- end
- end
- def convert_to_numeric_and_date(ds,fields)
- fields.each do |f|
- if ds[f].can_be_numeric?
- ds[f].type=:numeric
- elsif ds[f].can_be_date?
- ds[f].type=:date
- end
- end
- end
-
end
end
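The plain-text reader moved to daru as well. A minimal sketch (file name and field list are placeholders):

    require 'daru'

    # replaces Statsample::PlainText.read("data.txt", %w[v1 v2 v3])
    df = Daru::DataFrame.from_plaintext('data.txt', [:v1, :v2, :v3])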
- class PlainText < SpreadsheetBase
- class << self
- def read(filename, fields)
- ds=Statsample::Dataset.new(fields)
- fp=File.open(filename,"r")
- fp.each_line do |line|
- row=process_row(line.strip.split(/\s+/),[""])
- next if row==["\x1A"]
- ds.add_case_array(row)
- end
- convert_to_numeric_and_date(ds,fields)
- ds.update_valid_data
- fields.each {|f|
- ds[f].name=f
- }
- ds
- end
- end
- end
- class Excel < SpreadsheetBase
+
+ # This class has been DEPRECATED. Use Daru::DataFrame::from_excel and
+ # Daru::DataFrame#write_excel for XLS file operations.
+ class Excel
class << self
# Write an Excel spreadsheet based on a dataset
# * TODO: Format date values nicely
+ #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame#write_excel instead.
def write(dataset,filename)
- require 'spreadsheet'
- book = Spreadsheet::Workbook.new
- sheet = book.create_worksheet
- format = Spreadsheet::Format.new :color => :blue,
- :weight => :bold
- sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings
- sheet.row(0).default_format = format
- i=1
- dataset.each_array{|row|
- sheet.row(i).concat(row)
- i+=1
- }
- book.write(filename)
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_excel instead."
end
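Writing an XLS file is now done straight from the data frame (a sketch with made-up data; daru's Excel support relies on the spreadsheet gem being installed):

    require 'daru'

    df = Daru::DataFrame.new('id' => [1, 2, 3], 'name' => %w[a b c])
    df.write_excel('test.xls')  # replaces Statsample::Excel.write(ds, 'test.xls')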
- # This should be fixed.
- # If we have a Formula, should be resolver first
- def preprocess_row(row, dates)
- i=-1
- row.collect!{|c|
- i+=1
- if c.is_a? Spreadsheet::Formula
- if(c.value.is_a? Spreadsheet::Excel::Error)
- nil
- else
- c.value
- end
- elsif dates.include? i and !c.nil? and c.is_a? Numeric
- row.date(i)
- else
- c
- end
- }
- end
- private :process_row, :preprocess_row
-
# Returns a dataset based on a xls file
- # USE:
- # ds = Statsample::Excel.read("test.xls")
- #
+ #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame.from_excel instead.
def read(filename, opts=Hash.new)
- require 'spreadsheet'
- raise "options should be Hash" unless opts.is_a? Hash
- opts_default={
- :worksheet_id=>0,
- :ignore_lines=>0,
- :empty=>['']
- }
-
- opts=opts_default.merge opts
-
- worksheet_id=opts[:worksheet_id]
- ignore_lines=opts[:ignore_lines]
- empty=opts[:empty]
-
- first_row=true
- fields=[]
- ds=nil
- line_number=0
- book = Spreadsheet.open filename
- sheet= book.worksheet worksheet_id
- sheet.each do |row|
- begin
- dates=[]
- row.formats.each_index{|i|
- if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
- dates.push(i)
- end
- }
- line_number+=1
- next if(line_number<=ignore_lines)
-
- preprocess_row(row,dates)
- if first_row
- fields=extract_fields(row)
- ds=Statsample::Dataset.new(fields)
- first_row=false
- else
- rowa=process_row(row,empty)
- (fields.size - rowa.size).times {
- rowa << nil
- }
- ds.add_case(rowa,false)
- end
- rescue => e
- error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
- raise
- end
- end
- convert_to_numeric_and_date(ds, fields)
- ds.update_valid_data
- fields.each {|f|
- ds[f].name=f
- }
- ds.name=filename
- ds
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_excel instead."
end
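Reading an XLS file follows the same pattern (a sketch; the file name is a placeholder and the worksheet_id option is assumed to carry over from the old reader):

    require 'daru'

    # replaces ds = Statsample::Excel.read("test.xls")
    df = Daru::DataFrame.from_excel('test.xls', worksheet_id: 0)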
end
end
+
module Mx
class << self
def write(dataset,filename,type=:covariance)
puts "Writing MX File"
File.open(filename,"w") do |fp|
fp.puts "! #{filename}"
fp.puts "! Output generated by Statsample"
fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
- fp.puts "Labels "+dataset.fields.join(" ")
+ fp.puts "Labels " + dataset.vectors.to_a.join(" ")
case type
when :raw
fp.puts "Rectangular"
dataset.each do |row|
- out=dataset.fields.collect do |f|
+ out=dataset.vectors.to_a.collect do |f|
if dataset[f].is_valid? row[f]
row[f]
else
"."
end
@@ -290,22 +130,22 @@
default_opt = {:dataname => "Default", :description=>"", :missing=>"NA"}
default_opt.merge! opt
carrier=OpenStruct.new
carrier.categorials=[]
carrier.conversions={}
- variables_def=dataset.fields.collect{|k|
+ variables_def=dataset.vectors.to_a.collect{|k|
variable_definition(carrier,dataset[k],k)
}.join("\n")
indexes=carrier.categorials.inject({}) {|s,c|
- s[dataset.fields.index(c)]=c
+ s[dataset.vectors.to_a.index(c)]=c
s
}
records=""
- dataset.each_array {|c|
- indexes.each{|ik,iv|
- c[ik]=carrier.conversions[iv][c[ik]]
+ dataset.each_row {|c|
+ indexes.each { |ik,iv|
+ c[ik] = carrier.conversions[iv][c[ik]]
}
records << "<record>#{values_definition(c, default_opt[:missing])}</record>\n"
}
out=<<EOC
@@ -343,28 +183,27 @@
# v = vector
# name = name of the variable
# nickname = nickname
def variable_definition(carrier,v,name,nickname=nil)
nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
- if v.type==:object or v.data.find {|d| d.is_a? String }
+ if v.type==:object or v.to_a.find {|d| d.is_a? String }
carrier.categorials.push(name)
carrier.conversions[name]={}
factors=v.factors
out ="<categoricalvariable name=\"#{name}\" #{nickname}>\n"
out << "<levels count=\"#{factors.size}\">\n"
out << (1..factors.size).to_a.collect{|i|
carrier.conversions[name][factors[i-1]]=i
- "<level value=\"#{i}\">#{v.labeling(factors[i-1])}</level>"
+ "<level value=\"#{i}\">#{(v.labels[factors[i-1]] || factors[i-1])}</level>"
}.join("\n")
out << "</levels>\n</categoricalvariable>\n"
out
- elsif v.data.find {|d| d.is_a? Float}
+ elsif v.to_a.find {|d| d.is_a? Float}
"<realvariable name=\"#{name}\" #{nickname} />"
else
"<integervariable name=\"#{name}\" #{nickname} />"
end
end
-
end
end
end
require 'statsample/converter/csv.rb'