require 'statsample/converter/spss'
module Statsample
# Create and dumps Datasets on a database
module Database
class << self
# Read a database query and returns a Dataset
#
# USE:
#
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# Statsample.read(dbh, "SELECT * FROM test")
#
def read(dbh,query)
require 'dbi'
sth=dbh.execute(query)
vectors={}
fields=[]
sth.column_info.each {|c|
vectors[c['name']]=Statsample::Vector.new([])
vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
fields.push(c['name'])
}
ds=Statsample::Dataset.new(vectors,fields)
sth.fetch do |row|
ds.add_case(row.to_a, false )
end
ds.update_valid_data
ds
end
# Insert each case of the Dataset on the selected table
#
# USE:
#
# ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# Statsample::Database.insert(ds,dbh,"test")
#
def insert(ds, dbh,table)
require 'dbi'
query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
sth=dbh.prepare(query)
ds.each_array{|c| sth.execute(*c) }
return true
end
# Create a sql, basen on a given Dataset
#
# USE:
#
# ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
# Statsample::Database.create_sql(ds,'names')
# ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
#
def create_sql(ds,table,charset="UTF8")
sql="CREATE TABLE #{table} ("
fields=ds.fields.collect{|f|
v=ds[f]
f+" "+v.db_type
}
sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
end
end
end
module Mondrian
class << self
def write(dataset,filename)
File.open(filename,"wb") do |fp|
fp.puts dataset.fields.join("\t")
dataset.each_array_with_nils do |row|
row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
fp.puts row2.join("\t")
end
end
end
end
end
class SpreadsheetBase
class << self
def extract_fields(row)
=begin
fields=[]
row.to_a.collect {|c|
if c.nil?
break
else
fields.push(c)
end
}
=end
raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all {|c| c.nil?}.size>0
fields=row.to_a.collect{|c| c.downcase}
fields.recode_repeated
end
def process_row(row,empty)
row.to_a.collect do |c|
if empty.include?(c)
nil
else
if c.is_a? String and c.is_number?
if c=~/^\d+$/
c.to_i
else
c.gsub(",",".").to_f
end
else
c
end
end
end
end
def convert_to_scale_and_date(ds,fields)
fields.each do |f|
if ds[f].can_be_scale?
ds[f].type=:scale
elsif ds[f].can_be_date?
ds[f].type=:date
end
end
end
end
end
class PlainText < SpreadsheetBase
class << self
def read(filename, fields)
ds=Statsample::Dataset.new(fields)
fp=File.open(filename,"r")
fp.each_line do |line|
row=process_row(line.strip.split(/\s+/),[""])
next if row==["\x1A"]
ds.add_case_array(row)
end
convert_to_scale_and_date(ds,fields)
ds.update_valid_data
ds
end
end
end
class Excel < SpreadsheetBase
class << self
# Write a Excel spreadsheet based on a dataset
# * TODO: Format nicely date values
def write(dataset,filename)
require 'spreadsheet'
book = Spreadsheet::Workbook.new
sheet = book.create_worksheet
format = Spreadsheet::Format.new :color => :blue,
:weight => :bold
sheet.row(0).concat(dataset.fields)
sheet.row(0).default_format = format
i=1
dataset.each_array{|row|
sheet.row(i).concat(row)
i+=1
}
book.write(filename)
end
# This should be fixed.
# If we have a Formula, should be resolver first
def preprocess_row(row, dates)
i=-1
row.collect!{|c|
i+=1
if c.is_a? Spreadsheet::Formula
if(c.value.is_a? Spreadsheet::Excel::Error)
nil
else
c.value
end
elsif dates.include? i and !c.nil? and c.is_a? Numeric
row.date(i)
else
c
end
}
end
private :process_row
# Returns a dataset based on a xls file
# USE:
# ds = Statsample::Excel.read("test.xls")
#
def read(filename, worksheet_id=0, ignore_lines=0, empty=[''])
require 'spreadsheet'
first_row=true
fields=[]
fields_data={}
ds=nil
line_number=0
book = Spreadsheet.open filename
sheet= book.worksheet worksheet_id
sheet.each do |row|
begin
dates=[]
row.formats.each_index{|i|
if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
dates.push(i)
end
}
line_number+=1
next if(line_number<=ignore_lines)
preprocess_row(row,dates)
if first_row
fields=extract_fields(row)
ds=Statsample::Dataset.new(fields)
first_row=false
else
rowa=process_row(row,empty)
(fields.size - rowa.size).times {
rowa << nil
}
ds.add_case(rowa,false)
end
rescue => e
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
raise
end
end
convert_to_scale_and_date(ds, fields)
ds.update_valid_data
ds
end
end
end
module Mx
class << self
def write(dataset,filename,type=:covariance)
puts "Writing MX File"
File.open(filename,"w") do |fp|
fp.puts "! #{filename}"
fp.puts "! Output generated by Statsample"
fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
fp.puts "Labels "+dataset.fields.join(" ")
case type
when :raw
fp.puts "Rectangular"
dataset.each do |row|
out=dataset.fields.collect do |f|
if dataset[f].is_valid? row[f]
row[f]
else
"."
end
end
fp.puts out.join("\t")
end
fp.puts "End Rectangular"
when :covariance
fp.puts " CMatrix Full"
cm=Statsample::Bivariate.covariance_matrix(dataset)
d=(0...(cm.row_size)).collect {|row|
(0...(cm.column_size)).collect{|col|
cm[row,col].nil? ? "." : sprintf("%0.3f", cm[row,col])
}.join(" ")
}.join("\n")
fp.puts d
end
end
end
end
end
module GGobi
class << self
def write(dataset,filename,opt={})
File.open(filename,"w") {|fp|
fp.write(self.out(dataset,opt))
}
end
def out(dataset,opt={})
require 'ostruct'
default_opt = {:dataname => "Default", :description=>"", :missing=>"NA"}
default_opt.merge! opt
carrier=OpenStruct.new
carrier.categorials=[]
carrier.conversions={}
variables_def=dataset.fields.collect{|k|
variable_definition(carrier,dataset[k],k)
}.join("\n")
indexes=carrier.categorials.inject({}) {|s,c|
s[dataset.fields.index(c)]=c
s
}
records=""
dataset.each_array {|c|
indexes.each{|ik,iv|
c[ik]=carrier.conversions[iv][c[ik]]
}
records << "#{values_definition(c, default_opt[:missing])}\n"
}
out=<
#{default_opt[:description]}
#{variables_def}
#{records}
EOC
out
end
def values_definition(c,missing)
c.collect{|v|
if v.nil?
"#{missing}"
elsif v.is_a? Numeric
"#{v}"
else
"#{v.gsub(/\s+/,"_")}"
end
}.join(" ")
end
# Outputs a string for a variable definition
# v = vector
# name = name of the variable
# nickname = nickname
def variable_definition(carrier,v,name,nickname=nil)
nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
if v.type==:nominal or v.data.find {|d| d.is_a? String }
carrier.categorials.push(name)
carrier.conversions[name]={}
factors=v.factors
out ="\n"
out << "\n"
out << (1..factors.size).to_a.collect{|i|
carrier.conversions[name][factors[i-1]]=i
"#{v.labeling(factors[i-1])}"
}.join("\n")
out << "\n\n"
out
elsif v.data.find {|d| d.is_a? Float}
""
else
""
end
end
end
end
end
if RUBY_VERSION<"1.9"
require 'statsample/converter/csv18.rb'
else
require 'statsample/converter/csv19.rb'
end