require "numru/gphys"
module NumRu
module GAnalysis
# Library for function fitting
#
module Fitting
# Predifined functions for convenience
# predefined Proc for fitting: Polynomial x (function of the 1st dim)
X = proc {|*args|
raise(ArgumentError,"# of arge must be >= 1") if args.length==0
x = args[0]
self.ensure_1D_NArray(x, 0)
rank = args.length
f = x.dup
(rank-1).times{f.newdim!(-1)} # f.rank becomes the number of arguments
f
}
# predefined Proc for fitting: Polynomial x**2 (function of the 1st dim)
XX = proc {|*args|
raise(ArgumentError,"# of arge must be >= 1") if args.length==0
x = args[0]
self.ensure_1D_NArray(x, 0)
rank = args.length
f = x*x
(rank-1).times{f.newdim!(-1)} # f.rank becomes the number of arguments
f
}
# predefined Proc for fitting: Polynomial y (function of the 2nd dim)
Y = proc {|*args|
raise(ArgumentError,"# of arge must be >= 2") if args.length < 2
y = args[1]
self.ensure_1D_NArray(y, 1)
rank = args.length
f = y.dup
f.newdim!(0)
(rank-2).times{f.newdim!(-1)} # f.rank becomes the number of arguments
f
}
# predefined Proc for fitting: Polynomial y**2 (function of the 2nd dim)
YY = proc {|*args|
raise(ArgumentError,"# of arge must be >= 2") if args.length < 2
y = args[1]
self.ensure_1D_NArray(y, 1)
rank = args.length
f = y*y
f.newdim!(0)
(rank-2).times{f.newdim!(-1)} # f.rank becomes the number of arguments
f
}
# predefined Proc for fitting: Polynomial x*y (function of the 1st&2nd dims)
XY = proc {|*args|
raise(ArgumentError,"# of arge must be >= 2") if args.length < 2
x = args[0]
y = args[1]
self.ensure_1D_NArray(x, 0)
self.ensure_1D_NArray(y, 1)
rank = args.length
f = x.newdim(-1) * y.newdim(0)
(rank-2).times{f.newdim!(-1)} # f.rank becomes the number of arguments
f
}
@@unity = proc {|*args|
rank = args.length
f = NArray.sfloat(1).fill!(1.0) # will be coersed to float when needed
(rank-1).times{f.newdim!(-1)}
f
}
module_function
# Least square fit of a linear combination of any functions (basic NArray version).
#
# === ARGUMENTS
# * +data+ [NArray or NArrayMiss] multi-D data to fit
# * +grid_locs+ [Array of 1D NArrays] Grid points of independent variables
# (so grid_locs.length == the # of independent variables).
# * +functions+ [Array of Procs] Proc objects to represent the functions,
# which accept the elements of +grid_locs+ as the arguments (so the
# number of arguments fed is equal to the length of +grid_locs+).
# * +ensemble_dims+ (optional) [nil (defualt) or Array of Integers]
# When grid_locs.length < data.rank,
# this argument can be used to specify the dimensions that are
# not included in grid_locs and are used for ensemble averaging
# * +indep_dims+ (optional) [nil (defualt) or Array of Integers]
# When grid_locs.length < data.rank,
# this argument can be used to specify the dimensions that are
# not included in +grid_locs+ and are treated as independent, so
# the fitting is made for each of their component.
#
# Note that the sum of the lengths of +grid_locs+, +ensemble_dims+ and
# +indep_dims+ must be equal to the rank (# of dims) of +data+.
#
# === RETURN VALUES
# [ c, bf, diff ]
# where
# * +c+ is a NArray containing the coefficients of the functions
# and the constant offset; its length is one greater than the
# number of +functions+ because of the offset.
# It is 1D unless the +indep_dims+ argument is used
# (see the examples below).
# * +bf+ is a NArray having the best fit grid point values.
# Its rank is equal to data.rank, but the lengths along
# +ensemble_dims+ are simply 1.
# * rms of the difference between the data and best fit
#
# === EXAMPLES
# * Simple 1D case
#
# Line fitting:
#
# nx = 5
# x = NArray.float(nx).indgen! - nx/2
# data = x + x*x*0.1
# c, bf = GAnalysis::Fitting.least_square_fit(data, [x],
# [GAnalysis::Fitting::X])
# p "data:", data, "c:", c, "bf:", bf
#
# Here, +GAnalysis::Fitting::X+ is a predefined Proc to represent
# the first order polynomial x. The data values given as above follow
# f(x) = x + x**2/10. Then the result printed by the last line is
# "data:"
# NArray.float(5):
# [ -1.6, -0.9, 0.0, 1.1, 2.4 ]
# "c:"
# NArray.float(2):
# [ 1.0, 0.2 ]
# "bf:"
# NArray.float(5):
# [ -1.8, -0.8, 0.2, 1.2, 2.2 ]
# The +c+ values indicate that the fitting result is f(x) = 1.0*x + 0.2,
# and the +bf+ values are its grid point values.
#
# Parabolic fitting:
#
# You can also fit the data by 2nd order polynomial as
# c, bf = GAnalysis::Fitting.least_square_fit(data, [x],
# [GAnalysis::Fitting::XX,GAnalysis::Fitting::X])
# Then the result will be
# p c #--> [0.1, 1.0, 0.0]
# which indicates the original 2nd order polynomial 0.1 x**2 + x,
# so it follows data == bf (except for round-off error if any).
#
# * 1D fitting of multi-D data (ensemble case)
#
# Suppose you have a 2D NArray (or NArrayMiss) data, in which
# the 1st dim represents x and the 2nd dim represents something
# else (such as time sequence, or just a simple ensemble).
# If you want to use the entire data to get a single fit,
# use the +ensemble_dims+ argument to specify the non-x dimension(s).
# You can fit the data, for example, by
# p*sin(x) + q*cos(x) + r as follows:
#
# sin = proc{|x| NMath.sin(x)}
# cos = proc{|x| NMath.cos(x)}
# c, bf = GAnalysis::Fitting.least_square_fit(data, [x],
# [sin, cos], [1])
# Here, the last parameter [1] is given as the arguemnt
# +ensemble_dims+ to express that the dimension 1
# (2nd dimension) of +data+ is the ensemble dimension, so the x
# coordinate is the remaining dimension 0 (1st dimension). The
# coefficients of the functions are returned by
# the 1st return value as a NArray, so
# p = c[0]
# q = c[1]
# r = c[2]
#
# * 1D fitting of multi-D data (individual fitting)
#
# Suppose you have the same data as above, but
# you want to fit it for each of the 2nd dim elements. You can
# do it as follows:
#
# c, bf = GAnalysis::Fitting.least_square_fit(data, [x],
# [sin, cos], nil, [1])
#
# Here, +nil+ is given as the 4th argument (+ensemble_dims+)
# and [1] is given as the fifth (+indep_dims+).
# In this case, the return value +c+ is 2-dimensional; the
# first being the coefficients as above and the second representing
# the non-x (i.e., the second) dim of +data+.
#
# * 2D fitting
#
# It can be done like
#
# cosx = proc {|x,y| NMath.cos(x).newdim!(-1)}
# sinx = proc {|x,y| NMath.sin(x).newdim!(-1)}
# cosy = proc {|x,y| NMath.cos(y).newdim!(0)}
# siny = proc {|x,y| NMath.sin(y).newdim!(0)}
# c, bf = GAnalysis::Fitting.least_square_fit(data4D, [x,y],
# [cosx, sinx, cosy, siny], [2,3])
# where +data4D+ is a 4D NArray, whose first and second dimensions
# (dimensions 0 and 1) represent x and y axis, respectively, and the
# 1D NArrays +x+ and +y+ are the grid points.
# Note that the functions (+cosx+ etc) accept 2 arguments (x and y),
# and they use NArray's +newdim+ method to return 2D NArray
# (newdim!(-1) inserts a 1-element dim to the end, and
# newdim(0) inserts a 1-element dim to the beginning).
#
# TYPICAL ERRORS
# * Error is raised (from the LU decomposition), if the problem
# cannot be solved. That happens if you specify a same function twice
# (redundantly) in the +functions+ argument, as a matter of course.
# * Error is raised if the number of data is insuffcient for the
# number of functions (also unsolvable).
#
def least_square_fit(data, grid_locs, functions, ensemble_dims=nil,
indep_dims=nil, with_offset=true)
#< argument check >
grid_locs.each_with_index{|x,i| self.ensure_1D_NArray(x, i)}
functions.each{|f| raise("Found non-Proc arg") if !f.is_a?(Proc)}
if with_offset
functions = functions + [@@unity] # constanf offset
end
ng = grid_locs.length
rank = data.rank
ensemble_dims = [ ensemble_dims ] if ensemble_dims.is_a?(Integer)
indep_dims = [ indep_dims ] if indep_dims.is_a?(Integer)
ensemble_dims = Array.new if ensemble_dims.nil? # --> always an Array
n_indep = ( indep_dims ? indep_dims.length : 0 )
if ng < rank
ensemble_dims = ensemble_dims.collect{|d|
if d<-rank || d>=rank
raise "Invalid ensemble_dims value (#{d}) for rank #{rank} NArray"
end
d += rank if d<0
d
}
ensemble_dims.sort!
if indep_dims
indep_dims = indep_dims.collect{|d|
if d<-rank || d>=rank
raise "Invalid indep_dims value (#{d}) for rank #{rank} NArray"
end
d += rank if d<0
d
}
indep_dims.sort!
end
elsif ng > rank
raise "# of grid_locs (#{ng}) > data.rank (#{rank})"
end
if data.rank != ng + ensemble_dims.length + n_indep
raise ArgumentError,
"lengths of grid_locs, ensemble_dims and indep_dims != data.rank"
end
otherdims = ensemble_dims
if indep_dims
otherdims += indep_dims
otherdims.sort!.uniq!
if otherdims.length != ensemble_dims.length + n_indep
raise ArgumentError, "Overlap in ensemble_dims and indep_dims"
end
end
#< pre-process data >
d0 = data.mean
data = data - d0 # constant offset for numerical stability
if data.is_a?(NArrayMiss)
mask = data.get_mask
elsif data.is_a?(NArray)
mask = nil # NArray.byte(*data.shape).fill!(1)
else
raise "Data type (#{data.class}) is not NArray or NArrayMiss"
end
#< derive the matrix >
fv = functions.collect{|f|
f = f[*grid_locs]
otherdims.each{|d| f.newdim!(d)}
f
}
ms = fv.length # matrix size
if ( (len=data.length) < ms )
raise "Insufficient data length (#{len}) for the # of funcs+1 (#{ms})"
end
mat = NMatrix.float(ms,ms) # wil be symmetric
for i in 0...ms
for j in 0..i
if mask
fvij = NArrayMiss.to_nam( fv[i] * fv[j] * mask, mask )
mat[i,j] = (fvij).mean
else
mat[i,j] = (fv[i] * fv[j]).mean
end
end
end
for i in 0...ms
for j in i+1...ms
mat[i,j] = mat[j,i] # symmetric
end
end
#p "*** mat ***",mat
lu = mat.lu
#< derive the vector, solve, and best fit >
unless indep_dims # fitting only once
# derive the vector
b = NVector.float(ms)
for i in 0...ms
b[i] = (data * fv[i]).mean
end
# solve
c = lu.solve(b)
c[-1] += d0 # add the mean subtracted
# convert c from NVector to NArray (just for cleanliness)
na = NArray.float(ms)
na[true] = c[true]
c = na
# best fit
if with_offset
bf = c[-1] # the constant offset
for i in 0...ms-1
bf += c[i]*fv[i]
end
else
bf = 0.0
for i in 0...ms
bf += c[i]*fv[i]
end
end
else # fitting multiple times
# derive vectors
idshp = indep_dims.collect{|d| data.shape[d]}
bs = NArray.float(ms,*idshp)
meandims = (0...rank).collect{|d| d} - indep_dims
for i in 0...ms
bsi = (data * fv[i]).mean(*meandims)
if bsi.is_a?(NArrayMiss)
if bsi.count_invalid > 0
raise("Found invalid data everywhere along indep_dims. Trim data in advance and try again.")
end
bsi = bsi.to_na
end
bs[i,false] = bsi
end
idlen = 1
idshp.each{|l| idlen *= l}
# solve
bs = bs.reshape(ms, idlen)
c = NArray.float(ms,idlen)
b = NVector.float(ms)
for id in 0...idlen
b[true] = bs[true,id]
c[true,id] = lu.solve(b)
end
c[-1,true] += d0
c = c.reshape(ms, *idshp)
# best fit
idshp_full = Array.new
for d in 0...rank
if indep_dims.include?(d)
idshp_full[d] = data.shape[d]
else
idshp_full[d] = 1
end
end
cs = c.reshape(ms, *idshp_full)
if with_offset
bf = cs[-1,false]
for i in 0...ms-1
bf += cs[i,false]*fv[i]
end
else
bf = cs[-1,false] * 0
for i in 0...ms
bf += cs[i,false]*fv[i]
end
end
end
diff = Math.sqrt( ( (data + d0 - bf)**2 ).mean )
#< return >
[ c, bf, diff ]
end
################################################
# For internal usage
private
def self.ensure_1D_NArray(na, ith)
raise("proc argument #{ith}: not a NArray") if !na.is_a?(NArray)
raise("proc argument #{ith}: not 1 dimensional") if na.rank != 1
nil
end
end
end
# GPhys extension with GAnalysis::Fitting
class GPhys
# Least square fit of a linear combination of any functions (GPhys version).
#
# This method calls GAnalysis::Fitting.least_square_fit in
# the GAnalysis::Fitting module.
# See its document for the details, usage, and predifined functions.
#
# === ARGUMENTS
#
# The arguments are the same as the third to fifth arguemnts of
# GAnalysis::Fitting.least_square_fit except that +ensemble_dims+
# and +indep_dims+ accept dimension specification by names (in Strings).
#
# * +functions+ [Array of Procs] Proc objects to represent the functions,
# which accept the elements of +grid_locs+ as the arguments (so the
# number of arguments fed is equal to the length of +grid_locs+).
# (Some predifined functions are available in GAnalysis::Fitting).
# * +ensemble_dims+ (optional) [nil (defualt) or Array of Integers or Strings]
# When grid_locs.length < data.rank,
# this argument can be used to specify the dimensions that are
# not included in grid_locs and are used for ensemble averaging
# * +indep_dims+ (optional) [nil (defualt) or Array of Integers or Strings]
# When grid_locs.length < data.rank,
# this argument can be used to specify the dimensions that are
# not included in +grid_locs+ and are treated as independent, so
# the fitting is made for each of their component.
#
# === RETURN VALUES
# [ c, bf, diff ]
# where
# * +c+ is a NArray containing the coefficients of the functions
# and the constant offset; its length is one greater than the
# number of +functions+ because of the offset.
# It is 1D unless the +indep_dims+ argument is used
# (see the examples below).
# * +bf+ is a GPhys having the best fit grid point values.
# Its rank is equal to data.rank unless ensemble_dims
# are given; ensemble_dims are deleted unlike the return
# value of GAnalysis::Fitting.least_square_fit.
# * rms of the difference between the data and best fit
#
# === USAGE
# See GAnalysis::Fitting.least_square_fit.
def least_square_fit(functions, ensemble_dims=nil, indep_dims=nil)
#< preparation >
no_fitting_dims = Array.new
if ensemble_dims
ensemble_dims = ensemble_dims.collect{|d| @grid.dim_index(d)}
no_fitting_dims += ensemble_dims
end
if indep_dims
indep_dims = indep_dims.collect{|d| @grid.dim_index(d)}
no_fitting_dims += indep_dims
end
fitting_dims = (0...rank).collect{|i| i} - no_fitting_dims
grid_locs = fitting_dims.collect{|d| coord(d).val}
data = self.val
#< fitting >
c, bf, diff = GAnalysis::Fitting.least_square_fit(data, grid_locs,
functions, ensemble_dims, indep_dims)
#< make a GPhys of the best fit >
if !ensemble_dims
grid = self.grid
else
axes = Array.new
(0...rank).each{|d|
axes.push(self.axis(d)) unless ensemble_dims.include?(d)
}
grid = Grid.new(*axes)
shape = bf.shape
ensemble_dims.sort.reverse_each{|d| shape.delete_at(d)}
bf = bf.reshape(*shape)
end
va = VArray.new(bf, self.data, self.name)
bf = GPhys.new(grid, va)
[c, bf, diff]
end
end
end
######################################################
if $0 == __FILE__
include NumRu
nx = 7
ny = 5
x = NArray.float(nx).indgen!
y = NArray.float(ny).indgen! - 1
#p GAnalysis::Fitting::X[x]
#p GAnalysis::Fitting::X[x,nil,nil]
#p GAnalysis::Fitting::X[x,y]
#p GAnalysis::Fitting::XX[x,y]
#p GAnalysis::Fitting::Y[x,y]
#p GAnalysis::Fitting::YY[x,y]
#p GAnalysis::Fitting::XY[x,y,nil]
#exit
xx = x.newdim(-1)
yy = y.newdim(0)
data = xx + 2*yy + 100
data += data.random * 0.1
p "***data**", data
f_x = GAnalysis::Fitting::X
f_y = GAnalysis::Fitting::Y
p GAnalysis::Fitting.least_square_fit(data, [x,y], [f_x, f_y])
data2 = NArray.float(nx,2,ny)
data2[true,0,true] = data - 1
data2[true,1,true] = data + 1
p GAnalysis::Fitting.least_square_fit(data2, [x,y], [f_x, f_y], [1])
nx = 5
x = NArray.float(nx).indgen! - nx/2
data = x + x*x*0.1
c, bf, diff = GAnalysis::Fitting.least_square_fit(data, [x],
[GAnalysis::Fitting::X])
p "data:", data, "c:", c, "bf:", bf
exit
c, bf, diff = GAnalysis::Fitting.least_square_fit(data, [x],
[GAnalysis::Fitting::X,GAnalysis::Fitting::XX])
p c
xx = x.newdim(-1)
data = xx + 2*yy + 100
cosx = proc {|x,y| NMath.cos(x).newdim!(-1)}
sinx = proc {|x,y| NMath.sin(x).newdim!(-1)}
cosy = proc {|x,y| NMath.cos(y).newdim!(0)}
siny = proc {|x,y| NMath.sin(y).newdim!(0)}
p GAnalysis::Fitting.least_square_fit(data, [x,y], [cosx, sinx, cosy, siny])
end