lib/crossvalidation.rb in lazar-0.0.7 vs lib/crossvalidation.rb in lazar-0.0.9
- lines from lazar-0.0.7 (removed)
+ lines from lazar-0.0.9 (added)
@@ -4,16 +4,62 @@
field :validation_ids, type: Array, default: []
field :model_id, type: BSON::ObjectId
field :folds, type: Integer
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ field :predictions, type: Array, default: []
field :finished_at, type: Time
def time
finished_at - created_at
end
+
+ def validations
+ validation_ids.collect{|vid| Validation.find vid}
+ end
+
+ def model
+ Model::Lazar.find model_id
+ end
+
+ def self.create model, n=10
+ klass = model.training_dataset.features.first.nominal? ? ClassificationCrossValidation : RegressionCrossValidation
+ bad_request_error "#{model.training_dataset.features.first} is neither nominal nor numeric." unless klass
+ cv = klass.new(
+ name: model.name,
+ model_id: model.id,
+ folds: n
+ )
+ cv.save # set created_at
+ nr_instances = 0
+ nr_unpredicted = 0
+ predictions = []
+ training_dataset = Dataset.find model.training_dataset_id
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
+ #fork do # parallel execution of validations
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+ t = Time.now
+ validation = Validation.create(model, fold[0], fold[1],cv)
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+ #end
+ end
+ #Process.waitall
+ cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
+ cv.validations.each do |validation|
+ nr_instances += validation.nr_instances
+ nr_unpredicted += validation.nr_unpredicted
+ predictions += validation.predictions
+ end
+ cv.update_attributes(
+ nr_instances: nr_instances,
+ nr_unpredicted: nr_unpredicted,
+ predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+ )
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+ cv.statistics
+ cv
+ end
end
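
# Editor's sketch (not part of either release): how the class-level factory above might
# be invoked. `model_id` is a placeholder for the id of an already trained lazar model.
require 'lazar'
include OpenTox
model = Model::Lazar.find model_id      # model_id: placeholder for a stored model's id
cv = CrossValidation.create model, 10   # 10-fold CV, returns a Classification- or RegressionCrossValidation
cv.validations.size                     # => 10, one Validation per fold
cv.time                                 # seconds between created_at and finished_at
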
class ClassificationCrossValidation < CrossValidation
field :accept_values, type: Array
@@ -21,177 +67,234 @@
field :weighted_confusion_matrix, type: Array
field :accuracy, type: Float
field :weighted_accuracy, type: Float
field :true_rate, type: Hash
field :predictivity, type: Hash
+ field :confidence_plot_id, type: BSON::ObjectId
# TODO auc, f-measure (usability??)
- def self.create model, n=10
- cv = self.new
- cv.save # set created_at
- validation_ids = []
- nr_instances = 0
- nr_unpredicted = 0
- predictions = []
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+ def statistics
accept_values = Feature.find(model.prediction_feature_id).accept_values
confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
true_rate = {}
predictivity = {}
- fold_nr = 1
- training_dataset = Dataset.find model.training_dataset_id
- training_dataset.folds(n).each do |fold|
- t = Time.now
- $logger.debug "Fold #{fold_nr}"
- validation = validation_class.create(model, fold[0], fold[1])
- validation_ids << validation.id
- nr_instances += validation.nr_instances
- nr_unpredicted += validation.nr_unpredicted
- predictions += validation.predictions
- validation.confusion_matrix.each_with_index do |r,i|
- r.each_with_index do |c,j|
- confusion_matrix[i][j] += c
- weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
+ predictions.each do |pred|
+ compound_id,activities,prediction,confidence = pred
+ if activities and prediction #and confidence.numeric?
+ if activities.uniq.size == 1
+ activity = activities.uniq.first
+ if prediction == activity
+ if prediction == accept_values[0]
+ confusion_matrix[0][0] += 1
+ #weighted_confusion_matrix[0][0] += confidence
+ elsif prediction == accept_values[1]
+ confusion_matrix[1][1] += 1
+ #weighted_confusion_matrix[1][1] += confidence
+ end
+ elsif prediction != activity
+ if prediction == accept_values[0]
+ confusion_matrix[0][1] += 1
+ #weighted_confusion_matrix[0][1] += confidence
+ elsif prediction == accept_values[1]
+ confusion_matrix[1][0] += 1
+ #weighted_confusion_matrix[1][0] += confidence
+ end
+ end
end
+ else
+ self.nr_unpredicted += 1 if prediction.nil? # self. is required: a bare += would create a nil local instead of updating the field
end
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
- fold_nr +=1
end
true_rate = {}
predictivity = {}
accept_values.each_with_index do |v,i|
true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
end
confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
- end
- end
- cv.update_attributes(
- name: model.name,
- model_id: model.id,
- folds: n,
- validation_ids: validation_ids,
- nr_instances: nr_instances,
- nr_unpredicted: nr_unpredicted,
+ #weighted_confusion_matrix.each do |r|
+ #r.each do |c|
+ #confidence_sum += c
+ #end
+ #end
+ update_attributes(
accept_values: accept_values,
confusion_matrix: confusion_matrix,
- weighted_confusion_matrix: weighted_confusion_matrix,
+ #weighted_confusion_matrix: weighted_confusion_matrix,
accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+ #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
true_rate: true_rate,
predictivity: predictivity,
- predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
finished_at: Time.now
)
- cv.save
- cv
+ $logger.debug "Accuracy #{accuracy}"
end
+ def confidence_plot
+ unless confidence_plot_id
+ tmpfile = "/tmp/#{id.to_s}_confidence.png"
+ accuracies = []
+ confidences = []
+ correct_predictions = 0
+ incorrect_predictions = 0
+ predictions.each do |p|
+ if p[1] and p[2]
+ p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+ confidences << p[3]
+
+ end
+ end
+ R.assign "accuracy", accuracies
+ R.assign "confidence", confidences
+ R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+ plot_id = $gridfs.insert_one(file)
+ update(:confidence_plot_id => plot_id)
+ end
+ $gridfs.find_one(_id: confidence_plot_id).data
+ end
+
#Average area under roc 0.646
#Area under roc 0.646
#F measure carcinogen: 0.769, noncarcinogen: 0.348
end
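
# Editor's sketch: reading the classification statistics and the new confidence plot.
# `cv_id` is a placeholder for the id of a finished ClassificationCrossValidation.
require 'lazar'
include OpenTox
cv = ClassificationCrossValidation.find cv_id   # cv_id: placeholder
cv.accuracy    # fraction of correct predictions among the predicted instances
cv.true_rate   # per-class true prediction rates
File.open("confidence.png","wb"){|f| f.write cv.confidence_plot}   # PNG bytes from GridFS
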
class RegressionCrossValidation < CrossValidation
field :rmse, type: Float
field :mae, type: Float
- field :weighted_rmse, type: Float
- field :weighted_mae, type: Float
+ field :r_squared, type: Float
+ field :correlation_plot_id, type: BSON::ObjectId
+ field :confidence_plot_id, type: BSON::ObjectId # needed by confidence_plot below; the declaration in ClassificationCrossValidation is not shared with this sibling class
- def self.create model, n=10
- cv = self.new
- cv.save # set created_at
- validation_ids = []
- nr_instances = 0
- nr_unpredicted = 0
- predictions = []
- validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
- fold_nr = 1
- training_dataset = Dataset.find model.training_dataset_id
- training_dataset.folds(n).each do |fold|
- t = Time.now
- $logger.debug "Predicting fold #{fold_nr}"
-
- validation = validation_class.create(model, fold[0], fold[1])
- validation_ids << validation.id
- nr_instances += validation.nr_instances
- nr_unpredicted += validation.nr_unpredicted
- predictions += validation.predictions
- $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
- fold_nr +=1
- end
+ def statistics
rmse = 0
- weighted_rmse = 0
- rse = 0
- weighted_rse = 0
mae = 0
- weighted_mae = 0
- rae = 0
- weighted_rae = 0
- n = 0
- confidence_sum = 0
+ x = []
+ y = []
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
- if activity and prediction
- error = prediction-activity
- rmse += error**2
- weighted_rmse += confidence*error**2
- mae += error.abs
- weighted_mae += confidence*error.abs
- n += 1
- confidence_sum += confidence
+ if activity and prediction
+ unless activity == [nil]
+ x << -Math.log10(activity.median)
+ y << -Math.log10(prediction)
+ error = Math.log10(prediction)-Math.log10(activity.median)
+ rmse += error**2
+ #weighted_rmse += confidence*error**2
+ mae += error.abs
+ #weighted_mae += confidence*error.abs
+ #confidence_sum += confidence
+ end
else
- # TODO: create warnings
- p pred
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
end
end
- mae = mae/n
- weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/n)
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- cv.update_attributes(
- name: model.name,
- model_id: model.id,
- folds: n,
- validation_ids: validation_ids,
- nr_instances: nr_instances,
- nr_unpredicted: nr_unpredicted,
- predictions: predictions.sort{|a,b| b[3] <=> a[3]},
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+ r = R.eval("r").to_ruby
+
+ mae = mae/predictions.size
+ #weighted_mae = weighted_mae/confidence_sum
+ rmse = Math.sqrt(rmse/predictions.size)
+ #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+ update_attributes(
mae: mae,
rmse: rmse,
- weighted_mae: weighted_mae,
- weighted_rmse: weighted_rmse
+ #weighted_mae: weighted_mae,
+ #weighted_rmse: weighted_rmse,
+ r_squared: r**2,
+ finished_at: Time.now
)
- cv.save
- cv
+ $logger.debug "R^2 #{r**2}"
+ $logger.debug "RMSE #{rmse}"
+ $logger.debug "MAE #{mae}"
end
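
# Editor's worked example: the regression error above is taken in log10 space, so a
# prediction of 10 against a measured median of 1 (same unit) contributes 1.0 to the
# MAE sum and 1.0 to the squared-error sum that enters the RMSE.
error = Math.log10(10.0) - Math.log10(1.0)   # => 1.0
error.abs   # contribution to MAE
error**2    # contribution to RMSE (before taking the mean and square root)
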
- def plot
- # RMSE
- x = predictions.collect{|p| p[1]}
- y = predictions.collect{|p| p[2]}
- R.assign "Measurement", x
- R.assign "Prediction", y
- R.eval "par(pty='s')" # sets the plot type to be square
- #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
- #R.eval "error <- log(Measurement)-log(Prediction)"
- R.eval "error <- Measurement-Prediction"
- R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
- R.eval "mae <- mean( abs(error), na.rm = TRUE)"
- R.eval "r <- cor(log(Prediction),log(Measurement))"
- R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
- R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
- #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
- R.eval "abline(0,1,col='blue')"
- #R.eval "abline(fitline,col='red')"
- R.eval "dev.off()"
- "/tmp/#{id.to_s}.svg"
+ def misclassifications n=nil
+ #n = predictions.size unless n
+ n ||= 10
+ model = Model::Lazar.find(self.model_id)
+ training_dataset = Dataset.find(model.training_dataset_id)
+ prediction_feature = training_dataset.features.first
+ predictions.collect do |p|
+ unless p.include? nil
+ compound = Compound.find(p[0])
+ neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+ neighbors.collect! do |neighbor_tuple| # [compound_id, similarity]; avoid shadowing the method parameter n
+ neighbor = Compound.find(neighbor_tuple[0])
+ values = training_dataset.values(neighbor,prediction_feature)
+ { :smiles => neighbor.smiles, :similarity => neighbor_tuple[1], :measurements => values}
+ end
+ {
+ :smiles => compound.smiles,
+ #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
+ :measured => p[1],
+ :predicted => p[2],
+ #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+ :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+ :relative_error => (p[1]-p[2]).abs/p[1],
+ :confidence => p[3],
+ :neighbors => neighbors
+ }
+ end
+ end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
+ end
+
+ def confidence_plot
+ tmpfile = "/tmp/#{id.to_s}_confidence.png"
+ sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+ R.assign "error", sorted_predictions.collect{|p| p[0]}
+ R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+ # TODO fix axis names
+ R.eval "image = qplot(confidence,error)"
+ R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+ plot_id = $gridfs.insert_one(file)
+ update(:confidence_plot_id => plot_id)
+ $gridfs.find_one(_id: confidence_plot_id).data
+ end
+
+ def correlation_plot
+ unless correlation_plot_id
+ tmpfile = "/tmp/#{id.to_s}_correlation.png"
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
+ attributes = Model::Lazar.find(self.model_id).attributes
+ attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+ attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "all = c(-log(measurement),-log(prediction))"
+ R.eval "range = c(min(all), max(all))"
+ R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
+ plot_id = $gridfs.insert_one(file)
+ update(:correlation_plot_id => plot_id)
+ end
+ $gridfs.find_one(_id: correlation_plot_id).data
+ end
+ end
+
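# Editor's sketch: consuming the regression metrics (RMSE and MAE are computed on
# log10-transformed activities) and the new plots. `cv_id` is a placeholder.
require 'lazar'
include OpenTox
cv = RegressionCrossValidation.find cv_id   # cv_id: placeholder
puts "RMSE #{cv.rmse} MAE #{cv.mae} r^2 #{cv.r_squared}"
cv.misclassifications(5).each do |p|        # the five predictions with the largest relative error
  puts "#{p[:smiles]} measured: #{p[:measured]} predicted: #{p[:predicted]}"
end
File.open("correlation.png","wb"){|f| f.write cv.correlation_plot}
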
+ class RepeatedCrossValidation
+ field :crossvalidation_ids, type: Array, default: []
+ def self.create model, folds=10, repeats=3
+ repeated_cross_validation = self.new
+ repeats.times do |n|
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+ end
+ repeated_cross_validation.save
+ repeated_cross_validation
+ end
+ def crossvalidations
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
end
end
end
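
# Editor's sketch: RepeatedCrossValidation stores only the ids of its crossvalidations;
# aggregating statistics across repeats is left to the caller.
require 'lazar'
include OpenTox
model = Model::Lazar.find model_id                   # model_id: placeholder, as above
rcv = RepeatedCrossValidation.create model, 10, 3    # three independent 10-fold crossvalidations
rmses = rcv.crossvalidations.collect{|cv| cv.rmse}   # assumes a regression model
puts "mean RMSE #{rmses.reduce(:+)/rmses.size}"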