lib/crossvalidation.rb in lazar-0.0.7 vs lib/crossvalidation.rb in lazar-0.0.9

- removed (present in lazar-0.0.7)
+ added (new in lazar-0.0.9)

@@ -4,16 +4,62 @@
     field :validation_ids, type: Array, default: []
     field :model_id, type: BSON::ObjectId
     field :folds, type: Integer
     field :nr_instances, type: Integer
     field :nr_unpredicted, type: Integer
-    field :predictions, type: Array
+    field :predictions, type: Array, default: []
     field :finished_at, type: Time
 
     def time
       finished_at - created_at
     end
+
+    def validations
+      validation_ids.collect{|vid| Validation.find vid}
+    end
+
+    def model
+      Model::Lazar.find model_id
+    end
+
+    def self.create model, n=10
+      model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
+      bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+      cv = klass.new(
+        name: model.name,
+        model_id: model.id,
+        folds: n
+      )
+      cv.save # set created_at
+      nr_instances = 0
+      nr_unpredicted = 0
+      predictions = []
+      training_dataset = Dataset.find model.training_dataset_id
+      training_dataset.folds(n).each_with_index do |fold,fold_nr|
+        #fork do # parallel execution of validations
+          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+          t = Time.now
+          validation = Validation.create(model, fold[0], fold[1],cv)
+          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+        #end
+      end
+      #Process.waitall
+      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
+      cv.validations.each do |validation|
+        nr_instances += validation.nr_instances
+        nr_unpredicted += validation.nr_unpredicted
+        predictions += validation.predictions
+      end
+      cv.update_attributes(
+        nr_instances: nr_instances,
+        nr_unpredicted: nr_unpredicted,
+        predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+      cv.statistics
+      cv
+    end
   end
 
   class ClassificationCrossValidation < CrossValidation
 
     field :accept_values, type: Array

@@ -21,177 +67,234 @@
     field :weighted_confusion_matrix, type: Array
     field :accuracy, type: Float
     field :weighted_accuracy, type: Float
     field :true_rate, type: Hash
     field :predictivity, type: Hash
+    field :confidence_plot_id, type: BSON::ObjectId
     # TODO auc, f-measure (usability??)
 
-    def self.create model, n=10
-      cv = self.new
-      cv.save # set created_at
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+    def statistics
       accept_values = Feature.find(model.prediction_feature_id).accept_values
       confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       true_rate = {}
       predictivity = {}
-      fold_nr = 1
-      training_dataset = Dataset.find model.training_dataset_id
-      training_dataset.folds(n).each do |fold|
-        t = Time.now
-        $logger.debug "Fold #{fold_nr}"
-        validation = validation_class.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
-        validation.confusion_matrix.each_with_index do |r,i|
-          r.each_with_index do |c,j|
-            confusion_matrix[i][j] += c
-            weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
+      predictions.each do |pred|
+        compound_id,activities,prediction,confidence = pred
+        if activities and prediction #and confidence.numeric?
+          if activities.uniq.size == 1
+            activity = activities.uniq.first
+            if prediction == activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][0] += 1
+                #weighted_confusion_matrix[0][0] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][1] += 1
+                #weighted_confusion_matrix[1][1] += confidence
+              end
+            elsif prediction != activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][1] += 1
+                #weighted_confusion_matrix[0][1] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][0] += 1
+                #weighted_confusion_matrix[1][0] += confidence
+              end
+            end
           end
+        else
+          nr_unpredicted += 1 if prediction.nil?
         end
-        $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
-        fold_nr +=1
       end
       true_rate = {}
       predictivity = {}
       accept_values.each_with_index do |v,i|
         true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
         predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
       end
       confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
-        end
-      end
-      cv.update_attributes(
-        name: model.name,
-        model_id: model.id,
-        folds: n,
-        validation_ids: validation_ids,
-        nr_instances: nr_instances,
-        nr_unpredicted: nr_unpredicted,
+      #weighted_confusion_matrix.each do |r|
+        #r.each do |c|
+          #confidence_sum += c
+        #end
+      #end
+      update_attributes(
         accept_values: accept_values,
         confusion_matrix: confusion_matrix,
-        weighted_confusion_matrix: weighted_confusion_matrix,
+        #weighted_confusion_matrix: weighted_confusion_matrix,
         accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
         true_rate: true_rate,
         predictivity: predictivity,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
         finished_at: Time.now
       )
-      cv.save
-      cv
+      $logger.debug "Accuracy #{accuracy}"
     end
 
+    def confidence_plot
+      unless confidence_plot_id
+        tmpfile = "/tmp/#{id.to_s}_confidence.png"
+        accuracies = []
+        confidences = []
+        correct_predictions = 0
+        incorrect_predictions = 0
+        predictions.each do |p|
+          if p[1] and p[2]
+            p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
+            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+            confidences << p[3]
+
+          end
+        end
+        R.assign "accuracy", accuracies
+        R.assign "confidence", confidences
+        R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+        plot_id = $gridfs.insert_one(file)
+        update(:confidence_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+
     #Average area under roc 0.646
     #Area under roc 0.646
     #F measure carcinogen: 0.769, noncarcinogen: 0.348
   end
 
   class RegressionCrossValidation < CrossValidation
 
     field :rmse, type: Float
     field :mae, type: Float
-    field :weighted_rmse, type: Float
-    field :weighted_mae, type: Float
+    field :r_squared, type: Float
+    field :correlation_plot_id, type: BSON::ObjectId
 
-    def self.create model, n=10
-      cv = self.new
-      cv.save # set created_at
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
-      fold_nr = 1
-      training_dataset = Dataset.find model.training_dataset_id
-      training_dataset.folds(n).each do |fold|
-        t = Time.now
-        $logger.debug "Predicting fold #{fold_nr}"
-
-        validation = validation_class.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
-        $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
-        fold_nr +=1
-      end
+    def statistics
       rmse = 0
-      weighted_rmse = 0
-      rse = 0
-      weighted_rse = 0
       mae = 0
-      weighted_mae = 0
-      rae = 0
-      weighted_rae = 0
-      n = 0
-      confidence_sum = 0
+      x = []
+      y = []
       predictions.each do |pred|
         compound_id,activity,prediction,confidence = pred
-        if activity and prediction
-          error = prediction-activity
-          rmse += error**2
-          weighted_rmse += confidence*error**2
-          mae += error.abs
-          weighted_mae += confidence*error.abs
-          n += 1
-          confidence_sum += confidence
+        if activity and prediction
+          unless activity == [nil]
+            x << -Math.log10(activity.median)
+            y << -Math.log10(prediction)
+            error = Math.log10(prediction)-Math.log10(activity.median)
+            rmse += error**2
+            #weighted_rmse += confidence*error**2
+            mae += error.abs
+            #weighted_mae += confidence*error.abs
+            #confidence_sum += confidence
+          end
         else
-          # TODO: create warnings
-          p pred
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
        end
       end
-      mae = mae/n
-      weighted_mae = weighted_mae/confidence_sum
-      rmse = Math.sqrt(rmse/n)
-      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
-      cv.update_attributes(
-        name: model.name,
-        model_id: model.id,
-        folds: n,
-        validation_ids: validation_ids,
-        nr_instances: nr_instances,
-        nr_unpredicted: nr_unpredicted,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]},
+      R.assign "measurement", x
+      R.assign "prediction", y
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+      r = R.eval("r").to_ruby
+
+      mae = mae/predictions.size
+      #weighted_mae = weighted_mae/confidence_sum
+      rmse = Math.sqrt(rmse/predictions.size)
+      #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+      update_attributes(
         mae: mae,
         rmse: rmse,
-        weighted_mae: weighted_mae,
-        weighted_rmse: weighted_rmse
+        #weighted_mae: weighted_mae,
+        #weighted_rmse: weighted_rmse,
+        r_squared: r**2,
+        finished_at: Time.now
      )
-      cv.save
-      cv
+      $logger.debug "R^2 #{r**2}"
+      $logger.debug "RMSE #{rmse}"
+      $logger.debug "MAE #{mae}"
     end
 
-    def plot
-      # RMSE
-      x = predictions.collect{|p| p[1]}
-      y = predictions.collect{|p| p[2]}
-      R.assign "Measurement", x
-      R.assign "Prediction", y
-      R.eval "par(pty='s')" # sets the plot type to be square
-      #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
-      #R.eval "error <- log(Measurement)-log(Prediction)"
-      R.eval "error <- Measurement-Prediction"
-      R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
-      R.eval "mae <- mean( abs(error), na.rm = TRUE)"
-      R.eval "r <- cor(log(Prediction),log(Measurement))"
-      R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
-      R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
-      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
-      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
-      R.eval "abline(0,1,col='blue')"
-      #R.eval "abline(fitline,col='red')"
-      R.eval "dev.off()"
-      "/tmp/#{id.to_s}.svg"
+    def misclassifications n=nil
+      #n = predictions.size unless n
+      n ||= 10
+      model = Model::Lazar.find(self.model_id)
+      training_dataset = Dataset.find(model.training_dataset_id)
+      prediction_feature = training_dataset.features.first
+      predictions.collect do |p|
+        unless p.include? nil
+          compound = Compound.find(p[0])
+          neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+          neighbors.collect! do |n|
+            neighbor = Compound.find(n[0])
+            values = training_dataset.values(neighbor,prediction_feature)
+            { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+          end
+          {
+            :smiles => compound.smiles,
+            #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
+            :measured => p[1],
+            :predicted => p[2],
+            #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+            :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+            :relative_error => (p[1]-p[2]).abs/p[1],
+            :confidence => p[3],
+            :neighbors => neighbors
+          }
+        end
+      end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
+    end
+
+    def confidence_plot
+      tmpfile = "/tmp/#{id.to_s}_confidence.png"
+      sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+      R.assign "error", sorted_predictions.collect{|p| p[0]}
+      R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+      # TODO fix axis names
+      R.eval "image = qplot(confidence,error)"
+      R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
+      R.eval "ggsave(file='#{tmpfile}', plot=image)"
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+      plot_id = $gridfs.insert_one(file)
+      update(:confidence_plot_id => plot_id)
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+
+    def correlation_plot
+      unless correlation_plot_id
+        tmpfile = "/tmp/#{id.to_s}_correlation.png"
+        x = predictions.collect{|p| p[1]}
+        y = predictions.collect{|p| p[2]}
+        attributes = Model::Lazar.find(self.model_id).attributes
+        attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+        attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "all = c(-log(measurement),-log(prediction))"
+        R.eval "range = c(min(all), max(all))"
+        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+        R.eval "image = image + geom_abline(intercept=0, slope=1)"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
+        plot_id = $gridfs.insert_one(file)
+        update(:correlation_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: correlation_plot_id).data
+    end
+  end
+
+  class RepeatedCrossValidation
+    field :crossvalidation_ids, type: Array, default: []
+    def self.create model, folds=10, repeats=3
+      repeated_cross_validation = self.new
+      repeats.times do |n|
+        $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+        repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+      end
+      repeated_cross_validation.save
+      repeated_cross_validation
+    end
+    def crossvalidations
+      crossvalidation_ids.collect{|id| CrossValidation.find(id)}
     end
   end
 end
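For orientation, a minimal usage sketch of the 0.0.9 API introduced above might look as follows. It is not part of the diff: it assumes lazar 0.0.9 with a configured MongoDB backend, that the classes live in the OpenTox namespace as in earlier lazar releases, and that model_id is a hypothetical placeholder for the id of an already trained Model::Lazar document.

# Hedged usage sketch, not taken from the diff above.
require "lazar"
include OpenTox   # assumption: lazar classes are defined in the OpenTox module

model = Model::Lazar.find model_id           # model_id: hypothetical placeholder
cv = CrossValidation.create model, 10        # 0.0.9: picks Classification- or RegressionCrossValidation and runs statistics
puts cv.time                                 # finished_at - created_at, in seconds

case cv
when ClassificationCrossValidation
  puts cv.accuracy
when RegressionCrossValidation
  puts cv.rmse
  puts cv.r_squared
end

rcv = RepeatedCrossValidation.create model, 10, 3   # three independent 10-fold crossvalidations
rcv.crossvalidations.each { |c| puts c.nr_instances }

The main design change visible in the diff is that fold handling moved into CrossValidation.create on the base class, while the per-class self.create methods were replaced by statistics methods that only aggregate the stored predictions; the weighted (confidence-based) metrics were commented out and R-based qplot/ggsave plots stored in GridFS replaced the old svg plot method.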