lib/validation.rb in lazar-0.9.3 vs lib/validation.rb in lazar-1.0.0

- old
+ new

@@ -1,108 +1,25 @@ module OpenTox - class Validation + module Validation - field :model_id, type: BSON::ObjectId - field :prediction_dataset_id, type: BSON::ObjectId - field :crossvalidation_id, type: BSON::ObjectId - field :test_dataset_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Array + class Validation + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "validations" + field :name, type: String + field :model_id, type: BSON::ObjectId + field :nr_instances, type: Integer, default: 0 + field :nr_unpredicted, type: Integer, default: 0 + field :predictions, type: Hash, default: {} + field :finished_at, type: Time - def prediction_dataset - Dataset.find prediction_dataset_id - end - - def test_dataset - Dataset.find test_dataset_id - end - - def model - Model::Lazar.find model_id - end - - def self.create model, training_set, test_set, crossvalidation=nil - - atts = model.attributes.dup # do not modify attributes from original model - atts["_id"] = BSON::ObjectId.new - atts[:training_dataset_id] = training_set.id - validation_model = model.class.create training_set, atts - validation_model.save - cids = test_set.compound_ids - - test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used - prediction_dataset = validation_model.predict test_set_without_activities - predictions = [] - nr_unpredicted = 0 - activities = test_set.data_entries.collect{|de| de.first} - prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] #and de[1] - cid = prediction_dataset.compound_ids[i] - rows = cids.each_index.select{|r| cids[r] == cid } - activities = rows.collect{|r| test_set.data_entries[r][0]} - prediction = de.first - confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] - else - nr_unpredicted += 1 - end + def model + Model::Lazar.find model_id end - validation = self.new( - :model_id => validation_model.id, - :prediction_dataset_id => prediction_dataset.id, - :test_dataset_id => test_set.id, - :nr_instances => test_set.compound_ids.size, - :nr_unpredicted => nr_unpredicted, - :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence - ) - validation.crossvalidation_id = crossvalidation.id if crossvalidation - validation.save - validation - end - end - - class ClassificationValidation < Validation - end - - class RegressionValidation < Validation - - def statistics - rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 - mae = 0 - weighted_mae = 0 - confidence_sum = 0 - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - x = predictions.collect{|p| p[1].median} - y = predictions.collect{|p| p[2]} - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } end + end end