lib/validation.rb in lazar-0.9.3 vs lib/validation.rb in lazar-1.0.0
- old
+ new
@@ -1,108 +1,25 @@
module OpenTox
- class Validation
+ module Validation
- field :model_id, type: BSON::ObjectId
- field :prediction_dataset_id, type: BSON::ObjectId
- field :crossvalidation_id, type: BSON::ObjectId
- field :test_dataset_id, type: BSON::ObjectId
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ class Validation
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "validations"
+ field :name, type: String
+ field :model_id, type: BSON::ObjectId
+ field :nr_instances, type: Integer, default: 0
+ field :nr_unpredicted, type: Integer, default: 0
+ field :predictions, type: Hash, default: {}
+ field :finished_at, type: Time
- def prediction_dataset
- Dataset.find prediction_dataset_id
- end
-
- def test_dataset
- Dataset.find test_dataset_id
- end
-
- def model
- Model::Lazar.find model_id
- end
-
- def self.create model, training_set, test_set, crossvalidation=nil
-
- atts = model.attributes.dup # do not modify attributes from original model
- atts["_id"] = BSON::ObjectId.new
- atts[:training_dataset_id] = training_set.id
- validation_model = model.class.create training_set, atts
- validation_model.save
- cids = test_set.compound_ids
-
- test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
- prediction_dataset = validation_model.predict test_set_without_activities
- predictions = []
- nr_unpredicted = 0
- activities = test_set.data_entries.collect{|de| de.first}
- prediction_dataset.data_entries.each_with_index do |de,i|
- if de[0] #and de[1]
- cid = prediction_dataset.compound_ids[i]
- rows = cids.each_index.select{|r| cids[r] == cid }
- activities = rows.collect{|r| test_set.data_entries[r][0]}
- prediction = de.first
- confidence = de[1]
- predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
- else
- nr_unpredicted += 1
- end
+ def model
+ Model::Lazar.find model_id
end
- validation = self.new(
- :model_id => validation_model.id,
- :prediction_dataset_id => prediction_dataset.id,
- :test_dataset_id => test_set.id,
- :nr_instances => test_set.compound_ids.size,
- :nr_unpredicted => nr_unpredicted,
- :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
- )
- validation.crossvalidation_id = crossvalidation.id if crossvalidation
- validation.save
- validation
- end
- end
-
- class ClassificationValidation < Validation
- end
-
- class RegressionValidation < Validation
-
- def statistics
- rmse = 0
- weighted_rmse = 0
- rse = 0
- weighted_rse = 0
- mae = 0
- weighted_mae = 0
- confidence_sum = 0
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- weighted_rmse += confidence*error**2
- mae += error.abs
- weighted_mae += confidence*error.abs
- confidence_sum += confidence
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- x = predictions.collect{|p| p[1].median}
- y = predictions.collect{|p| p[2]}
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- mae = mae/predictions.size
- weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
end
+
end
end