lib/crossvalidation.rb in lazar-0.0.7 vs lib/crossvalidation.rb in lazar-0.0.9

- removed (present in lazar-0.0.7)
+ added (new in lazar-0.0.9)

@@ -4,16 +4,62 @@
     field :validation_ids, type: Array, default: []
     field :model_id, type: BSON::ObjectId
     field :folds, type: Integer
     field :nr_instances, type: Integer
     field :nr_unpredicted, type: Integer
-    field :predictions, type: Array
+    field :predictions, type: Array, default: []
     field :finished_at, type: Time
 
     def time
       finished_at - created_at
     end
+
+    def validations
+      validation_ids.collect{|vid| Validation.find vid}
+    end
+
+    def model
+      Model::Lazar.find model_id
+    end
+
+    def self.create model, n=10
+      model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
+      bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+      cv = klass.new(
+        name: model.name,
+        model_id: model.id,
+        folds: n
+      )
+      cv.save # set created_at
+      nr_instances = 0
+      nr_unpredicted = 0
+      predictions = []
+      training_dataset = Dataset.find model.training_dataset_id
+      training_dataset.folds(n).each_with_index do |fold,fold_nr|
+        #fork do # parallel execution of validations
+          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+          t = Time.now
+          validation = Validation.create(model, fold[0], fold[1],cv)
+          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+        #end
+      end
+      #Process.waitall
+      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
+      cv.validations.each do |validation|
+        nr_instances += validation.nr_instances
+        nr_unpredicted += validation.nr_unpredicted
+        predictions += validation.predictions
+      end
+      cv.update_attributes(
+        nr_instances: nr_instances,
+        nr_unpredicted: nr_unpredicted,
+        predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+      )
+      $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+      cv.statistics
+      cv
+    end
   end
 
   class ClassificationCrossValidation < CrossValidation
 
     field :accept_values, type: Array

@@ -21,177 +67,234 @@
     field :weighted_confusion_matrix, type: Array
     field :accuracy, type: Float
     field :weighted_accuracy, type: Float
     field :true_rate, type: Hash
     field :predictivity, type: Hash
+    field :confidence_plot_id, type: BSON::ObjectId
     # TODO auc, f-measure (usability??)
 
-    def self.create model, n=10
-      cv = self.new
-      cv.save # set created_at
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
+    def statistics
       accept_values = Feature.find(model.prediction_feature_id).accept_values
       confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
       true_rate = {}
       predictivity = {}
-      fold_nr = 1
-      training_dataset = Dataset.find model.training_dataset_id
-      training_dataset.folds(n).each do |fold|
-        t = Time.now
-        $logger.debug "Fold #{fold_nr}"
-        validation = validation_class.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
-        validation.confusion_matrix.each_with_index do |r,i|
-          r.each_with_index do |c,j|
-            confusion_matrix[i][j] += c
-            weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
+      predictions.each do |pred|
+        compound_id,activities,prediction,confidence = pred
+        if activities and prediction #and confidence.numeric?
+          if activities.uniq.size == 1
+            activity = activities.uniq.first
+            if prediction == activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][0] += 1
+                #weighted_confusion_matrix[0][0] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][1] += 1
+                #weighted_confusion_matrix[1][1] += confidence
+              end
+            elsif prediction != activity
+              if prediction == accept_values[0]
+                confusion_matrix[0][1] += 1
+                #weighted_confusion_matrix[0][1] += confidence
+              elsif prediction == accept_values[1]
+                confusion_matrix[1][0] += 1
+                #weighted_confusion_matrix[1][0] += confidence
+              end
+            end
           end
+        else
+          nr_unpredicted += 1 if prediction.nil?
         end
-        $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
-        fold_nr +=1
       end
       true_rate = {}
       predictivity = {}
       accept_values.each_with_index do |v,i|
         true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
         predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
       end
       confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
-        end
-      end
-      cv.update_attributes(
-        name: model.name,
-        model_id: model.id,
-        folds: n,
-        validation_ids: validation_ids,
-        nr_instances: nr_instances,
-        nr_unpredicted: nr_unpredicted,
+      #weighted_confusion_matrix.each do |r|
+        #r.each do |c|
+          #confidence_sum += c
+        #end
+      #end
+      update_attributes(
         accept_values: accept_values,
         confusion_matrix: confusion_matrix,
-        weighted_confusion_matrix: weighted_confusion_matrix,
+        #weighted_confusion_matrix: weighted_confusion_matrix,
         accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
         true_rate: true_rate,
         predictivity: predictivity,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]}, # sort according to confidence
         finished_at: Time.now
       )
-      cv.save
-      cv
+      $logger.debug "Accuracy #{accuracy}"
     end
 
+    def confidence_plot
+      unless confidence_plot_id
+        tmpfile = "/tmp/#{id.to_s}_confidence.png"
+        accuracies = []
+        confidences = []
+        correct_predictions = 0
+        incorrect_predictions = 0
+        predictions.each do |p|
+          if p[1] and p[2]
+            p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
+            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+            confidences << p[3]
+
+          end
+        end
+        R.assign "accuracy", accuracies
+        R.assign "confidence", confidences
+        R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+        plot_id = $gridfs.insert_one(file)
+        update(:confidence_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+
     #Average area under roc 0.646
     #Area under roc 0.646
     #F measure carcinogen: 0.769, noncarcinogen: 0.348
   end
 
   class RegressionCrossValidation < CrossValidation
 
     field :rmse, type: Float
     field :mae, type: Float
-    field :weighted_rmse, type: Float
-    field :weighted_mae, type: Float
+    field :r_squared, type: Float
+    field :correlation_plot_id, type: BSON::ObjectId
 
-    def self.create model, n=10
-      cv = self.new
-      cv.save # set created_at
-      validation_ids = []
-      nr_instances = 0
-      nr_unpredicted = 0
-      predictions = []
-      validation_class = Object.const_get(self.to_s.sub(/Cross/,''))
-      fold_nr = 1
-      training_dataset = Dataset.find model.training_dataset_id
-      training_dataset.folds(n).each do |fold|
-        t = Time.now
-        $logger.debug "Predicting fold #{fold_nr}"
-
-        validation = validation_class.create(model, fold[0], fold[1])
-        validation_ids << validation.id
-        nr_instances += validation.nr_instances
-        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
-        $logger.debug "Fold #{fold_nr}: #{Time.now-t} seconds"
-        fold_nr +=1
-      end
+    def statistics
       rmse = 0
-      weighted_rmse = 0
-      rse = 0
-      weighted_rse = 0
       mae = 0
-      weighted_mae = 0
-      rae = 0
-      weighted_rae = 0
-      n = 0
-      confidence_sum = 0
+      x = []
+      y = []
       predictions.each do |pred|
         compound_id,activity,prediction,confidence = pred
-        if activity and prediction
-          error = prediction-activity
-          rmse += error**2
-          weighted_rmse += confidence*error**2
-          mae += error.abs
-          weighted_mae += confidence*error.abs
-          n += 1
-          confidence_sum += confidence
+        if activity and prediction
+          unless activity == [nil]
+            x << -Math.log10(activity.median)
+            y << -Math.log10(prediction)
+            error = Math.log10(prediction)-Math.log10(activity.median)
+            rmse += error**2
+            #weighted_rmse += confidence*error**2
+            mae += error.abs
+            #weighted_mae += confidence*error.abs
+            #confidence_sum += confidence
+          end
         else
-          # TODO: create warnings
-          p pred
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
        end
       end
-      mae = mae/n
-      weighted_mae = weighted_mae/confidence_sum
-      rmse = Math.sqrt(rmse/n)
-      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
-      cv.update_attributes(
-        name: model.name,
-        model_id: model.id,
-        folds: n,
-        validation_ids: validation_ids,
-        nr_instances: nr_instances,
-        nr_unpredicted: nr_unpredicted,
-        predictions: predictions.sort{|a,b| b[3] <=> a[3]},
+      R.assign "measurement", x
+      R.assign "prediction", y
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+      r = R.eval("r").to_ruby
+
+      mae = mae/predictions.size
+      #weighted_mae = weighted_mae/confidence_sum
+      rmse = Math.sqrt(rmse/predictions.size)
+      #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+      update_attributes(
         mae: mae,
         rmse: rmse,
-        weighted_mae: weighted_mae,
-        weighted_rmse: weighted_rmse
+        #weighted_mae: weighted_mae,
+        #weighted_rmse: weighted_rmse,
+        r_squared: r**2,
+        finished_at: Time.now
      )
-      cv.save
-      cv
+      $logger.debug "R^2 #{r**2}"
+      $logger.debug "RMSE #{rmse}"
+      $logger.debug "MAE #{mae}"
     end
 
-    def plot
-      # RMSE
-      x = predictions.collect{|p| p[1]}
-      y = predictions.collect{|p| p[2]}
-      R.assign "Measurement", x
-      R.assign "Prediction", y
-      R.eval "par(pty='s')" # sets the plot type to be square
-      #R.eval "fitline <- lm(log(Prediction) ~ log(Measurement))"
-      #R.eval "error <- log(Measurement)-log(Prediction)"
-      R.eval "error <- Measurement-Prediction"
-      R.eval "rmse <- sqrt(mean(error^2,na.rm=T))"
-      R.eval "mae <- mean( abs(error), na.rm = TRUE)"
-      R.eval "r <- cor(log(Prediction),log(Measurement))"
-      R.eval "svg(filename='/tmp/#{id.to_s}.svg')"
-      R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: ',r^2),asp=1)"
-      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', sub=paste('RMSE: ',rmse, 'MAE :',mae, 'r^2: '),asp=1)"
-      #R.eval "plot(log(Prediction),log(Measurement),main='#{self.name}', ,asp=1)"
-      R.eval "abline(0,1,col='blue')"
-      #R.eval "abline(fitline,col='red')"
-      R.eval "dev.off()"
-      "/tmp/#{id.to_s}.svg"
+    def misclassifications n=nil
+      #n = predictions.size unless n
+      n ||= 10
+      model = Model::Lazar.find(self.model_id)
+      training_dataset = Dataset.find(model.training_dataset_id)
+      prediction_feature = training_dataset.features.first
+      predictions.collect do |p|
+        unless p.include? nil
+          compound = Compound.find(p[0])
+          neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+          neighbors.collect! do |n|
+            neighbor = Compound.find(n[0])
+            values = training_dataset.values(neighbor,prediction_feature)
+            { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+          end
+          {
+            :smiles => compound.smiles,
+            #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
+            :measured => p[1],
+            :predicted => p[2],
+            #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+            :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+            :relative_error => (p[1]-p[2]).abs/p[1],
+            :confidence => p[3],
+            :neighbors => neighbors
+          }
+        end
+      end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
+    end
+
+    def confidence_plot
+      tmpfile = "/tmp/#{id.to_s}_confidence.png"
+      sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+      R.assign "error", sorted_predictions.collect{|p| p[0]}
+      R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+      # TODO fix axis names
+      R.eval "image = qplot(confidence,error)"
+      R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
+      R.eval "ggsave(file='#{tmpfile}', plot=image)"
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png")
+      plot_id = $gridfs.insert_one(file)
+      update(:confidence_plot_id => plot_id)
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+
+    def correlation_plot
+      unless correlation_plot_id
+        tmpfile = "/tmp/#{id.to_s}_correlation.png"
+        x = predictions.collect{|p| p[1]}
+        y = predictions.collect{|p| p[2]}
+        attributes = Model::Lazar.find(self.model_id).attributes
+        attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+        attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "all = c(-log(measurement),-log(prediction))"
+        R.eval "range = c(min(all), max(all))"
+        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+        R.eval "image = image + geom_abline(intercept=0, slope=1)"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
+        plot_id = $gridfs.insert_one(file)
+        update(:correlation_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: correlation_plot_id).data
+    end
+  end
+
+  class RepeatedCrossValidation
+    field :crossvalidation_ids, type: Array, default: []
+    def self.create model, folds=10, repeats=3
+      repeated_cross_validation = self.new
+      repeats.times do |n|
+        $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+        repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+      end
+      repeated_cross_validation.save
+      repeated_cross_validation
+    end
+    def crossvalidations
+      crossvalidation_ids.collect{|id| CrossValidation.find(id)}
     end
   end
 end
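For orientation, a minimal usage sketch of the 0.0.9 API introduced above might look as follows. It is not part of the diff: it assumes lazar 0.0.9 with a configured MongoDB backend, that the classes live in the OpenTox namespace as in earlier lazar releases, and that model_id is a hypothetical placeholder for the id of an already trained Model::Lazar document.

# Hedged usage sketch, not taken from the diff above.
require "lazar"
include OpenTox   # assumption: lazar classes are defined in the OpenTox module

model = Model::Lazar.find model_id           # model_id: hypothetical placeholder
cv = CrossValidation.create model, 10        # 0.0.9: picks Classification- or RegressionCrossValidation and runs statistics
puts cv.time                                 # finished_at - created_at, in seconds

case cv
when ClassificationCrossValidation
  puts cv.accuracy
when RegressionCrossValidation
  puts cv.rmse
  puts cv.r_squared
end

rcv = RepeatedCrossValidation.create model, 10, 3   # three independent 10-fold crossvalidations
rcv.crossvalidations.each { |c| puts c.nr_instances }

The main design change visible in the diff is that fold handling moved into CrossValidation.create on the base class, while the per-class self.create methods were replaced by statistics methods that only aggregate the stored predictions; the weighted (confidence-based) metrics were commented out and R-based qplot/ggsave plots stored in GridFS replaced the old svg plot method.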