db.rb in perus-0.1.12

- old
+ new

@@ -2,10 +2,12 @@
 require 'sequel/plugins/serialization'
 require 'concurrent'
 
 module Perus::Server
     module DB
+        MAX_VACUUM_ATTEMPTS = 5
+
         def self.db
             @db
         end
 
         def self.start
@@ -40,11 +42,35 @@
             # performance rather than reclaim unused space. as old values and
             # metrics are deleted the data become very fragmented. vacuuming
             # restructures the db so system records in the values index should
             # be sequentially stored
             vacuum_task = Concurrent::TimerTask.new do
-                @db.execute('vacuum')
+                # retry the vacuum up to MAX_VACUUM_ATTEMPTS times, sleeping after each failure
+                attempts = 0
+                complete = false
+                while !complete && attempts < MAX_VACUUM_ATTEMPTS
+                    begin
+                        puts "Vacuuming, attempt #{attempts + 1}"
+                        start = Process.clock_gettime(Process::CLOCK_MONOTONIC) # immune to clock resets
+                        @db.execute('vacuum')
+                        Stats.vacuumed!(Process.clock_gettime(Process::CLOCK_MONOTONIC) - start)
+                        complete = true
+                        puts "Vacuuming complete"
+                    rescue => e # report the cause rather than discarding it
+                        attempts += 1
+                        puts "Vacuum attempt #{attempts} failed: #{e.class}: #{e.message}"
+                        if attempts < MAX_VACUUM_ATTEMPTS
+                            puts "Vacuum failed, will reattempt after short sleep"
+                            sleep(5) # seconds
+                        end
+                    end
+                end
+
+                unless complete
+                    puts "Vacuum failed #{MAX_VACUUM_ATTEMPTS} times, giving up"
+                    Stats.vacuumed!('failed')
+                end
             end
 
             # fire every 12 hours
             vacuum_task.execution_interval = 60 * 60 * 12
             vacuum_task.execute
@@ -52,11 +78,17 @@
             # a fixed number of hours of data are kept in the database. once an
             # hour, old values and files are removed. if all values of a metric
             # are removed from a system, the accompanying metric record is also
             # removed.
             cleanup_task = Concurrent::TimerTask.new do
-                Perus::Server::DB.cleanup
+                begin # time one cleanup pass (monotonic clock) and pass the elapsed seconds to Stats
+                    start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                    Perus::Server::DB.cleanup
+                    Stats.cleaned!(Process.clock_gettime(Process::CLOCK_MONOTONIC) - start)
+                rescue # any StandardError: record 'failed' in place of a duration
+                    Stats.cleaned!('failed')
+                end
             end
 
             # fire every hour
             cleanup_task.execution_interval = 60 * 60
             cleanup_task.execute
@@ -64,13 +96,19 @@
             # alerts can be process intensive, so to keep page refreshes
             # responsive the 'active' state of an alert for each system is
             # cached so lookups can be done against the db, rather than running
             # each alert for each system on a page load.
             cache_alerts_task = Concurrent::TimerTask.new do
-                Perus::Server::Alert.cache_active_alerts
+                begin # time one alert-caching pass (monotonic clock) and pass the elapsed seconds to Stats
+                    start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                    Perus::Server::Alert.cache_active_alerts
+                    Stats.alerts_cached!(Process.clock_gettime(Process::CLOCK_MONOTONIC) - start)
+                rescue # any StandardError: record 'failed' in place of a duration
+                    Stats.alerts_cached!('failed')
+                end
             end
 
-            cache_alerts_task.execution_interval = Server.options.cache_alerts_mins * 60
+            cache_alerts_task.execution_interval = Server.options.cache_alerts_secs
             cache_alerts_task.execute
         end
 
         def self.cleanup
             puts 'Cleaning old values and metrics'