postfix-exporter in postfix-exporter-2.1.0

- old
+ new

@@ -6,11 +6,15 @@
 require 'rack/handler/webrick'
 require 'logger'
 
 prometheus = Prometheus::Client.registry
 
+prometheus.gauge(:postfix_exporter_start_time_seconds, "When this process started up").set({}, Time.now.to_f)
+
 mailq = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue")
+q_err = prometheus.counter(:postfix_queue_processing_error_total, "Exceptions raised whilst scanning the Postfix queue")
+up    = prometheus.gauge(:postfix_up, "Whether the master process is running or not")
 
 Thread.abort_on_exception = true
 
 Thread.new do
 	loop do
@@ -20,22 +24,53 @@
 			end
 
 			# deferred is special, because it's often hueg it gets sharded into
 			# multiple subdirectories
 			mailq.set({ queue: 'deferred' }, Dir["/var/spool/postfix/deferred/*/*"].size)
-
-			sleep 5
 		rescue StandardError => ex
 			$stderr.puts "Error while monitoring queue sizes: #{ex.message} (#{ex.class})"
 			$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
-			sleep 1
+			q_err.increment(class: ex.class.to_s, phase: "scan")
 		end
+
+		begin
+			master_pid = File.read("/var/spool/postfix/pid/master.pid").to_i
+
+			if master_pid > 1
+				Process.kill(0, master_pid)
+				# If we get here, then the process exists, and
+				# that'll do for our purposes
+				up.set({}, 1)
+			else
+				up.set({}, 0)
+			end
+		rescue Errno::ENOENT, Errno::ESRCH, Errno::EACCES
+			up.set({}, 0)
+		rescue Errno::EPERM
+			# Ironically, we don't need to be able to *actually*
+			# signal the process; EPERM means it exists and is running
+			# as someone more privileged than us, which is enough
+			# for our purposes
+			up.set({}, 1)
+		rescue StandardError => ex
+			$stderr.puts "Error while checking master process: #{ex.message} (#{ex.class})"
+			$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
+			q_err.increment(class: ex.class.to_s, phase: "up")
+		end
+
+		sleep 5
+
 	end
 end
 
 if ENV["SYSLOG_SOCKET"]
-	delays   = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages")
+	delays     = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages")
+	connects   = prometheus.counter(:postfix_smtpd_connections_total, "Connections to smtpd")
+	active     = prometheus.gauge(:postfix_smtpd_active_connections, "Current connections to smtpd")
+	incoming   = prometheus.counter(:postfix_incoming_delivery_attempts_total, "Delivery attempts, labelled by dsn and status")
+	messages   = prometheus.counter(:postfix_log_messages_total, "Syslog messages received, labelled by how it was handled")
+	log_errors = prometheus.counter(:postfix_log_processing_error_total, "Exceptions raised whilst processing log messages")
 
 	Thread.new do
 		begin
 			s = Socket.new(Socket::AF_UNIX, Socket::SOCK_DGRAM, 0)
 			s.bind(Socket.pack_sockaddr_un(ENV["SYSLOG_SOCKET"]))
@@ -53,13 +88,39 @@
 					status = $4
 
 					if status == "bounced" or status == "sent"
 						delays.observe({dsn: dsn, status: status}, delay)
 					end
+
+					messages.increment(type: "delay")
+				elsif msg =~ %r{postfix/smtpd\[\d+\]: connect from }
+					connects.increment({})
+					active.send(:synchronize) { active.set({}, (active.get({}) || 0) + 1) } # parenthesised: `||` binds looser than `+`
+					messages.increment(type: "connect")
+				elsif msg =~ %r{postfix/smtpd\[\d+\]: disconnect from }
+					active.send(:synchronize) do
+					  new = (active.get({}) || 0) - 1
+					  # If we start running mid-stream,
+					  # we might end up seeing more
+					  # disconnects than connections,
+					  # which would be confusing
+					  new = 0 if new < 0
+					  active.set({}, new)
+					end
+					messages.increment(type: "disconnect")
+				elsif msg =~ %r{postfix/smtpd\[\d+\]: [A-F0-9]+: client=}
+					incoming.increment(dsn: "2.0.0", status: "queued")
+					messages.increment(type: "queued")
+				elsif msg =~ %r{postfix/smtpd\[\d+\]: NOQUEUE: reject: RCPT from \S+: \d{3} (\d+\.\d+\.\d+) }
+					incoming.increment(dsn: $1, status: "rejected")
+					messages.increment(type: "noqueue")
+				else
+					messages.increment(type: "ignored")
 				end
 			rescue StandardError => ex
 				$stderr.puts "Error while receiving postfix logs: #{ex.message} (#{ex.class})"
 				$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
+				log_errors.increment(class: ex.class.to_s)
 				sleep 1
 			end
 		end
 	end
 end