postfix-exporter in postfix-exporter-1.0.0

- old
+ new

@@ -1,22 +1,18 @@
 #!/usr/bin/env ruby
 
-require 'rack'
-require 'prometheus/middleware/exporter'
+require 'prometheus/client/rack/exporter'
 require 'socket'
+require 'docker'
+require 'rack'
 require 'rack/handler/webrick'
 require 'logger'
 
 prometheus = Prometheus::Client.registry
 
-prometheus.gauge(:postfix_exporter_start_time_seconds, "When this process started up").set({}, Time.now.to_f)
+mailq = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue")
 
-oldest = prometheus.gauge(:postfix_oldest_message_timestamp_seconds, "Queue time of the oldest message")
-mailq  = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue")
-q_err  = prometheus.counter(:postfix_queue_processing_error_total, "Exceptions raised whilst scanning the Postfix queue")
-up     = prometheus.gauge(:postfix_up, "Whether the master process is running or not")
-
 Thread.abort_on_exception = true
 
 Thread.new do
 	loop do
 		begin
@@ -25,88 +21,23 @@
 			end
 
 			# deferred is special, because it's often hueg it gets sharded into
 			# multiple subdirectories
 			mailq.set({ queue: 'deferred' }, Dir["/var/spool/postfix/deferred/*/*"].size)
+
+			sleep 5
 		rescue StandardError => ex
 			$stderr.puts "Error while monitoring queue sizes: #{ex.message} (#{ex.class})"
 			$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
-			q_err.increment(class: ex.class.to_s, phase: "scan")
+			sleep 1
 		end
-
-		begin
-			master_pid = File.read("/var/spool/postfix/pid/master.pid").to_i
-
-			if master_pid > 1
-				Process.kill(0, master_pid)
-				# If we get here, then the process exists, and
-				# that'll do for our purposes
-				up.set({}, 1)
-			else
-				up.set({}, 0)
-			end
-		rescue Errno::ENOENT, Errno::ESRCH, Errno::EACCES
-			up.set({}, 0)
-		rescue Errno::EPERM
-			# Ironically, we don't need to be able to *actually*
-			# signal the process; EPERM means it exists and is running
-			# as someone more privileged than us, which is enough
-			# for our purposes
-			up.set({}, 1)
-		rescue StandardError => ex
-			$stderr.puts "Error while checking master process: #{ex.message} (#{ex.class})"
-			$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
-			q_err.increment(class: ex.class.to_s, phase: "up")
-		end
-
-		sleep 5
-
 	end
 end
 
-Thread.new do
-	earliest_ctime = ->(glob) do
-		# There is seemingly no way to unset or remove a gauge metric in the Ruby
-		# implementation of the prom exporter.  As a hack, we return the current
-		# time in cases where there is nothing to sample.
-		now = Time.now.to_i
-
-		Dir[glob].lazy.map do |n|
-			begin
-				File.stat(n).ctime.to_i
-			rescue Errno::ENOENT
-				now
-			end
-		end.min || now
-	end
-
-	loop do
-		begin
-			%w{incoming active corrupt hold}.each do |q|
-				oldest.set({ queue: q }, earliest_ctime["/var/spool/postfix/#{q}/*"])
-			end
-			oldest.set({ queue: 'deferred' }, earliest_ctime["/var/spool/postfix/deferred/*/*"])
-		rescue StandardError => ex
-			$stderr.puts "Error while sampling message ages: #{ex.message} (#{ex.class})"
-			$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
-			q_err.increment(class: ex.class.to_s, phase: "stat")
-		end
-
-		# stat()ing all the files in a large queue could potentially be quite
-		# expensive, so we sample this data less frequently.
-		sleep 60
-
-	end
-end
-
 if ENV["SYSLOG_SOCKET"]
-	delays     = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages")
-	connects   = prometheus.counter(:postfix_smtpd_connections_total, "Connections to smtpd")
-	active     = prometheus.gauge(:postfix_smtpd_active_connections, "Current connections to smtpd")
-	incoming   = prometheus.counter(:postfix_incoming_delivery_attempts_total, "Delivery attempts, labelled by dsn and status")
-	messages   = prometheus.counter(:postfix_log_messages_total, "Syslog messages received, labelled by how it was handled")
-	log_errors = prometheus.counter(:postfix_log_processing_error_total, "Exceptions raised whilst processing log messages")
+	delays   = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages")
+	statuses = prometheus.counter(:postfix_deliveries, "How many messages have been delivered (or bounced)")
 
 	Thread.new do
 		begin
 			s = Socket.new(Socket::AF_UNIX, Socket::SOCK_DGRAM, 0)
 			s.bind(Socket.pack_sockaddr_un(ENV["SYSLOG_SOCKET"]))
@@ -116,68 +47,33 @@
 		end
 
 		loop do
 			begin
 				msg = s.recvmsg.first
-				if msg =~ %r{postfix/.* delay=(\d+(\.\d+)?), .* dsn=(\d+\.\d+\.\d+), status=(\w+)}
+				if msg =~ %r{postfix/smtp.* delay=(\d+(\.\d+)?), .* dsn=(\d+\.\d+\.\d+), status=(\w+)}
 					delay  = $1.to_f
 					dsn    = $3
 					status = $4
 
 					if status == "bounced" or status == "sent"
-						delays.observe({dsn: dsn, status: status}, delay)
+						statuses.increment(dsn: dsn, status: status)
+						delays.add({dsn: dsn, status: status}, delay)
 					end
-
-					messages.increment(type: "delay")
-				elsif msg =~ %r{postfix/smtpd\[\d+\]: connect from }
-					connects.increment({})
-					active.send(:synchronize) { active.set({}, active.get({}) || 0 + 1) }
-					messages.increment(type: "connect")
-				elsif msg =~ %r{postfix/smtpd\[\d+\]: disconnect from }
-					active.send(:synchronize) do
-					  new = (active.get({}) || 0) - 1
-					  # If we start running mid-stream,
-					  # we might end up seeing more
-					  # disconnects than connections,
-					  # which would be confusing
-					  new = 0 if new < 0
-					  active.set({}, new)
-					end
-					messages.increment(type: "disconnect")
-				elsif msg =~ %r{postfix/smtpd\[\d+\]: [A-F0-9]+: client=}
-					incoming.increment(dsn: "2.0.0", status: "queued")
-					messages.increment(type: "queued")
-				elsif msg =~ %r{postfix/smtpd\[\d+\]: NOQUEUE: reject: RCPT from \S+: \d{3} (\d+\.\d+\.\d+) }
-					incoming.increment(dsn: $1, status: "rejected")
-					messages.increment(type: "noqueue")
-				else
-					messages.increment(type: "ignored")
 				end
 			rescue StandardError => ex
 				$stderr.puts "Error while receiving postfix logs: #{ex.message} (#{ex.class})"
 				$stderr.puts ex.backtrace.map { |l| "  #{l}" }.join("\n")
-				log_errors.increment(class: ex.class.to_s)
 				sleep 1
 			end
 		end
 	end
 end
 
 app = Rack::Builder.new
-app.use Rack::Deflater, if: ->(_, _, _, body) { body.any? && body[0].length > 512 }
-app.use Prometheus::Middleware::Exporter
+app.use Prometheus::Client::Rack::Exporter
 app.run ->(env) { [404, {'Content-Type' => 'text/plain'}, ['NOPE NOPE NOPE NOPE']] }
 
 logger = Logger.new($stderr)
 logger.level = Logger::INFO
 logger.formatter = proc { |s, t, p, m| "WEBrick: #{m}\n" }
 
-# This is the only way to get the Rack-mediated webrick to listen on both
-# INADDR_ANY and IN6ADDR_ANY on libcs that don't support getaddrinfo("*")
-# (ie musl-libc).  Setting `Host: '*'` barfs on the above-mentioned buggy(?)
-# libcs, `Host: '::'` fails on newer rubies (because they use
-# setsockopt(V6ONLY) by default), and with RACK_ENV at its default of
-# "development", it only listens on localhost.  And even *this* only works
-# on Rack 2, because before that the non-development default listen address
-# was "0.0.0.0"!
-ENV['RACK_ENV'] = "none"
-Rack::Handler::WEBrick.run app, Port: 9154, Logger: logger, AccessLog: []
+Rack::Handler::WEBrick.run app, Host: '::', Port: 9154, Logger: logger, AccessLog: []