#!/usr/bin/env ruby

# Prometheus exporter for Postfix.
#
# Background threads periodically sample the Postfix spool directory (queue
# sizes, oldest message ctime, master process liveness) and, when the
# SYSLOG_SOCKET environment variable is set, parse Postfix log lines received
# on a UNIX datagram socket bound at that path.  The collected metrics are
# served over HTTP by WEBrick on port 9154 via Prometheus::Middleware::Exporter.

require 'rack'
require 'prometheus/middleware/exporter'
require 'socket'
require 'rack/handler/webrick'
require 'logger'

prometheus = Prometheus::Client.registry

prometheus.gauge(:postfix_exporter_start_time_seconds, "When this process started up").set({}, Time.now.to_f)

oldest = prometheus.gauge(:postfix_oldest_message_timestamp_seconds, "Queue time of the oldest message")
mailq  = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue")
q_err  = prometheus.counter(:postfix_queue_processing_error_total, "Exceptions raised whilst scanning the Postfix queue")
up     = prometheus.gauge(:postfix_up, "Whether the master process is running or not")

# An exception that escapes any of the worker threads below takes the whole
# process down rather than leaving a silently dead sampler behind.
Thread.abort_on_exception = true

# Sampler 1: queue sizes and master-process liveness, every 5 seconds.
Thread.new do
  loop do
    begin
      %w{incoming active corrupt hold}.each do |q|
        mailq.set({ queue: q }, Dir["/var/spool/postfix/#{q}/*"].size)
      end

      # deferred is special, because it's often huge it gets sharded into
      # multiple subdirectories
      mailq.set({ queue: 'deferred' }, Dir["/var/spool/postfix/deferred/*/*"].size)
    rescue StandardError => ex
      $stderr.puts "Error while monitoring queue sizes: #{ex.message} (#{ex.class})"
      $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n")
      q_err.increment(class: ex.class.to_s, phase: "scan")
    end

    begin
      master_pid = File.read("/var/spool/postfix/pid/master.pid").to_i

      if master_pid > 1
        Process.kill(0, master_pid)
        # If we get here, then the process exists, and
        # that'll do for our purposes
        up.set({}, 1)
      else
        up.set({}, 0)
      end
    rescue Errno::ENOENT, Errno::ESRCH, Errno::EACCES
      up.set({}, 0)
    rescue Errno::EPERM
      # Ironically, we don't need to be able to *actually*
      # signal the process; EPERM means it exists and is running
      # as someone more privileged than us, which is enough
      # for our purposes
      up.set({}, 1)
    rescue StandardError => ex
      $stderr.puts "Error while checking master process: #{ex.message} (#{ex.class})"
      $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n")
      q_err.increment(class: ex.class.to_s, phase: "up")
    end

    sleep 5
  end
end

# Sampler 2: ctime of the oldest file in each queue, every 60 seconds.
Thread.new do
  # Returns the smallest ctime (as integer epoch seconds) among the files
  # matching +glob+, or the current time when nothing matches.
  earliest_ctime = ->(glob) do
    # There is seemingly no way to unset or remove a gauge metric in the Ruby
    # implementation of the prom exporter. As a hack, we return the current
    # time in cases where there is nothing to sample.
    now = Time.now.to_i

    Dir[glob].lazy.map do |n|
      begin
        File.stat(n).ctime.to_i
      rescue Errno::ENOENT
        # The file vanished between the glob and the stat; count it as "now"
        # so it cannot become the minimum.
        now
      end
    end.min || now
  end

  loop do
    begin
      %w{incoming active corrupt hold}.each do |q|
        oldest.set({ queue: q }, earliest_ctime["/var/spool/postfix/#{q}/*"])
      end

      oldest.set({ queue: 'deferred' }, earliest_ctime["/var/spool/postfix/deferred/*/*"])
    rescue StandardError => ex
      $stderr.puts "Error while sampling message ages: #{ex.message} (#{ex.class})"
      $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n")
      q_err.increment(class: ex.class.to_s, phase: "stat")
    end

    # stat()ing all the files in a large queue could potentially be quite
    # expensive, so we sample this data less frequently.
    sleep 60
  end
end

# Optional log-driven metrics: only enabled when SYSLOG_SOCKET names the path
# of a UNIX datagram socket on which Postfix log lines will be received.
if ENV["SYSLOG_SOCKET"]
  delays     = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages")
  connects   = prometheus.counter(:postfix_smtpd_connections_total, "Connections to smtpd")
  active     = prometheus.gauge(:postfix_smtpd_active_connections, "Current connections to smtpd")
  incoming   = prometheus.counter(:postfix_incoming_delivery_attempts_total, "Delivery attempts, labelled by dsn and status")
  messages   = prometheus.counter(:postfix_log_messages_total, "Syslog messages received, labelled by how it was handled")
  log_errors = prometheus.counter(:postfix_log_processing_error_total, "Exceptions raised whilst processing log messages")

  Thread.new do
    # Create the socket once and retry only the bind, so that a retry does
    # not leave an unclosed socket behind on every attempt.
    s = Socket.new(Socket::AF_UNIX, Socket::SOCK_DGRAM, 0)

    begin
      s.bind(Socket.pack_sockaddr_un(ENV["SYSLOG_SOCKET"]))
    rescue Errno::EEXIST, Errno::EADDRINUSE
      # Something already exists at the socket path; remove it and bind again.
      File.unlink ENV["SYSLOG_SOCKET"]
      retry
    end

    loop do
      begin
        msg = s.recvmsg.first

        if msg =~ %r{postfix/.* delay=(\d+(\.\d+)?), .* dsn=(\d+\.\d+\.\d+), status=(\w+)}
          delay  = $1.to_f
          dsn    = $3
          status = $4

          # Only bounced and sent messages are observed in the delay summary;
          # every matching line is still counted as a "delay" log message.
          if status == "bounced" || status == "sent"
            delays.observe({dsn: dsn, status: status}, delay)
          end

          messages.increment(type: "delay")
        elsif msg =~ %r{postfix/smtpd\[\d+\]: connect from }
          connects.increment({})
          # The parentheses matter: `x || 0 + 1` parses as `x || (0 + 1)`,
          # which would never increment an already-set gauge.
          # NOTE(review): `send` is used to reach the metric's `synchronize`
          # method, presumably because it is not public in this client
          # version -- confirm against the installed prometheus-client.
          active.send(:synchronize) { active.set({}, (active.get({}) || 0) + 1) }
          messages.increment(type: "connect")
        elsif msg =~ %r{postfix/smtpd\[\d+\]: disconnect from }
          active.send(:synchronize) do
            new = (active.get({}) || 0) - 1
            # If we start running mid-stream,
            # we might end up seeing more
            # disconnects than connections,
            # which would be confusing
            new = 0 if new < 0
            active.set({}, new)
          end
          messages.increment(type: "disconnect")
        elsif msg =~ %r{postfix/smtpd\[\d+\]: [A-F0-9]+: client=}
          incoming.increment(dsn: "2.0.0", status: "queued")
          messages.increment(type: "queued")
        elsif msg =~ %r{postfix/smtpd\[\d+\]: NOQUEUE: reject: RCPT from \S+: \d{3} (\d+\.\d+\.\d+) }
          incoming.increment(dsn: $1, status: "rejected")
          messages.increment(type: "noqueue")
        else
          messages.increment(type: "ignored")
        end
      rescue StandardError => ex
        $stderr.puts "Error while receiving postfix logs: #{ex.message} (#{ex.class})"
        $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n")
        log_errors.increment(class: ex.class.to_s)
        # Back off briefly so a persistent failure does not spin the loop.
        sleep 1
      end
    end
  end
end

# Rack application: the Prometheus exporter middleware sits in front of a
# catch-all 404 responder; responses whose first body chunk is longer than
# 512 are passed through Rack::Deflater.
app = Rack::Builder.new
app.use Rack::Deflater, if: ->(_, _, _, body) { body.any? && body[0].length > 512 }
app.use Prometheus::Middleware::Exporter
app.run ->(env) { [404, {'Content-Type' => 'text/plain'}, ['NOPE NOPE NOPE NOPE']] }

logger = Logger.new($stderr)
logger.level = Logger::INFO
logger.formatter = proc { |s, t, p, m| "WEBrick: #{m}\n" }

# This is the only way to get the Rack-mediated webrick to listen on both
# INADDR_ANY and IN6ADDR_ANY on libcs that don't support getaddrinfo("*")
# (ie musl-libc).  Setting `Host: '*'` barfs on the above-mentioned buggy(?)
# libcs, `Host: '::'` fails on newer rubies (because they use
# setsockopt(V6ONLY) by default), and with RACK_ENV at its default of
# "development", it only listens on localhost.  And even *this* only works
# on Rack 2, because before that the non-development default listen address
# was "0.0.0.0"!
ENV['RACK_ENV'] = "none"

Rack::Handler::WEBrick.run app, Port: 9154, Logger: logger, AccessLog: []