bin/postfix-exporter in postfix-exporter-0.1.0 vs bin/postfix-exporter in postfix-exporter-1.0.0

- old
+ new

@@ -1,22 +1,18 @@ #!/usr/bin/env ruby -require 'rack' -require 'prometheus/middleware/exporter' +require 'prometheus/client/rack/exporter' require 'socket' +require 'docker' +require 'rack' require 'rack/handler/webrick' require 'logger' prometheus = Prometheus::Client.registry -prometheus.gauge(:postfix_exporter_start_time_seconds, "When this process started up").set({}, Time.now.to_f) +mailq = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue") -oldest = prometheus.gauge(:postfix_oldest_message_timestamp_seconds, "Queue time of the oldest message") -mailq = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue") -q_err = prometheus.counter(:postfix_queue_processing_error_total, "Exceptions raised whilst scanning the Postfix queue") -up = prometheus.gauge(:postfix_up, "Whether the master process is running or not") - Thread.abort_on_exception = true Thread.new do loop do begin @@ -25,88 +21,23 @@ end # deferred is special, because it's often hueg it gets sharded into # multiple subdirectories mailq.set({ queue: 'deferred' }, Dir["/var/spool/postfix/deferred/*/*"].size) + + sleep 5 rescue StandardError => ex $stderr.puts "Error while monitoring queue sizes: #{ex.message} (#{ex.class})" $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") - q_err.increment(class: ex.class.to_s, phase: "scan") + sleep 1 end - - begin - master_pid = File.read("/var/spool/postfix/pid/master.pid").to_i - - if master_pid > 1 - Process.kill(0, master_pid) - # If we get here, then the process exists, and - # that'll do for our purposes - up.set({}, 1) - else - up.set({}, 0) - end - rescue Errno::ENOENT, Errno::ESRCH, Errno::EACCES - up.set({}, 0) - rescue Errno::EPERM - # Ironically, we don't need to be able to *actually* - # signal the process; EPERM means it exists and is running - # as someone more privileged than us, which is enough - # for our purposes - up.set({}, 1) - rescue StandardError => ex - $stderr.puts "Error while checking master process: #{ex.message} (#{ex.class})" - $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") - q_err.increment(class: ex.class.to_s, phase: "up") - end - - sleep 5 - end end -Thread.new do - earliest_ctime = ->(glob) do - # There is seemingly no way to unset or remove a gauge metric in the Ruby - # implementation of the prom exporter. As a hack, we return the current - # time in cases where there is nothing to sample. - now = Time.now.to_i - - Dir[glob].lazy.map do |n| - begin - File.stat(n).ctime.to_i - rescue Errno::ENOENT - now - end - end.min || now - end - - loop do - begin - %w{incoming active corrupt hold}.each do |q| - oldest.set({ queue: q }, earliest_ctime["/var/spool/postfix/#{q}/*"]) - end - oldest.set({ queue: 'deferred' }, earliest_ctime["/var/spool/postfix/deferred/*/*"]) - rescue StandardError => ex - $stderr.puts "Error while sampling message ages: #{ex.message} (#{ex.class})" - $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") - q_err.increment(class: ex.class.to_s, phase: "stat") - end - - # stat()ing all the files in a large queue could potentially be quite - # expensive, so we sample this data less frequently. - sleep 60 - - end -end - if ENV["SYSLOG_SOCKET"] - delays = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages") - connects = prometheus.counter(:postfix_smtpd_connections_total, "Connections to smtpd") - active = prometheus.gauge(:postfix_smtpd_active_connections, "Current connections to smtpd") - incoming = prometheus.counter(:postfix_incoming_delivery_attempts_total, "Delivery attempts, labelled by dsn and status") - messages = prometheus.counter(:postfix_log_messages_total, "Syslog messages received, labelled by how it was handled") - log_errors = prometheus.counter(:postfix_log_processing_error_total, "Exceptions raised whilst processing log messages") + delays = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages") + statuses = prometheus.counter(:postfix_deliveries, "How many messages have been delivered (or bounced)") Thread.new do begin s = Socket.new(Socket::AF_UNIX, Socket::SOCK_DGRAM, 0) s.bind(Socket.pack_sockaddr_un(ENV["SYSLOG_SOCKET"])) @@ -116,68 +47,33 @@ end loop do begin msg = s.recvmsg.first - if msg =~ %r{postfix/.* delay=(\d+(\.\d+)?), .* dsn=(\d+\.\d+\.\d+), status=(\w+)} + if msg =~ %r{postfix/smtp.* delay=(\d+(\.\d+)?), .* dsn=(\d+\.\d+\.\d+), status=(\w+)} delay = $1.to_f dsn = $3 status = $4 if status == "bounced" or status == "sent" - delays.observe({dsn: dsn, status: status}, delay) + statuses.increment(dsn: dsn, status: status) + delays.add({dsn: dsn, status: status}, delay) end - - messages.increment(type: "delay") - elsif msg =~ %r{postfix/smtpd\[\d+\]: connect from } - connects.increment({}) - active.send(:synchronize) { active.set({}, active.get({}) || 0 + 1) } - messages.increment(type: "connect") - elsif msg =~ %r{postfix/smtpd\[\d+\]: disconnect from } - active.send(:synchronize) do - new = (active.get({}) || 0) - 1 - # If we start running mid-stream, - # we might end up seeing more - # disconnects than connections, - # which would be confusing - new = 0 if new < 0 - active.set({}, new) - end - messages.increment(type: "disconnect") - elsif msg =~ %r{postfix/smtpd\[\d+\]: [A-F0-9]+: client=} - incoming.increment(dsn: "2.0.0", status: "queued") - messages.increment(type: "queued") - elsif msg =~ %r{postfix/smtpd\[\d+\]: NOQUEUE: reject: RCPT from \S+: \d{3} (\d+\.\d+\.\d+) } - incoming.increment(dsn: $1, status: "rejected") - messages.increment(type: "noqueue") - else - messages.increment(type: "ignored") end rescue StandardError => ex $stderr.puts "Error while receiving postfix logs: #{ex.message} (#{ex.class})" $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") - log_errors.increment(class: ex.class.to_s) sleep 1 end end end end app = Rack::Builder.new -app.use Rack::Deflater, if: ->(_, _, _, body) { body.any? && body[0].length > 512 } -app.use Prometheus::Middleware::Exporter +app.use Prometheus::Client::Rack::Exporter app.run ->(env) { [404, {'Content-Type' => 'text/plain'}, ['NOPE NOPE NOPE NOPE']] } logger = Logger.new($stderr) logger.level = Logger::INFO logger.formatter = proc { |s, t, p, m| "WEBrick: #{m}\n" } -# This is the only way to get the Rack-mediated webrick to listen on both -# INADDR_ANY and IN6ADDR_ANY on libcs that don't support getaddrinfo("*") -# (ie musl-libc). Setting `Host: '*'` barfs on the above-mentioned buggy(?) -# libcs, `Host: '::'` fails on newer rubies (because they use -# setsockopt(V6ONLY) by default), and with RACK_ENV at its default of -# "development", it only listens on localhost. And even *this* only works -# on Rack 2, because before that the non-development default listen address -# was "0.0.0.0"! -ENV['RACK_ENV'] = "none" -Rack::Handler::WEBrick.run app, Port: 9154, Logger: logger, AccessLog: [] +Rack::Handler::WEBrick.run app, Host: '::', Port: 9154, Logger: logger, AccessLog: []