bin/postfix-exporter in postfix-exporter-2.0.3 vs bin/postfix-exporter in postfix-exporter-2.1.0

- old
+ new

@@ -6,11 +6,15 @@ require 'rack/handler/webrick' require 'logger' prometheus = Prometheus::Client.registry +prometheus.gauge(:postfix_exporter_start_time_seconds, "When this process started up").set({}, Time.now.to_f) + mailq = prometheus.gauge(:postfix_queue_size, "Number of messages in the mail queue") +q_err = prometheus.counter(:postfix_queue_processing_error_total, "Exceptions raised whilst scanning the Postfix queue") +up = prometheus.gauge(:postfix_up, "Whether the master process is running or not") Thread.abort_on_exception = true Thread.new do loop do @@ -20,22 +24,53 @@ end # deferred is special, because it's often hueg it gets sharded into # multiple subdirectories mailq.set({ queue: 'deferred' }, Dir["/var/spool/postfix/deferred/*/*"].size) - - sleep 5 rescue StandardError => ex $stderr.puts "Error while monitoring queue sizes: #{ex.message} (#{ex.class})" $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") - sleep 1 + q_err.increment(class: ex.class.to_s, phase: "scan") end + + begin + master_pid = File.read("/var/spool/postfix/pid/master.pid").to_i + + if master_pid > 1 + Process.kill(0, master_pid) + # If we get here, then the process exists, and + # that'll do for our purposes + up.set({}, 1) + else + up.set({}, 0) + end + rescue Errno::ENOENT, Errno::ESRCH, Errno::EACCES + up.set({}, 0) + rescue Errno::EPERM + # Ironically, we don't need to be able to *actually* + # signal the process; EPERM means it exists and is running + # as someone more privileged than us, which is enough + # for our purposes + up.set({}, 1) + rescue StandardError => ex + $stderr.puts "Error while checking master process: #{ex.message} (#{ex.class})" + $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") + q_err.increment(class: ex.class.to_s, phase: "up") + end + + sleep 5 + end end if ENV["SYSLOG_SOCKET"] - delays = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages") + delays = prometheus.summary(:postfix_delivery_delays, "Distribution of time taken to deliver (or bounce) messages") + connects = prometheus.counter(:postfix_smtpd_connections_total, "Connections to smtpd") + active = prometheus.gauge(:postfix_smtpd_active_connections, "Current connections to smtpd") + incoming = prometheus.counter(:postfix_incoming_delivery_attempts_total, "Delivery attempts, labelled by dsn and status") + messages = prometheus.counter(:postfix_log_messages_total, "Syslog messages received, labelled by how it was handled") + log_errors = prometheus.counter(:postfix_log_processing_error_total, "Exceptions raised whilst processing log messages") Thread.new do begin s = Socket.new(Socket::AF_UNIX, Socket::SOCK_DGRAM, 0) s.bind(Socket.pack_sockaddr_un(ENV["SYSLOG_SOCKET"])) @@ -53,13 +88,39 @@ status = $4 if status == "bounced" or status == "sent" delays.observe({dsn: dsn, status: status}, delay) end + + messages.increment(type: "delay") + elsif msg =~ %r{postfix/smtpd\[\d+\]: connect from } + connects.increment({}) + active.send(:synchronize) { active.set({}, active.get({}) || 0 + 1) } + messages.increment(type: "connect") + elsif msg =~ %r{postfix/smtpd\[\d+\]: disconnect from } + active.send(:synchronize) do + new = (active.get({}) || 0) - 1 + # If we start running mid-stream, + # we might end up seeing more + # disconnects than connections, + # which would be confusing + new = 0 if new < 0 + active.set({}, new) + end + messages.increment(type: "disconnect") + elsif msg =~ %r{postfix/smtpd\[\d+\]: [A-F0-9]+: client=} + incoming.increment(dsn: "2.0.0", status: "queued") + messages.increment(type: "queued") + elsif msg =~ %r{postfix/smtpd\[\d+\]: NOQUEUE: reject: RCPT from \S+: \d{3} (\d+\.\d+\.\d+) } + incoming.increment(dsn: $1, status: "rejected") + messages.increment(type: "noqueue") + else + messages.increment(type: "ignored") end rescue StandardError => ex $stderr.puts "Error while receiving postfix logs: #{ex.message} (#{ex.class})" $stderr.puts ex.backtrace.map { |l| " #{l}" }.join("\n") + log_errors.increment(class: ex.class.to_s) sleep 1 end end end end