monitoring: send alert emails for failed services

this idea is based on
 https://utcc.utoronto.ca/~cks/space/blog/linux/SystemdTimersMailNotes
and the page linked from there,
 https://wiki.archlinux.org/title/Systemd/Timers#MAILTO
but using a top-level systemd override to send such alerts for all
service units on parsons, not just timers. Tested by sending SIGKILL to
monit a couple times & receiving emails.

We might now get two emails for some failing units, or possibly even
three! (if is-system-running is false, caused by a service unit failure,
and monit also notices the service not running). On the other hand, we
now also get emails if monit fails.
This commit is contained in:
stuebinm 2025-02-01 16:54:59 +01:00
parent 0f3c41e548
commit 064a9a05dc
2 changed files with 32 additions and 1 deletions

View file

@ -19,7 +19,7 @@
./tracktrain.nix
./uffd.nix
./lxc.nix
./monit.nix
./monitoring.nix
];
hacc.bindToPersist = [ "/var/lib/acme" ];

View file

@ -36,6 +36,27 @@ let
exit 1
end
'';
# Shell script that mails the `systemctl status` output of a failed unit to
# admin@hacc.space via `sendmail -t` (recipients are read from the headers).
#   $1: name of the unit to report on; it is used both in the subject line
#       and as the argument to `systemctl status`.
# `sendmail` and `systemctl` are looked up on PATH, so the calling service
# has to provide them.
unitFailedAlertScript = pkgs.writeShellScript "unit-failed-alert" ''
  sendmail -t <<ERRMAIL
  To: admin@hacc.space
  From: systemd <root@$HOSTNAME>
  Subject: unit $1 failed
  Content-Transfer-Encoding: 8bit
  Content-Type: text/plain; charset=UTF-8

  $(systemctl status --full "$1")
  ERRMAIL
'';
# Package with two systemd drop-ins, both named toplevel-override.conf:
#  * service.d/ applies to every service unit and makes a failure start
#    service-failed-email@<unit name>.service (%n is the full unit name).
#  * service-failed-email@.service.d/ masks the drop-in above for the alert
#    template itself. Without it, a failing alert unit (e.g. sendmail not
#    reachable) would trigger an alert about itself, which could fail again,
#    and so on. A unit-specific drop-in with the same file name takes
#    precedence over the type-wide one.
systemdTopLevelOverride = pkgs.symlinkJoin {
  name = "systemd-service-toplevel-override";
  paths = [
    (pkgs.writeTextFile {
      name = "systemd-service-toplevel-override-all";
      destination = "/etc/systemd/system/service.d/toplevel-override.conf";
      text = ''
        [Unit]
        OnFailure=service-failed-email@%n.service
      '';
    })
    (pkgs.writeTextFile {
      name = "systemd-service-toplevel-override-mask";
      destination = "/etc/systemd/system/service-failed-email@.service.d/toplevel-override.conf";
      text = ''
        # Intentionally empty: masks service.d/toplevel-override.conf so that
        # failures of the alert unit do not recursively trigger more alerts.
      '';
    })
  ];
};
in
{
mailserver.monitoring = {
@ -61,4 +82,14 @@ in
check program check-deploy-age path ${lib.getExe checkDeployAge}
if status == 1 then alert
'';
# Template unit started through the OnFailure= drop-in defined in this file.
# The instance name (%i) is handed to the alert script as the name of the
# unit to report on.
systemd.services."service-failed-email@" = {
  serviceConfig = {
    Type = "oneshot";
    # DynamicUser = true; # TODO: figure out how to make postfix accept emails if this is set
    ExecStart = "${unitFailedAlertScript} %i";
  };
  # postfix is on the path for the `sendmail` binary the script calls.
  path = [ pkgs.bash pkgs.postfix ];
};
# Register the drop-in package (it provides
# /etc/systemd/system/service.d/toplevel-override.conf) with systemd.
systemd.packages = [ systemdTopLevelOverride ];
}