monitoring: send alert emails for failed services
This idea is based on https://utcc.utoronto.ca/~cks/space/blog/linux/SystemdTimersMailNotes and the Arch Wiki section it links to, https://wiki.archlinux.org/title/Systemd/Timers#MAILTO, but uses a top-level systemd override to send such alerts for all service units on parsons, not just timers. Tested by sending SIGKILL to monit a couple of times and receiving emails. We might now get two or even three emails for some failing units: one from this override, one if is-system-running reports false because of the unit failure, and one if monit also notices that the service is not running. On the other hand, we now also get emails if monit itself fails.
This commit is contained in:
parent
0f3c41e548
commit
064a9a05dc
2 changed files with 32 additions and 1 deletions
|
@ -19,7 +19,7 @@
|
|||
./tracktrain.nix
|
||||
./uffd.nix
|
||||
./lxc.nix
|
||||
./monit.nix
|
||||
./monitoring.nix
|
||||
];
|
||||
|
||||
hacc.bindToPersist = [ "/var/lib/acme" ];
|
||||
|
|
|
@ -36,6 +36,27 @@ let
|
|||
exit 1
|
||||
end
|
||||
'';
|
||||
|
||||
# Shell script that mails a failure report for one systemd unit.
# Argument $1 is the unit name; it is used in the Subject header and passed to
# `systemctl status --full`, whose output becomes the message body.
# The message is handed to `sendmail -t`, so the recipient is taken from the
# To: header inside the heredoc (admin@hacc.space).
# $HOSTNAME and $1 are expanded by the shell at run time, not by Nix.
unitFailedAlertScript = pkgs.writeShellScript "unit-failed-alert" ''
|
||||
sendmail -t <<ERRMAIL
|
||||
To: admin@hacc.space
|
||||
From: systemd <root@$HOSTNAME>
|
||||
Subject: unit $1 failed
|
||||
Content-Transfer-Encoding: 8bit
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
$(systemctl status --full "$1")
|
||||
ERRMAIL
|
||||
'';
|
||||
|
||||
# Package containing a single systemd drop-in file, installed at
# /etc/systemd/system/service.d/toplevel-override.conf.
# The drop-in sets OnFailure= to an instance of the service-failed-email@
# template, with the failing unit's full name (%n) as the instance name.
# NOTE(review): a service.d drop-in presumably also applies to the
# service-failed-email@ instances themselves, so a failing alert unit could
# trigger another alert unit. The Arch Wiki page this is based on suggests an
# empty unit-specific drop-in of the same file name to prevent that — TODO confirm.
systemdTopLevelOverride = pkgs.writeTextFile {
|
||||
name = "systemd-service-toplevel-override";
|
||||
destination = "/etc/systemd/system/service.d/toplevel-override.conf";
|
||||
text = ''
|
||||
[Unit]
|
||||
OnFailure=service-failed-email@%n.service
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
mailserver.monitoring = {
|
||||
|
@ -61,4 +82,14 @@ in
|
|||
check program check-deploy-age path ${lib.getExe checkDeployAge}
|
||||
if status == 1 then alert
|
||||
'';
|
||||
|
||||
# Template unit referenced by the OnFailure= setting in systemdTopLevelOverride.
# Each instance runs unitFailedAlertScript once, passing the instance name (%i)
# as the script's first argument.
systemd.services."service-failed-email@" = {
|
||||
# bash and postfix are put on the unit's PATH; presumably postfix supplies the
# `sendmail` binary the alert script calls — verify.
path = [ pkgs.bash pkgs.postfix ];
|
||||
serviceConfig.Type = "oneshot";
|
||||
# serviceConfig.DynamicUser = true; # TODO: figure out how to make postfix accept emails if this is set
|
||||
serviceConfig.ExecStart =
|
||||
"${unitFailedAlertScript} %i";
|
||||
};
|
||||
|
||||
# Register the drop-in package so that its
# /etc/systemd/system/service.d/toplevel-override.conf is picked up by systemd.
systemd.packages = [ systemdTopLevelOverride ];
|
||||
}
|
Loading…
Add table
Reference in a new issue