haccfiles/parsons/monitoring.nix
stuebinm 064a9a05dc monitoring: send alert emails for failed services
this idea is based on
 https://utcc.utoronto.ca/~cks/space/blog/linux/SystemdTimersMailNotes
and the therein linked
 https://wiki.archlinux.org/title/Systemd/Timers#MAILTO
but using a top-level systemd override to send such alerts for all
service units on parsons, not just timers. Tested by sending SIGKILL to
monit a couple times & receiving emails.

We might now get two emails for some failing units, or possibly even
three! (if is-system-running is false, caused by a service unit failure,
and monit also notices the service not running). On the other hand, we
now also get emails if monit fails.
2025-02-01 17:00:03 +01:00

95 lines
3.1 KiB
Nix

{ config, options, lib, pkgs, ... }:
let
# check-commit-hash: compares the commit recorded in /etc/haccfiles-commit
# with the head commit of the main branch as reported by the forge's API.
#
# Exit status:
#   0  recorded commit and main agree
#   1  they differ
#   2  the comparison could not be made (API lookup failed or returned no
#      commit id, or /etc/haccfiles-commit could not be read)
checkHash = pkgs.writeScriptBin "check-commit-hash" ''
  #!${lib.getExe pkgs.fish}
  # jq is referenced by store path (like curl) so the check does not depend
  # on whatever PATH the caller happens to provide.
  set wanted (${lib.getExe pkgs.curl} -s https://git.infra4future.de/api/v1/repos/hacc/haccfiles/branches/main \
    -H 'accept: application/json' | ${lib.getExe pkgs.jq} -r .commit.id)
  # $status only reflects jq, the last command of the pipeline, and jq exits
  # 0 on empty input. So also treat an empty result (curl produced nothing)
  # and a literal "null" (response without .commit.id) as a failed lookup.
  if test $status != 0; or test -z "$wanted"; or test "$wanted" = null
    echo "could not reach git.infra4future.de"
    exit 2
  end
  set actual (cat /etc/haccfiles-commit)
  if test $status != 0
    echo "/etc/haccfiles-commit does not exist??"
    exit 2
  end
  # quoted so that an empty value is compared instead of making test error out
  if test "$actual" != "$wanted"
    echo "parsons was built on $actual, but commit on main is $wanted"
    exit 1
  end
'';
# check-deploy-age: prints a reminder and exits 1 once the timestamp stored
# in /etc/haccfiles-timestamp is at least ten days in the past.
checkDeployAge = pkgs.writeScriptBin "check-deploy-age" ''
  #!${lib.getExe pkgs.fish}
  # ten days, in seconds
  set max_age (expr 3600 \* 24 \* 10)
  # the timestamp is read from a file at runtime (rather than being baked
  # into this script) so monit's config won't change on each deploy
  set deployed_at (cat /etc/haccfiles-timestamp)
  set now (date +%s)
  set elapsed (expr $now - $deployed_at)
  if test $elapsed -ge $max_age
    echo "${config.networking.hostName} has not been deployed since 10 days, perhaps someone should do updates?"
    exit 1
  end
'';
# unit-failed-alert <unit>: mails the output of `systemctl status --full`
# for the given unit to admin@hacc.space via `sendmail -t` (recipients are
# taken from the To: header of the message).
#
# $1 is the name of the failed unit; it ends up in the subject line and is
# the unit whose status is put into the mail body.
unitFailedAlertScript = pkgs.writeShellScript "unit-failed-alert" ''
  sendmail -t <<ERRMAIL
  To: admin@hacc.space
  From: systemd <root@$HOSTNAME>
  Subject: unit $1 failed
  Content-Transfer-Encoding: 8bit
  Content-Type: text/plain; charset=UTF-8

  $(systemctl status --full "$1")
  ERRMAIL
'';
# Package of systemd drop-ins that wires failure notifications into every
# service unit:
#
#  * service.d/toplevel-override.conf is a type-level drop-in, i.e. it is
#    applied to all .service units, and adds an OnFailure= dependency on
#    service-failed-email@<failed unit name>.service.
#
#  * service-failed-email@.service.d/toplevel-override.conf has the same
#    file name in a more specific drop-in directory and therefore takes the
#    place of the type-level file for the alert unit itself. Without it, a
#    failing alert unit would trigger
#    service-failed-email@service-failed-email@….service, and so on.
systemdTopLevelOverride = pkgs.symlinkJoin {
  name = "systemd-service-toplevel-override";
  paths = [
    (pkgs.writeTextFile {
      name = "systemd-service-toplevel-override-all";
      destination = "/etc/systemd/system/service.d/toplevel-override.conf";
      text = ''
        [Unit]
        OnFailure=service-failed-email@%n.service
      '';
    })
    (pkgs.writeTextFile {
      name = "systemd-service-toplevel-override-mask";
      destination = "/etc/systemd/system/service-failed-email@.service.d/toplevel-override.conf";
      text = ''
        # Intentionally without settings: overrides the same-named drop-in in
        # service.d/ so that the alert unit does not trigger itself on failure.
      '';
    })
  ];
};
in
{
# Enable the mailserver module's monitoring, reusing its default config
# with the literal "port 22" swapped for the first configured sshd port.
mailserver.monitoring =
  let
    sshPort = toString (lib.head config.services.openssh.ports);
    defaultConfig = options.mailserver.monitoring.config.default;
  in
  {
    enable = true;
    alertAddress = "admin@hacc.space";
    config = lib.replaceStrings [ "port 22" ] [ "port ${sshPort}" ] defaultConfig;
  };
# Additional monit checks:
#  * onlyoffice: restart the LXC container (lxc-stop / lxc-start) if HTTPS on
#    port 443 of onlyoffice.infra4future.de does not answer with status 302.
#  * deployed-commit-on-main: runs check-commit-hash; alert if it reports a
#    mismatch (exit 1) for 64 cycles, or if the comparison could not be made
#    (exit 2) for 3 cycles.
#  * is-system-running: alert whenever `systemctl is-system-running` exits
#    non-zero.
#  * check-deploy-age: alert when check-deploy-age exits 1, i.e. the last
#    deploy is ten or more days old.
services.monit.config = ''
check host onlyoffice with address onlyoffice.infra4future.de
start program "/run/current-system/sw/bin/lxc-start -n onlyoffice -f /persist/lxc/onlyoffice/config"
stop program "/run/current-system/sw/bin/lxc-stop -n onlyoffice"
if failed port 443 protocol https status = 302
then restart
check program deployed-commit-on-main path ${lib.getExe checkHash}
if status == 1 for 64 cycles then alert
if status == 2 for 3 cycles then alert
check program is-system-running path ${pkgs.systemd}/bin/systemctl is-system-running
if status != 0 then alert
check program check-deploy-age path ${lib.getExe checkDeployAge}
if status == 1 then alert
'';
# Template unit behind the OnFailure= line in systemdTopLevelOverride: the
# instance name (%i) is handed to the alert script as the unit to report on.
systemd.services."service-failed-email@" = {
  serviceConfig = {
    Type = "oneshot";
    # DynamicUser = true; # TODO: figure out how to make postfix accept emails if this is set
    ExecStart = "${unitFailedAlertScript} %i";
  };
  # put into the unit's PATH; pkgs.postfix is listed for the script's `sendmail` call
  path = [ pkgs.bash pkgs.postfix ];
};
# Install the drop-in file(s) from systemdTopLevelOverride; systemd.packages
# is used because the file lives under /etc/systemd/system inside a package.
systemd.packages = [ systemdTopLevelOverride ];
}