This idea is based on https://utcc.utoronto.ca/~cks/space/blog/linux/SystemdTimersMailNotes and the page it links to, https://wiki.archlinux.org/title/Systemd/Timers#MAILTO, but uses a top-level systemd override to send such alerts for all service units on parsons, not just timers. Tested by sending SIGKILL to monit a couple of times and receiving emails. We might now get two emails for some failing units, or possibly even three (if is-system-running is false because of a service unit failure, and monit also notices that the service is not running). On the other hand, we now also get emails if monit itself fails.
95 lines
3.1 KiB
Nix
95 lines
3.1 KiB
Nix
{ config, options, lib, pkgs, ... }:
|
|
|
|
let
|
|
# check-commit-hash: compares the commit recorded in /etc/haccfiles-commit with
# the current head of `main` as reported by the forge's branch API.
#
# Exit codes (consumed by the monit `check program` rule):
#   0  deployed commit matches main
#   1  deployed commit differs from main
#   2  the comparison could not be made (forge unreachable, unusable answer,
#      or /etc/haccfiles-commit unreadable)
checkHash = pkgs.writeScriptBin "check-commit-hash" ''
  #!${lib.getExe pkgs.fish}

  # Fetch and parse in two separate steps: with `curl | jq` in one pipeline the
  # status seen afterwards is jq's, which hides a failed download.
  # -f makes curl fail on HTTP errors instead of handing an error body to jq.
  set response (${lib.getExe pkgs.curl} -sf https://git.infra4future.de/api/v1/repos/hacc/haccfiles/branches/main \
    -H 'accept: application/json')
  if test $status != 0
    echo "could not reach git.infra4future.de"
    exit 2
  end

  set wanted (printf '%s\n' $response | ${lib.getExe pkgs.jq} -r .commit.id)
  # jq prints "null" when the field is absent and nothing at all on empty input
  if test $status != 0; or test -z "$wanted"; or test "$wanted" = null
    echo "could not parse branch info from git.infra4future.de"
    exit 2
  end

  set actual (cat /etc/haccfiles-commit)
  if test $status != 0
    echo "/etc/haccfiles-commit does not exist??"
    exit 2
  end

  # quoted so that an empty value cannot turn this into a malformed test
  if test "$actual" != "$wanted"
    echo "parsons was built on $actual, but commit on main is $wanted"
    exit 1
  end
'';
|
|
|
|
# check-deploy-age: exits 1 (with a message) when the timestamp stored in
# /etc/haccfiles-timestamp is at least ten days old, or when that timestamp
# cannot be read at all; exits 0 otherwise.
checkDeployAge = pkgs.writeScriptBin "check-deploy-age" ''
  #!${lib.getExe pkgs.fish}

  set now (date +%s)
  # we do this indirection here so monit's config won't change on each deploy
  set deploytimestamp (cat /etc/haccfiles-timestamp)
  # Without a timestamp the age computation below would fail and the check
  # would pass silently, so report it through the same exit code monit alerts on.
  if test $status != 0; or test -z "$deploytimestamp"
    echo "/etc/haccfiles-timestamp is missing or empty, cannot determine deploy age"
    exit 1
  end

  set age (expr $now - $deploytimestamp)

  # threshold: ten days, in seconds
  if test $age -ge (expr 3600 \* 24 \* 10)
    echo "${config.networking.hostName} has not been deployed since 10 days, perhaps someone should do updates?"
    exit 1
  end
'';
|
|
|
|
# unit-failed-alert <unit>: mails the `systemctl status --full` output of the
# given unit to admin@hacc.space via `sendmail -t`. The script's exit status is
# that of sendmail.
unitFailedAlertScript = pkgs.writeShellScript "unit-failed-alert" ''
  failed_unit="$1"
  status_report="$(systemctl status --full "$failed_unit")"

  sendmail -t <<ERRMAIL
  To: admin@hacc.space
  From: systemd <root@$HOSTNAME>
  Subject: unit $failed_unit failed
  Content-Transfer-Encoding: 8bit
  Content-Type: text/plain; charset=UTF-8

  $status_report
  ERRMAIL
'';
|
|
|
|
# Package containing a single drop-in at
# /etc/systemd/system/service.d/toplevel-override.conf which sets OnFailure=
# to an instance of the service-failed-email@ template, instantiated with the
# failing unit's name (%n).
systemdTopLevelOverride =
  let
    dropInText = ''
      [Unit]
      OnFailure=service-failed-email@%n.service
    '';
  in
  pkgs.writeTextFile {
    name = "systemd-service-toplevel-override";
    text = dropInText;
    destination = "/etc/systemd/system/service.d/toplevel-override.conf";
  };
|
|
in
|
|
{
  # Mailserver monitoring: start from the option's default monit config and
  # point its "port 22" check at the first configured OpenSSH port instead.
  mailserver.monitoring =
    let
      sshPort = toString (lib.head config.services.openssh.ports);
    in
    {
      enable = true;
      alertAddress = "admin@hacc.space";
      config = lib.replaceStrings [ "port 22" ] [ "port ${sshPort}" ]
        options.mailserver.monitoring.config.default;
    };

  # Additional monit checks: the onlyoffice container (restarted when the
  # HTTPS check fails), the deployed commit, overall systemd state, and the
  # age of the last deploy.
  services.monit.config = ''
    check host onlyoffice with address onlyoffice.infra4future.de
      start program "/run/current-system/sw/bin/lxc-start -n onlyoffice -f /persist/lxc/onlyoffice/config"
      stop program "/run/current-system/sw/bin/lxc-stop -n onlyoffice"
      if failed port 443 protocol https status = 302
      then restart

    check program deployed-commit-on-main path ${lib.getExe checkHash}
      if status == 1 for 64 cycles then alert
      if status == 2 for 3 cycles then alert

    check program is-system-running path ${pkgs.systemd}/bin/systemctl is-system-running
      if status != 0 then alert

    check program check-deploy-age path ${lib.getExe checkDeployAge}
      if status == 1 then alert
  '';

  systemd = {
    # ships the service.d drop-in that wires OnFailure= to the template below
    packages = [ systemdTopLevelOverride ];

    # Template unit: the instance name (%i) is handed to the alert script as
    # the name of the unit that failed.
    # NOTE(review): the service.d drop-in presumably applies to instances of
    # this template as well, so a failing alert mail could itself trigger a
    # further alert unit — confirm whether an empty same-named drop-in for
    # service-failed-email@.service is wanted.
    services."service-failed-email@" = {
      path = [ pkgs.bash pkgs.postfix ];
      serviceConfig = {
        Type = "oneshot";
        # DynamicUser = true; # TODO: figure out how to make postfix accept emails if this is set
        ExecStart = "${unitFailedAlertScript} %i";
      };
    };
  };
}
|