1
0

Introduce service levels and route alerts based on them

This commit is contained in:
clerie 2023-01-05 23:16:50 +01:00
parent 30e22dff8d
commit cfd746fddb
5 changed files with 53 additions and 2 deletions

View File

@ -59,6 +59,7 @@
enable = true;
id = "212";
pubkey = "P1ONelxezvkcLJFyvuCVeIUd3uewPIlONfKk9y6h9QE=";
serviceLevel = "event";
};
system.stateVersion = "22.11";

View File

@ -61,6 +61,7 @@
enable = true;
id = "213";
pubkey = "hMIr7fgfZwSjNufRaMtq+7MDxfwN3XLJ4ZlmSOoFrz4=";
# Explicit service level for this host (overrides the option's default).
serviceLevel = "event";
};
system.stateVersion = "22.11";

View File

@ -20,7 +20,7 @@ let
monitoringHosts;
monitoringTargets = mapAttrsToList (name: host:
"${host.config.networking.hostName}.mon.clerie.de:9100")
"${host.config.networking.hostName}.mon.clerie.de:9100;${attrByPath ["clerie" "monitoring" "serviceLevel"] "infra" host.config}")
monitoringHosts;
nixosMonitoringTargets = mapAttrsToList (name: host:
@ -106,6 +106,12 @@ in {
repeat_interval = "4h";
group_by = [ "instance" ];
routes = [
{
matchers = [
''severity = "muted"''
];
receiver = "muted";
}
{
receiver = "xmpp-receiver";
matchers = [
@ -116,6 +122,9 @@ in {
];
};
receivers = [
{
name = "muted";
}
{
name = "xmpp-receiver";
webhook_configs = [
@ -125,6 +134,20 @@ in {
];
}
];
inhibit_rules = [
{
# Mute all alerts for an instance that also has a firing alert carrying the label mute="instance"
target_matchers = [
''alertname =~ ".+"''
];
source_matchers = [
''mute = "instance"''
];
equal = [
"instance"
];
}
];
};
};
@ -160,12 +183,24 @@ in {
static_configs = [
{
targets = [
"monitoring-3.mon.clerie.de:9100"
"monitoring-3.mon.clerie.de:9100;infra"
]
++ monitoringTargets;
}
];
relabel_configs = [
{
source_labels = [ "__address__" ];
regex = "(.+);(.+)";
target_label = "service_level";
replacement = "\${2}";
}
{
source_labels = [ "__address__" ];
regex = "(.+);(.+)";
target_label = "__address__";
replacement = "\${1}";
}
relabelAddressToInstance
];
}

View File

@ -31,3 +31,12 @@ groups:
annotations:
summary: "Uptime of {{ $labels.instance }} less than 5 min"
description: "{{ $labels.instance }} just booted"
# Fires when a node-exporter target labelled service_level="event" has
# reported up == 0 continuously for 2 hours.
- alert: EventInstanceDown
expr: up{job="node-exporter", service_level="event"} == 0
for: 2h
labels:
# Label value matched by the Alertmanager inhibit rule's source_matchers.
mute: instance
# Severity value matched by the Alertmanager route that uses the "muted" receiver.
severity: muted
annotations:
summary: "Event instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} has been down for more than 2 hours."

View File

@ -21,6 +21,11 @@ in
type = types.str;
description = "Public Key of the monitoring wireguard interface of this host";
};
# Service level assigned to this host. Free-form string; hosts that do not
# set it get "infra".
serviceLevel = mkOption {
type = types.str;
default = "infra";
description = "Service level this instance is assigned to";
};
bird = mkEnableOption "Monitor bird";
blackbox = mkEnableOption "Monitor blackbox";
nixos = mkOption {