1
0

Introduce service levels and change alert routing based on them

This commit is contained in:
clerie 2023-01-05 23:16:50 +01:00
parent 30e22dff8d
commit cfd746fddb
5 changed files with 53 additions and 2 deletions

View File

@ -59,6 +59,7 @@
enable = true; enable = true;
id = "212"; id = "212";
pubkey = "P1ONelxezvkcLJFyvuCVeIUd3uewPIlONfKk9y6h9QE="; pubkey = "P1ONelxezvkcLJFyvuCVeIUd3uewPIlONfKk9y6h9QE=";
serviceLevel = "event";
}; };
system.stateVersion = "22.11"; system.stateVersion = "22.11";

View File

@ -61,6 +61,7 @@
enable = true; enable = true;
id = "213"; id = "213";
pubkey = "hMIr7fgfZwSjNufRaMtq+7MDxfwN3XLJ4ZlmSOoFrz4="; pubkey = "hMIr7fgfZwSjNufRaMtq+7MDxfwN3XLJ4ZlmSOoFrz4=";
serviceLevel ="event";
}; };
system.stateVersion = "22.11"; system.stateVersion = "22.11";

View File

@ -20,7 +20,7 @@ let
monitoringHosts; monitoringHosts;
monitoringTargets = mapAttrsToList (name: host: monitoringTargets = mapAttrsToList (name: host:
"${host.config.networking.hostName}.mon.clerie.de:9100") "${host.config.networking.hostName}.mon.clerie.de:9100;${attrByPath ["clerie" "monitoring" "serviceLevel"] "infra" host.config}")
monitoringHosts; monitoringHosts;
nixosMonitoringTargets = mapAttrsToList (name: host: nixosMonitoringTargets = mapAttrsToList (name: host:
@ -106,6 +106,12 @@ in {
repeat_interval = "4h"; repeat_interval = "4h";
group_by = [ "instance" ]; group_by = [ "instance" ];
routes = [ routes = [
{
matchers = [
''severity = "muted"''
];
receiver = "muted";
}
{ {
receiver = "xmpp-receiver"; receiver = "xmpp-receiver";
matchers = [ matchers = [
@ -116,6 +122,9 @@ in {
]; ];
}; };
receivers = [ receivers = [
{
name = "muted";
}
{ {
name = "xmpp-receiver"; name = "xmpp-receiver";
webhook_configs = [ webhook_configs = [
@ -125,6 +134,20 @@ in {
]; ];
} }
]; ];
inhibit_rules = [
{
# Mute all alerts for an instance that also has a firing alert labelled mute="instance"
target_matchers = [
''alertname =~ ".+"''
];
source_matchers = [
''mute = "instance"''
];
equal = [
"instance"
];
}
];
}; };
}; };
@ -160,12 +183,24 @@ in {
static_configs = [ static_configs = [
{ {
targets = [ targets = [
"monitoring-3.mon.clerie.de:9100" "monitoring-3.mon.clerie.de:9100;infra"
] ]
++ monitoringTargets; ++ monitoringTargets;
} }
]; ];
relabel_configs = [ relabel_configs = [
{
source_labels = [ "__address__" ];
regex = "(.+);(.+)";
target_label = "service_level";
replacement = "\${2}";
}
{
source_labels = [ "__address__" ];
regex = "(.+);(.+)";
target_label = "__address__";
replacement = "\${1}";
}
relabelAddressToInstance relabelAddressToInstance
]; ];
} }

View File

@ -31,3 +31,12 @@ groups:
annotations: annotations:
summary: "Uptime of {{ $labels.instance }} less than 5 min" summary: "Uptime of {{ $labels.instance }} less than 5 min"
description: "{{ $labels.instance }} just booted" description: "{{ $labels.instance }} just booted"
- alert: EventInstanceDown
expr: up{job="node-exporter", service_level="event"} == 0
for: 2h
labels:
mute: instance
severity: muted
annotations:
summary: "Event instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} has been down for more than 2 hours."

View File

@ -21,6 +21,11 @@ in
type = types.str; type = types.str;
description = "Public Key of the monitoring wireguard interface of this host"; description = "Public Key of the monitoring wireguard interface of this host";
}; };
serviceLevel = mkOption {
type = types.str;
default = "infra";
description = "Service level this instance is assigned to";
};
bird = mkEnableOption "Monitor bird"; bird = mkEnableOption "Monitor bird";
blackbox = mkEnableOption "Monitor blackbox"; blackbox = mkEnableOption "Monitor blackbox";
nixos = mkOption { nixos = mkOption {