Introduce service levels and change alert routing based on this
This commit is contained in:
parent
30e22dff8d
commit
cfd746fddb
@ -59,6 +59,7 @@
|
||||
enable = true;
|
||||
id = "212";
|
||||
pubkey = "P1ONelxezvkcLJFyvuCVeIUd3uewPIlONfKk9y6h9QE=";
|
||||
serviceLevel = "event";
|
||||
};
|
||||
|
||||
system.stateVersion = "22.11";
|
||||
|
@ -61,6 +61,7 @@
|
||||
enable = true;
|
||||
id = "213";
|
||||
pubkey = "hMIr7fgfZwSjNufRaMtq+7MDxfwN3XLJ4ZlmSOoFrz4=";
|
||||
# Assign this host to the "event" service level instead of the option's default.
serviceLevel = "event";
|
||||
};
|
||||
|
||||
system.stateVersion = "22.11";
|
||||
|
@ -20,7 +20,7 @@ let
|
||||
monitoringHosts;
|
||||
|
||||
monitoringTargets = mapAttrsToList (name: host:
|
||||
"${host.config.networking.hostName}.mon.clerie.de:9100")
|
||||
"${host.config.networking.hostName}.mon.clerie.de:9100;${attrByPath ["clerie" "monitoring" "serviceLevel"] "infra" host.config}")
|
||||
monitoringHosts;
|
||||
|
||||
nixosMonitoringTargets = mapAttrsToList (name: host:
|
||||
@ -106,6 +106,12 @@ in {
|
||||
repeat_interval = "4h";
|
||||
group_by = [ "instance" ];
|
||||
routes = [
|
||||
{
|
||||
matchers = [
|
||||
''severity = "muted"''
|
||||
];
|
||||
receiver = "muted";
|
||||
}
|
||||
{
|
||||
receiver = "xmpp-receiver";
|
||||
matchers = [
|
||||
@ -116,6 +122,9 @@ in {
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "muted";
|
||||
}
|
||||
{
|
||||
name = "xmpp-receiver";
|
||||
webhook_configs = [
|
||||
@ -125,6 +134,20 @@ in {
|
||||
];
|
||||
}
|
||||
];
|
||||
inhibit_rules = [
|
||||
{
|
||||
# Mute all alerts for an instance that also has an alert carrying the label mute = "instance"
|
||||
target_matchers = [
|
||||
''alertname =~ ".+"''
|
||||
];
|
||||
source_matchers = [
|
||||
''mute = "instance"''
|
||||
];
|
||||
equal = [
|
||||
"instance"
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
@ -160,12 +183,24 @@ in {
|
||||
static_configs = [
|
||||
{
|
||||
targets = [
|
||||
"monitoring-3.mon.clerie.de:9100"
|
||||
"monitoring-3.mon.clerie.de:9100;infra"
|
||||
]
|
||||
++ monitoringTargets;
|
||||
}
|
||||
];
|
||||
relabel_configs = [
|
||||
{
|
||||
source_labels = [ "__address__" ];
|
||||
regex = "(.+);(.+)";
|
||||
target_label = "service_level";
|
||||
replacement = "\${2}";
|
||||
}
|
||||
{
|
||||
source_labels = [ "__address__" ];
|
||||
regex = "(.+);(.+)";
|
||||
target_label = "__address__";
|
||||
replacement = "\${1}";
|
||||
}
|
||||
relabelAddressToInstance
|
||||
];
|
||||
}
|
||||
|
@ -31,3 +31,12 @@ groups:
|
||||
annotations:
|
||||
summary: "Uptime of {{ $labels.instance }} less than 5 min"
|
||||
description: "{{ $labels.instance }} just booted"
|
||||
# Fires once a node-exporter target labelled service_level="event" has reported up == 0 for 2h.
# The rule only attaches the labels mute=instance and severity=muted; what happens to alerts
# carrying them is decided by the Alertmanager configuration, not by this rule.
- alert: EventInstanceDown
|
||||
expr: up{job="node-exporter", service_level="event"} == 0
|
||||
for: 2h
|
||||
labels:
|
||||
mute: instance
|
||||
severity: muted
|
||||
annotations:
|
||||
summary: "Event instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} has been down for more than 2 hours."
|
||||
|
@ -21,6 +21,11 @@ in
|
||||
type = types.str;
|
||||
description = "Public Key of the monitoring wireguard interface of this host";
|
||||
};
|
||||
# Per-host service level. Declared as a plain string (types.str), so values are not
# restricted to a fixed set; hosts that do not set it get "infra".
serviceLevel = mkOption {
|
||||
type = types.str;
|
||||
default = "infra";
|
||||
description = "Service level this instance is assigned to";
|
||||
};
|
||||
bird = mkEnableOption "Monitor bird";
|
||||
blackbox = mkEnableOption "Monitor blackbox";
|
||||
nixos = mkOption {
|
||||
|
Loading…
Reference in New Issue
Block a user