Introduce service levels and change alert routing based on this
This commit is contained in:
parent
30e22dff8d
commit
cfd746fddb
@ -59,6 +59,7 @@
|
|||||||
enable = true;
|
enable = true;
|
||||||
id = "212";
|
id = "212";
|
||||||
pubkey = "P1ONelxezvkcLJFyvuCVeIUd3uewPIlONfKk9y6h9QE=";
|
pubkey = "P1ONelxezvkcLJFyvuCVeIUd3uewPIlONfKk9y6h9QE=";
|
||||||
|
serviceLevel = "event";
|
||||||
};
|
};
|
||||||
|
|
||||||
system.stateVersion = "22.11";
|
system.stateVersion = "22.11";
|
||||||
|
@ -61,6 +61,7 @@
|
|||||||
enable = true;
|
enable = true;
|
||||||
id = "213";
|
id = "213";
|
||||||
pubkey = "hMIr7fgfZwSjNufRaMtq+7MDxfwN3XLJ4ZlmSOoFrz4=";
|
pubkey = "hMIr7fgfZwSjNufRaMtq+7MDxfwN3XLJ4ZlmSOoFrz4=";
|
||||||
|
serviceLevel = "event";
|
||||||
};
|
};
|
||||||
|
|
||||||
system.stateVersion = "22.11";
|
system.stateVersion = "22.11";
|
||||||
|
@ -20,7 +20,7 @@ let
|
|||||||
monitoringHosts;
|
monitoringHosts;
|
||||||
|
|
||||||
monitoringTargets = mapAttrsToList (name: host:
|
monitoringTargets = mapAttrsToList (name: host:
|
||||||
"${host.config.networking.hostName}.mon.clerie.de:9100")
|
"${host.config.networking.hostName}.mon.clerie.de:9100;${attrByPath ["clerie" "monitoring" "serviceLevel"] "infra" host.config}")
|
||||||
monitoringHosts;
|
monitoringHosts;
|
||||||
|
|
||||||
nixosMonitoringTargets = mapAttrsToList (name: host:
|
nixosMonitoringTargets = mapAttrsToList (name: host:
|
||||||
@ -106,6 +106,12 @@ in {
|
|||||||
repeat_interval = "4h";
|
repeat_interval = "4h";
|
||||||
group_by = [ "instance" ];
|
group_by = [ "instance" ];
|
||||||
routes = [
|
routes = [
|
||||||
|
{
|
||||||
|
matchers = [
|
||||||
|
''severity = "muted"''
|
||||||
|
];
|
||||||
|
receiver = "muted";
|
||||||
|
}
|
||||||
{
|
{
|
||||||
receiver = "xmpp-receiver";
|
receiver = "xmpp-receiver";
|
||||||
matchers = [
|
matchers = [
|
||||||
@ -116,6 +122,9 @@ in {
|
|||||||
];
|
];
|
||||||
};
|
};
|
||||||
receivers = [
|
receivers = [
|
||||||
|
{
|
||||||
|
name = "muted";
|
||||||
|
}
|
||||||
{
|
{
|
||||||
name = "xmpp-receiver";
|
name = "xmpp-receiver";
|
||||||
webhook_configs = [
|
webhook_configs = [
|
||||||
@ -125,6 +134,20 @@ in {
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
inhibit_rules = [
|
||||||
|
{
|
||||||
|
# Mute all alerts for an instance that also has a firing alert carrying the label mute="instance"
|
||||||
|
target_matchers = [
|
||||||
|
''alertname =~ ".+"''
|
||||||
|
];
|
||||||
|
source_matchers = [
|
||||||
|
''mute = "instance"''
|
||||||
|
];
|
||||||
|
equal = [
|
||||||
|
"instance"
|
||||||
|
];
|
||||||
|
}
|
||||||
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -160,12 +183,24 @@ in {
|
|||||||
static_configs = [
|
static_configs = [
|
||||||
{
|
{
|
||||||
targets = [
|
targets = [
|
||||||
"monitoring-3.mon.clerie.de:9100"
|
"monitoring-3.mon.clerie.de:9100;infra"
|
||||||
]
|
]
|
||||||
++ monitoringTargets;
|
++ monitoringTargets;
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
relabel_configs = [
|
relabel_configs = [
|
||||||
|
{
|
||||||
|
source_labels = [ "__address__" ];
|
||||||
|
regex = "(.+);(.+)";
|
||||||
|
target_label = "service_level";
|
||||||
|
replacement = "\${2}";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
source_labels = [ "__address__" ];
|
||||||
|
regex = "(.+);(.+)";
|
||||||
|
target_label = "__address__";
|
||||||
|
replacement = "\${1}";
|
||||||
|
}
|
||||||
relabelAddressToInstance
|
relabelAddressToInstance
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
@ -31,3 +31,12 @@ groups:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Uptime of {{ $labels.instance }} less than 5 min"
|
summary: "Uptime of {{ $labels.instance }} less than 5 min"
|
||||||
description: "{{ $labels.instance }} just booted"
|
description: "{{ $labels.instance }} just booted"
|
||||||
|
- alert: EventInstanceDown
|
||||||
|
expr: up{job="node-exporter", service_level="event"} == 0
|
||||||
|
for: 2h
|
||||||
|
labels:
|
||||||
|
mute: instance
|
||||||
|
severity: muted
|
||||||
|
annotations:
|
||||||
|
summary: "Event instance {{ $labels.instance }} down"
|
||||||
|
description: "{{ $labels.instance }} has been down for more than 2 hours."
|
||||||
|
@ -21,6 +21,11 @@ in
|
|||||||
type = types.str;
|
type = types.str;
|
||||||
description = "Public Key of the monitoring wireguard interface of this host";
|
description = "Public Key of the monitoring wireguard interface of this host";
|
||||||
};
|
};
|
||||||
|
serviceLevel = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = "infra";
|
||||||
|
description = "Service level this instance is assigned to";
|
||||||
|
};
|
||||||
bird = mkEnableOption "Monitor bird";
|
bird = mkEnableOption "Monitor bird";
|
||||||
blackbox = mkEnableOption "Monitor blackbox";
|
blackbox = mkEnableOption "Monitor blackbox";
|
||||||
nixos = mkOption {
|
nixos = mkOption {
|
||||||
|
Loading…
Reference in New Issue
Block a user