---
# Prometheus alerting rules.
# Every rule lives in the single group "alert.rules". The `severity` label
# (and `mute`, where present) is attached to the fired alert; how those labels
# are routed or silenced is decided outside this file.
groups:
  - name: alert.rules
    rules:
      # A node-exporter scrape target has reported `up == 0` for 5 minutes.
      - alert: InstanceDown
        expr: up{job="node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} down"
          # Duration in the text is kept in step with `for:` above.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # nixos_current_system_is_sync has been 0 for 2 hours.
      - alert: InstanceConfigNotSync
        expr: nixos_current_system_is_sync{} == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Current system of {{ $labels.instance }} not in sync with config"
          description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"

      # Less than 5 % of /mnt/clerie-backup on the backup host has been
      # available for 30 minutes.
      - alert: BackupStorageFull
        expr: ((node_filesystem_avail_bytes{instance="clerie-backup.net.clerie.de", job="node-exporter", mountpoint="/mnt/clerie-backup"} / node_filesystem_size_bytes{instance="clerie-backup.net.clerie.de", job="node-exporter", mountpoint="/mnt/clerie-backup"}) * 100) < 5
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Storage for backup is nearly full"
          description: "Storage for backups is nearly full"

      # Boot time is at most 300 seconds in the past. There is no `for:`
      # clause, so the rule has no additional hold time.
      - alert: InstanceJustBooted
        expr: time() - node_boot_time_seconds{job="node-exporter"} <= 300
        labels:
          severity: warning
        annotations:
          summary: "Uptime of {{ $labels.instance }} less than 5 min"
          description: "{{ $labels.instance }} just booted"

      # Same check as InstanceDown, restricted to targets carrying
      # service_level="event", with a 2 hour hold time.
      # NOTE(review): `mute: instance` / `severity: muted` are presumably
      # consumed by Alertmanager routing or inhibition; confirm against that
      # configuration.
      - alert: EventInstanceDown
        expr: up{job="node-exporter", service_level="event"} == 0
        for: 2h
        labels:
          mute: instance
          severity: muted
        annotations:
          summary: "Event instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 2 hours."

      # nixos_current_system_kernel_is_booted_system_kernel has been 0 for
      # 2 hours.
      - alert: KernelChanged
        expr: nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"} == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Kernel of {{ $labels.instance }} changed"
          description: "The Kernel {{ $labels.instance }} booted with, isn't the target Kernel. A reboot may be required."