# Source path: nixfiles/hosts/monitoring-3/rules.yml
# Captured from the repository web view (listed there as 52 lines, 1.9 KiB, YAML).

# Prometheus alerting rules: a single rule group named "alert.rules".
# Each rule fires once `expr` has returned results continuously for the `for`
# duration (immediately on evaluation when `for` is omitted).
groups:
  - name: alert.rules
    rules:
      # A node-exporter scrape target has reported `up == 0` for 5 minutes.
      - alert: InstanceDown
        expr: up{job="node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} down"
          description: "{{ $labels.instance }} is unreachable"

      # `nixos_current_system_is_sync` has been 0 for 2 hours. Per the
      # annotation text, the running system hash differs from the one built
      # by hydra for the current config.
      - alert: InstanceConfigNotSync
        expr: nixos_current_system_is_sync{} == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Current system of {{ $labels.instance }} not in sync with config"
          description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"

      # Less than 5% of a filesystem's bytes have been available for 30 minutes.
      # NOTE(review): despite the alert name, the selector matches every
      # filesystem series of the node-exporter job, not only backup storage --
      # confirm whether a mountpoint/instance filter is intended.
      - alert: BackupStorageFull
        expr: ((node_filesystem_avail_bytes{job="node-exporter"} / node_filesystem_size_bytes{job="node-exporter"}) * 100) < 5
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Storage of {{ $labels.instance }} is full"
          description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full"

      # The reported boot time is at most 300 seconds in the past. There is no
      # `for` clause, so the alert fires on the first evaluation that matches.
      - alert: InstanceJustBooted
        expr: time() - node_boot_time_seconds{job="node-exporter"} <= 300
        labels:
          severity: warning
        annotations:
          summary: "Uptime of {{ $labels.instance }} less than 5 min"
          description: "{{ $labels.instance }} just booted"

      # A node-exporter target labelled service_level="event" has reported
      # `up == 0` for 2 hours.
      # NOTE(review): the `mute: instance` / `severity: muted` labels are
      # presumably consumed by Alertmanager routing or inhibition rules defined
      # elsewhere -- confirm against the Alertmanager config.
      - alert: EventInstanceDown
        expr: up{job="node-exporter", service_level="event"} == 0
        for: 2h
        labels:
          mute: instance
          severity: muted
        annotations:
          summary: "Event instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 2 hours."

      # `nixos_current_system_kernel_is_booted_system_kernel` has been 0 for
      # 2 hours. Per the annotation text, the booted kernel is not the target
      # kernel and a reboot may be required.
      - alert: KernelChanged
        expr: nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"} == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Kernel of {{ $labels.instance }} changed"
          description: "The Kernel {{ $labels.instance }} booted with, isn't the target Kernel. A reboot may be required."