# Prometheus alerting rules for hosts scraped by the "node-exporter" job.
groups:
  - name: alert.rules
    rules:
      # A node-exporter target has reported up == 0 continuously for 5 minutes.
      - alert: InstanceDown
        expr: up{job="node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} down"
          # Duration in the text is kept in sync with the `for` clause above.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Time since boot is at least 7776000 seconds (90 * 86400 = 90 days).
      # No `for` clause: fires as soon as the expression is true.
      - alert: InstanceUptime
        expr: time() - node_boot_time_seconds{job="node-exporter"} >= 7776000
        labels:
          severity: warning
        annotations:
          summary: "Uptime of {{ $labels.instance }} more than 90 days"
          description: "The last reboot of {{ $labels.instance }} was more than 90 days ago"

      # Available space on the /mnt/clerie-backup filesystem of the backup host
      # has been below 5 % of the filesystem size for 30 minutes.
      # The folded scalar (>-) joins the lines below with single spaces, so the
      # resulting PromQL expression is one line.
      - alert: BackupStorageFull
        expr: >-
          ((node_filesystem_avail_bytes{instance="clerie-backup.mon.clerie.de:9100",
          job="node-exporter", mountpoint="/mnt/clerie-backup"}
          / node_filesystem_size_bytes{instance="clerie-backup.mon.clerie.de:9100",
          job="node-exporter", mountpoint="/mnt/clerie-backup"})
          * 100) < 5
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Storage for backup is nearly full"
          description: "Storage for backups is nearly full"

      # Time since boot is at most 300 seconds (5 minutes).
      # No `for` clause: fires as soon as the expression is true.
      - alert: InstanceJustBooted
        expr: time() - node_boot_time_seconds{job="node-exporter"} <= 300
        labels:
          severity: warning
        annotations:
          summary: "Uptime of {{ $labels.instance }} less than 5 min"
          description: "{{ $labels.instance }} just booted"

      # A node-exporter target labelled service_level="event" has reported
      # up == 0 continuously for 2 hours.
      # NOTE(review): the `mute` / `severity: muted` labels are presumably
      # consumed by Alertmanager routing or inhibition rules defined
      # elsewhere - confirm.
      - alert: EventInstanceDown
        expr: up{job="node-exporter", service_level="event"} == 0
        for: 2h
        labels:
          mute: instance
          severity: muted
        annotations:
          summary: "Event instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 2 hours."