hosts/monitoring-3: Make alerting rules more resilient against missing scrapes
This commit is contained in:
parent
ad137204c3
commit
5270f493b8
@ -10,7 +10,7 @@ groups:
|
|||||||
summary: "Endpoint {{ $labels.instance }} down"
|
summary: "Endpoint {{ $labels.instance }} down"
|
||||||
description: "{{ $labels.instance }} is unreachable"
|
description: "{{ $labels.instance }} is unreachable"
|
||||||
- alert: InstanceConfigNotSync
|
- alert: InstanceConfigNotSync
|
||||||
expr: nixos_current_system_is_sync{} == 0
|
expr: last_over_time(nixos_current_system_is_sync{}[5m]) == 0
|
||||||
for: 2h
|
for: 2h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -18,7 +18,7 @@ groups:
|
|||||||
summary: "Current system of {{ $labels.instance }} not in sync with config"
|
summary: "Current system of {{ $labels.instance }} not in sync with config"
|
||||||
description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"
|
description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"
|
||||||
- alert: BackupStorageFull
|
- alert: BackupStorageFull
|
||||||
expr: ((avg_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m]) / avg_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m])) * 100) < 5
|
expr: ((last_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m]) / last_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m])) * 100) < 5
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -26,7 +26,7 @@ groups:
|
|||||||
summary: "Storage of {{ $labels.instance }} is full"
|
summary: "Storage of {{ $labels.instance }} is full"
|
||||||
description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full"
|
description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full"
|
||||||
- alert: ClerieBackupJobLastSuccessfulRunBehind
|
- alert: ClerieBackupJobLastSuccessfulRunBehind
|
||||||
expr: time() - clerie_backup_last_successful_run_time{} >= 9000
|
expr: time() - last_over_time(clerie_backup_last_successful_run_time{}[5m]) >= 9000
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -34,7 +34,7 @@ groups:
|
|||||||
summary: "Last successful backup on {{ $labels.instance }} older than 1h"
|
summary: "Last successful backup on {{ $labels.instance }} older than 1h"
|
||||||
description: "Backup job {{ $labels.backup_job }} to target {{ $labels.backup_instance }} finished successfully over an hour ago"
|
description: "Backup job {{ $labels.backup_job }} to target {{ $labels.backup_instance }} finished successfully over an hour ago"
|
||||||
- alert: InstanceJustBooted
|
- alert: InstanceJustBooted
|
||||||
expr: time() - node_boot_time_seconds{job="node-exporter"} <= 300
|
expr: time() - last_over_time(node_boot_time_seconds{job="node-exporter"}[5m]) <= 300
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
@ -50,7 +50,7 @@ groups:
|
|||||||
summary: "Event instance {{ $labels.instance }} down"
|
summary: "Event instance {{ $labels.instance }} down"
|
||||||
description: "{{ $labels.instance }} has been down for more than 2 hours."
|
description: "{{ $labels.instance }} has been down for more than 2 hours."
|
||||||
- alert: KernelChanged
|
- alert: KernelChanged
|
||||||
expr: avg_over_time(nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"}[5m]) == 0
|
expr: last_over_time(nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"}[5m]) == 0
|
||||||
for: 2h
|
for: 2h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
Loading…
Reference in New Issue
Block a user