From 5270f493b84a679e3734ad07070bb8e0a117ab9f Mon Sep 17 00:00:00 2001 From: clerie Date: Sun, 28 May 2023 12:10:45 +0200 Subject: [PATCH] hosts/monitoring-3: Make alerting rules more resilient against missing scrapes --- hosts/monitoring-3/rules.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hosts/monitoring-3/rules.yml b/hosts/monitoring-3/rules.yml index 0e333ac..e0b3cd5 100644 --- a/hosts/monitoring-3/rules.yml +++ b/hosts/monitoring-3/rules.yml @@ -10,7 +10,7 @@ groups: summary: "Endpoint {{ $labels.instance }} down" description: "{{ $labels.instance }} is unreachable" - alert: InstanceConfigNotSync - expr: nixos_current_system_is_sync{} == 0 + expr: last_over_time(nixos_current_system_is_sync{}[5m]) == 0 for: 2h labels: severity: warning @@ -18,7 +18,7 @@ groups: summary: "Current system of {{ $labels.instance }} not in sync with config" description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config" - alert: BackupStorageFull - expr: ((avg_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m]) / avg_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m])) * 100) < 5 + expr: ((last_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m]) / last_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m])) * 100) < 5 for: 30m labels: severity: critical @@ -26,7 +26,7 @@ groups: summary: "Storage of {{ $labels.instance }} is full" description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full" - alert: ClerieBackupJobLastSuccessfulRunBehind - expr: time() - clerie_backup_last_successful_run_time{} >= 9000 + expr: time() - last_over_time(clerie_backup_last_successful_run_time{}[5m]) >= 9000 for: 5m labels: severity: warning @@ -34,7 +34,7 @@ groups: summary: "Last successful backup on {{ $labels.instance }} older than 1h" description: "Backup job {{ $labels.backup_job }} to target {{ 
$labels.backup_instance }} finished successfully over an hour ago" - alert: InstanceJustBooted - expr: time() - node_boot_time_seconds{job="node-exporter"} <= 300 + expr: time() - last_over_time(node_boot_time_seconds{job="node-exporter"}[5m]) <= 300 labels: severity: warning annotations: @@ -50,7 +50,7 @@ groups: summary: "Event instance {{ $labels.instance }} down" description: "{{ $labels.instance }} has been down for more than 2 hours." - alert: KernelChanged - expr: avg_over_time(nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"}[5m]) == 0 + expr: last_over_time(nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"}[5m]) == 0 for: 2h labels: severity: warning