From 3b7f59a66e85624ef5a8405d88a5f72a606b823d Mon Sep 17 00:00:00 2001
From: clerie
Date: Tue, 21 Jan 2025 17:18:41 +0100
Subject: [PATCH] hosts/monitoring-3: Warn if storages are almost full

---
Reviewer notes (this area is not part of the commit message):
- StorageFull (renamed from BackupStorageFull) fires when available space is
  below 5% of the filesystem size; the new StorageAlmostFull alert uses the
  same expression with a 10% threshold and severity "warning".
- The added inhibit rule lists StorageAlmostFull as target and StorageFull as
  source, matched on equal "instance" and "mountpoint" labels.
- NOTE(review): leading whitespace of the hunk lines below was reconstructed
  from a whitespace-collapsed copy; confirm it against the target files
  before applying.

 hosts/monitoring-3/alertmanager.nix | 12 ++++++++++++
 hosts/monitoring-3/rules.yml        | 10 +++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/hosts/monitoring-3/alertmanager.nix b/hosts/monitoring-3/alertmanager.nix
index 7913392..86472a8 100644
--- a/hosts/monitoring-3/alertmanager.nix
+++ b/hosts/monitoring-3/alertmanager.nix
@@ -63,6 +63,18 @@
             "instance"
           ];
         }
+        {
+          target_matchers = [
+            ''alertname = "StorageAlmostFull"''
+          ];
+          source_matchers = [
+            ''alertname = "StorageFull"''
+          ];
+          equal = [
+            "instance"
+            "mountpoint"
+          ];
+        }
       ];
     };
   };
diff --git a/hosts/monitoring-3/rules.yml b/hosts/monitoring-3/rules.yml
index a895ddb..9a4a160 100644
--- a/hosts/monitoring-3/rules.yml
+++ b/hosts/monitoring-3/rules.yml
@@ -17,7 +17,7 @@ groups:
         annotations:
           summary: "Current system of {{ $labels.instance }} not in sync with config"
           description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"
-      - alert: BackupStorageFull
+      - alert: StorageFull
         expr: ((last_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m]) / last_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m])) * 100) < 5
         for: 30m
         labels:
@@ -25,6 +25,14 @@ groups:
         annotations:
           summary: "Storage of {{ $labels.instance }} is full"
           description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full"
+      - alert: StorageAlmostFull
+        expr: ((last_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m]) / last_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m])) * 100) < 10
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Storage of {{ $labels.instance }} is almost full"
+          description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is almost full"
       - alert: ClerieBackupJobLastSuccessfulRunBehind
         expr: time() - last_over_time(clerie_backup_last_successful_run_time{}[5m]) >= 9000
         for: 5m