---
# Prometheus alerting rules.
#
# Many expressions wrap the metric in `last_over_time(<metric>[<range>])`, which
# evaluates against the most recent sample inside the range instead of only
# the sample at evaluation time.
groups:
  - name: alert.rules
    rules:
      # A node-exporter target has reported `up == 0` for 5 minutes.
      - alert: InstanceDown
        expr: up{job="node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} down"
          description: "{{ $labels.instance }} is unreachable"

      # `nixos_current_system_is_sync` has been 0 for 2 hours.
      - alert: InstanceConfigNotSync
        expr: last_over_time(nixos_current_system_is_sync{}[5m]) == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Current system of {{ $labels.instance }} not in sync with config"
          description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"

      # Less than 5 % of a filesystem is available for 30 minutes.
      # The /nix/store mountpoint is excluded.
      - alert: StorageFull
        expr: ((last_over_time(node_filesystem_avail_bytes{job="node-exporter", mountpoint!="/nix/store"}[5m]) / last_over_time(node_filesystem_size_bytes{job="node-exporter", mountpoint!="/nix/store"}[5m])) * 100) < 5
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Storage of {{ $labels.instance }} is full"
          description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full"

      # Less than 10 % of a filesystem is available for 30 minutes.
      # The /nix/store mountpoint is excluded.
      # NOTE(review): the condition has no lower bound, so it also holds
      # whenever StorageFull (< 5) holds.
      - alert: StorageAlmostFull
        expr: ((last_over_time(node_filesystem_avail_bytes{job="node-exporter", mountpoint!="/nix/store"}[5m]) / last_over_time(node_filesystem_size_bytes{job="node-exporter", mountpoint!="/nix/store"}[5m])) * 100) < 10
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Storage of {{ $labels.instance }} is almost full"
          description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is almost full"

      # The last successful backup run is at least 9000 s (2.5 h) old.
      # The annotations state the same 2.5 h so they agree with the threshold.
      - alert: ClerieBackupJobLastSuccessfulRunBehind
        expr: time() - last_over_time(clerie_backup_last_successful_run_time{}[5m]) >= 9000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Last successful backup on {{ $labels.instance }} older than 2.5h"
          description: "Backup job {{ $labels.backup_job }} to target {{ $labels.backup_instance }} finished successfully over 2.5 hours ago"

      # Boot time is at most 300 s (5 min) in the past.
      # There is no `for:` clause, so the rule has no pending period.
      - alert: InstanceJustBooted
        expr: time() - last_over_time(node_boot_time_seconds{job="node-exporter"}[5m]) <= 300
        labels:
          severity: warning
        annotations:
          summary: "Uptime of {{ $labels.instance }} less than 5 min"
          description: "{{ $labels.instance }} just booted"

      # A node-exporter target labelled service_level="event" has been down
      # for 2 hours. The alert carries `mute: instance` and `severity: muted`.
      # NOTE(review): how these two labels are consumed is defined outside
      # this file (presumably Alertmanager routing/inhibition) - confirm.
      - alert: EventInstanceDown
        expr: up{job="node-exporter", service_level="event"} == 0
        for: 2h
        labels:
          mute: instance
          severity: muted
        annotations:
          summary: "Event instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 2 hours."

      # `nixos_current_system_kernel_is_booted_system_kernel` has been 0 for
      # 2 hours.
      - alert: KernelChanged
        expr: last_over_time(nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"}[5m]) == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Kernel of {{ $labels.instance }} changed"
          description: "The Kernel {{ $labels.instance }} booted with, isn't the target Kernel. A reboot may be required."

      # The blackbox probe of matrix.entr0py.de, run from
      # monitoring-3.net.clerie.de, has been failing for 5 minutes.
      - alert: SynapseUnavailable
        expr: last_over_time(probe_success{instance="monitoring-3.net.clerie.de", job="blackbox_local_synapse", target="matrix.entr0py.de"}[5m]) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Synapse of {{ $labels.target }} unavailable"
          description: "The Synapse backend of {{ $labels.target }} is unreachable or returns garbage"

      # A GPG key expires in less than 1209600 s (14 days).
      - alert: ClerieKeysExpire
        expr: last_over_time(clerie_keys_gpg_key_expire_time[15m]) - time() < 1209600
        labels:
          severity: critical
        annotations:
          summary: "GPG {{ $labels.fingerprint }} is expiring"
          description: "GPG with fingerprint {{ $labels.fingerprint }} is expiring in less than two weeks"

      # A GPG key expires in less than 3628800 s (42 days).
      # NOTE(review): the condition also holds whenever ClerieKeysExpire holds.
      - alert: ClerieKeysAlmostExpire
        expr: last_over_time(clerie_keys_gpg_key_expire_time[15m]) - time() < 3628800
        labels:
          severity: warning
        annotations:
          summary: "GPG {{ $labels.fingerprint }} is expiring soon"
          description: "GPG with fingerprint {{ $labels.fingerprint }} is expiring in less than six weeks"

      # The IPv6 and IPv4 HTTP probes of blog.nadja.top have disagreed for
      # 5 minutes.
      # NOTE(review): `!=` matches a mismatch in either direction, so this also
      # fires when IPv6 fails while IPv4 succeeds; the annotations only
      # describe the IPv4-down case.
      - alert: NadjaTopIPv4ProxyBroken
        expr: probe_success{job="blackbox_local_http6", target="blog.nadja.top"} != on (target) probe_success{job="blackbox_local_http4", target="blog.nadja.top"}
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "blog.nadja.top unreachable via IPv4"
          description: "blog.nadja.top unreachable via IPv4, but reachable via IPv6"

      # The failed-notification counter increased within the last 5 minutes.
      # Any non-zero rate matches, and there is no `for:` clause.
      - alert: AlertmanagerNotificationRequestsFailed
        expr: rate(alertmanager_notification_requests_failed_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Too many notification requests failed"
          description: "Too many notification requests to Alertmanager integration {{ $labels.integration }} failed"