# Prometheus alerting rules.
#
# Most expressions wrap the metric in last_over_time(...[5m]) so that the most
# recent sample within the last 5 minutes is used for the comparison.
# Long expressions use folded scalars (>-); line breaks inside them are joined
# with a single space, so the resulting PromQL is unchanged.
groups:
  - name: alert.rules
    rules:
      # node-exporter scrape target reports up == 0 for 5 minutes.
      - alert: InstanceDown
        expr: up{job="node-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} down"
          description: "{{ $labels.instance }} is unreachable"

      # nixos_current_system_is_sync has been 0 for 2 hours.
      - alert: InstanceConfigNotSync
        expr: last_over_time(nixos_current_system_is_sync{}[5m]) == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Current system of {{ $labels.instance }} not in sync with config"
          description: "The current system hash of {{ $labels.instance }} does not match the one generated by hydra based on the current config"

      # Less than 5% of the filesystem is available for 30 minutes.
      - alert: StorageFull
        expr: >-
          ((last_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m])
          / last_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m]))
          * 100) < 5
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Storage of {{ $labels.instance }} is full"
          description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is full"

      # Less than 10% of the filesystem is available for 30 minutes.
      # NOTE(review): this also matches filesystems below 5%, so it fires
      # alongside StorageFull - confirm that is intended.
      - alert: StorageAlmostFull
        expr: >-
          ((last_over_time(node_filesystem_avail_bytes{job="node-exporter"}[5m])
          / last_over_time(node_filesystem_size_bytes{job="node-exporter"}[5m]))
          * 100) < 10
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Storage of {{ $labels.instance }} is almost full"
          description: "Storage of {{ $labels.instance }} for {{ $labels.mountpoint }} on {{ $labels.device }} is almost full"

      # Last successful backup is at least 9000 s (2h30m) old.
      # The annotations state the same duration as the threshold; keep them
      # in sync when changing the expression.
      - alert: ClerieBackupJobLastSuccessfulRunBehind
        expr: time() - last_over_time(clerie_backup_last_successful_run_time{}[5m]) >= 9000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Last successful backup on {{ $labels.instance }} older than 2.5h"
          description: "Backup job {{ $labels.backup_job }} to target {{ $labels.backup_instance }} finished successfully more than 2.5 hours ago"

      # Boot time is at most 300 s in the past. There is no `for`, so the
      # alert fires on the first evaluation that matches.
      - alert: InstanceJustBooted
        expr: time() - last_over_time(node_boot_time_seconds{job="node-exporter"}[5m]) <= 300
        labels:
          severity: warning
        annotations:
          summary: "Uptime of {{ $labels.instance }} less than 5 min"
          description: "{{ $labels.instance }} just booted"

      # node-exporter target with service_level="event" down for 2 hours.
      # Carries the labels mute=instance and severity=muted.
      # NOTE(review): presumably consumed by Alertmanager routing/inhibition
      # to silence other alerts of the instance - confirm against that config.
      - alert: EventInstanceDown
        expr: up{job="node-exporter", service_level="event"} == 0
        for: 2h
        labels:
          mute: instance
          severity: muted
        annotations:
          summary: "Event instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 2 hours."

      # nixos_current_system_kernel_is_booted_system_kernel has been 0 for
      # 2 hours.
      - alert: KernelChanged
        expr: last_over_time(nixos_current_system_kernel_is_booted_system_kernel{job="nixos-exporter"}[5m]) == 0
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "Kernel of {{ $labels.instance }} changed"
          description: "The Kernel {{ $labels.instance }} booted with, isn't the target Kernel. A reboot may be required."

      # Blackbox probe of matrix.entr0py.de from monitoring-3.net.clerie.de
      # reports probe_success == 0 for 5 minutes.
      - alert: SynapseUnavailable
        expr: >-
          last_over_time(probe_success{instance="monitoring-3.net.clerie.de",
          job="blackbox_local_synapse", target="matrix.entr0py.de"}[5m]) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Synapse of {{ $labels.target }} unavailable"
          description: "The Synapse backend of {{ $labels.target }} is unreachable or returns garbage"