# Prometheus-style alerting rules.
# Each entry under `rules` defines one alert: a query expression (`expr`),
# an optional hold time (`for`), labels attached to the alert, and
# templated annotations describing it.
groups:
- name: alert.rules
  rules:
  # Fires when a node-exporter scrape target reports `up == 0`
  # continuously for the `for` duration below.
  - alert: InstanceDown
    expr: up{job="node-exporter"} == 0
    # Keep the duration mentioned in `description` in sync with this value.
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Endpoint {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  # Warns when an instance has gone 90 days or longer without a reboot.
  # There is no `for` clause on this rule.
  - alert: InstanceUptime
    # 7776000 s = 90 days (90 * 86400).
    expr: 'time() - node_boot_time_seconds{job="node-exporter"} >= 7776000'
    labels:
      severity: warning
    annotations:
      summary: 'Uptime of {{ $labels.instance }} more than 90 days'
      description: 'The last reboot of {{ $labels.instance }} was more than 90 days ago'
  # Fires when the backup mount has had less than 5 % of its space
  # available for 30 minutes.
  - alert: BackupStorageFull
    # Available bytes divided by filesystem size, as a percentage.
    # The folded scalar (`>-`) joins these lines with single spaces, so the
    # resulting query is one line.
    expr: >-
      ((node_filesystem_avail_bytes{instance="clerie-backup.mon.clerie.de:9100",
      job="node-exporter", mountpoint="/mnt/clerie-backup"}
      / node_filesystem_size_bytes{instance="clerie-backup.mon.clerie.de:9100",
      job="node-exporter", mountpoint="/mnt/clerie-backup"})
      * 100) < 5
    for: 30m
    labels:
      severity: critical
    annotations:
      summary: 'Storage for backup is nearly full'
      description: 'Storage for backups is nearly full'
  # Warns while an instance's boot time is at most 300 s (5 minutes) in the
  # past. There is no `for` clause on this rule.
  - alert: InstanceJustBooted
    expr: 'time() - node_boot_time_seconds{job="node-exporter"} <= 300'
    labels:
      severity: warning
    annotations:
      summary: 'Uptime of {{ $labels.instance }} less than 5 min'
      description: '{{ $labels.instance }} just booted'
  # Fires when a node-exporter target labelled service_level="event" has
  # reported `up == 0` for two hours.
  - alert: EventInstanceDown
    expr: 'up{job="node-exporter", service_level="event"} == 0'
    for: 2h
    labels:
      # NOTE(review): `mute` and `severity: muted` are presumably consumed
      # by alert routing/inhibition configured elsewhere; confirm.
      mute: instance
      severity: muted
    annotations:
      summary: 'Event instance {{ $labels.instance }} down'
      description: '{{ $labels.instance }} has been down for more than 2 hours.'