# Source: nixos-config/nixos/roles/monitoring/rules.yml
# Page header residue: "1", "0", "Fork 0"
# 2020-12-04 15:08:33 +01:00; 52 lines; 2.1 KiB; YAML

# Prometheus alerting rules. One rule group named "rules" holds every alert.
# Each alert fires once its `expr` has kept matching for the `for` duration;
# `labels.severity` is attached to the alert and `annotations.description`
# is the templated, human-readable message.
groups:
  - name: rules
    rules:
      # A probe has reported probe_success == 0 for a full hour.
      - alert: probe_timeout
        expr: probe_success == 0
        for: 60m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} probe {{ $labels.job}} failed.'

      # hydra_job_failed has been 1 for two hours.
      - alert: nixpkgs
        expr: hydra_job_failed == 1
        for: 2h
        labels:
          severity: warning
        annotations:
          description: 'hydra build {{ $labels.packageName }} on nixpkgs branch {{ $labels.jobset }} failed.'

      # A scrape target has up == 0. Targets with name="apollo" or
      # instance="hydra.nixos.org:443" are excluded by the label matchers.
      # NOTE(review): the reason for these two exclusions is not recorded
      # here; confirm they are still wanted.
      - alert: node_down
        expr: 'up{name!="apollo",instance!="hydra.nixos.org:443"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.name }} is not reachable.'

      # A systemd unit is in state "failed". Units whose exported_name
      # matches the regex configure-printer.* are ignored.
      - alert: systemd_service_failed
        expr: 'node_systemd_unit_state{state="failed",exported_name!~"configure-printer.*"} == 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          description: 'service {{$labels.exported_name}} on {{$labels.name}} failed.'

      # Available bytes divided by filesystem size, grouped per
      # (device, name), is below 0.1. Series with device="tmpfs" or
      # inContainer="true" are filtered out of the numerator. The value is a
      # ratio, which is why the description uses humanizePercentage.
      - alert: out_of_diskspace
        expr: 'min by (device, name) (node_filesystem_avail_bytes{device!="tmpfs",inContainer!="true"}) / max by (device,name) (node_filesystem_size_bytes) < 0.1'
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "{{ $labels.device }} on {{ $labels.name }} has only {{ $value | humanizePercentage }} free diskspace."

      # Free inodes as a percentage of total inodes is below 10 on a
      # filesystem whose fstype is not tmpfs.
      - alert: out_of_inodes
        expr: 'node_filesystem_files_free{fstype!="tmpfs"} / node_filesystem_files{fstype!="tmpfs"} * 100 < 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "mountpoint {{ $labels.mountpoint }} on {{ $labels.name }} out of inodes."

      # postfix_showq_message_age_seconds_sum is above 1800; the description
      # renders the value with humanizeDuration.
      - alert: mailq
        expr: postfix_showq_message_age_seconds_sum > 1800
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "mail queue {{ $labels.queue }} of {{ $labels.name }} has accumulated a waiting time of {{ $value | humanizeDuration }}."