1
0
Fork 0

Simplify monitoring

This commit is contained in:
Malte Brandy 2019-08-24 21:55:50 +02:00
parent fb2775837e
commit 52333fa346
3 changed files with 3 additions and 44 deletions

View file

@ -24,14 +24,6 @@ in {
target = ".weechat/perl";
source = ./plugins/perl;
};
# plugins = {
# target = ".weechat/plugins.conf";
# text = ''
# [var]
# python.buffer_autohide.hide_inactive = off
# python.buffer_autohide.hide_private = off
# '';
# };
weechat = {
target = ".weechat/weechat.conf";
text = ''
@ -71,7 +63,7 @@ in {
freenode.sasl_password = "${config.m-0.weechat.freenode_pw}"
freenode.autoconnect = on
freenode.username = "${config.m-0.weechat.user}"
freenode.autojoin = "#nixos,#matrix,#haskell"
freenode.autojoin = "#nixos,#matrix,#haskell,#card10badge,#ghc,#home-manager,#krebs,#nixos-de"
[server]
hackint.addresses = "irc.hackint.org/6697"

Binary file not shown.

View file

@ -1,58 +1,35 @@
groups:
- name: rules
rules:
- alert: VEDPageDown
expr: probe_success{instance=~".*vocalensemble.*"} == 0
- alert: ProbeTimeout
expr: probe_success == 0
for: 60m
labels:
severity: page
annotations:
description: 'A BlackBoxProbe timed out.'
summary: 'Instance {{ $labels.instance }} does not respond as wished.'
- alert: BlackBoxProbeTimeout
expr: probe_success{instance!~".*vocalensemble.*"} == 0
for: 5m
labels:
severity: page
annotations:
description: 'A BlackBoxProbe timed out.'
summary: 'Instance {{ $labels.instance }} does not respond as wished.'
- alert: node_down
expr: (up{name!="apollo"} == 0)
for: 5m
labels:
severity: page
annotations:
description: '{{ $labels.name }} has been down for more than 5 minutes.'
summary: '{{$labels.name}}: Node is down.'
- alert: systemd_service_failed
expr: node_systemd_unit_state{state="failed"} == 1
for: 4m
labels:
severity: page
annotations:
description: '{{$labels.name}} failed to (re)start service {{$labels.exported_name}}.'
summary: '{{$labels.name}}: Service {{$labels.exported_name}} failed.'
- alert: systemd_service_flapping
expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
labels:
severity: page
annotations:
description: '{{$labels.name}}: Service {{$labels.exported_name}} changed its state more than 5x/5min or 15x/1h'
summary: '{{$labels.name}}: Service {{$labels.exported_name}} is flapping.'
- alert: node_filesystem_full_90percent
expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024 ^ 3
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem.'
summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
- alert: node_filesystem_full_in_4h
expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
is running out of space in approx. 4 hours
@ -60,8 +37,6 @@ groups:
- alert: node_filedescriptors_full_in_3h
expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
for: 20m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is running out of available file descriptors
in approx. 3 hours'
@ -70,8 +45,6 @@ groups:
- alert: node_load1_90percent
expr: node_load1 / on(alias) count by(alias) (node_cpu{mode="system"}) >= 0.9
for: 1h
labels:
severity: page
annotations:
description: '{{$labels.alias}} is running with > 90% total load for at least
1h.'
@ -79,8 +52,6 @@ groups:
- alert: node_cpu_util_90percent
expr: 100 - (avg by(alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90
for: 1h
labels:
severity: page
annotations:
description: '{{$labels.alias}} has total CPU utilization over 90% for at least
1h.'
@ -88,8 +59,6 @@ groups:
- alert: node_ram_using_90percent
expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
for: 30m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is using at least 90% of its RAM for at least
30 minutes now.'
@ -97,8 +66,6 @@ groups:
- alert: node_swap_using_80percent
expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
for: 10m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is using 80% of its swap space for at least
10 minutes now.'