Simplify monitoring
This commit is contained in:
parent
fb2775837e
commit
52333fa346
3 changed files with 3 additions and 44 deletions
|
@ -24,14 +24,6 @@ in {
|
|||
target = ".weechat/perl";
|
||||
source = ./plugins/perl;
|
||||
};
|
||||
# plugins = {
|
||||
# target = ".weechat/plugins.conf";
|
||||
# text = ''
|
||||
# [var]
|
||||
# python.buffer_autohide.hide_inactive = off
|
||||
# python.buffer_autohide.hide_private = off
|
||||
# '';
|
||||
# };
|
||||
weechat = {
|
||||
target = ".weechat/weechat.conf";
|
||||
text = ''
|
||||
|
@ -71,7 +63,7 @@ in {
|
|||
freenode.sasl_password = "${config.m-0.weechat.freenode_pw}"
|
||||
freenode.autoconnect = on
|
||||
freenode.username = "${config.m-0.weechat.user}"
|
||||
freenode.autojoin = "#nixos,#matrix,#haskell"
|
||||
freenode.autojoin = "#nixos,#matrix,#haskell,#card10badge,#ghc,#home-manager,#krebs,#nixos-de"
|
||||
|
||||
[server]
|
||||
hackint.addresses = "irc.hackint.org/6697"
|
||||
|
|
Binary file not shown.
|
@ -1,58 +1,35 @@
|
|||
groups:
|
||||
- name: rules
|
||||
rules:
|
||||
- alert: VEDPageDown
|
||||
expr: probe_success{instance=~".*vocalensemble.*"} == 0
|
||||
- alert: ProbeTimeout
|
||||
expr: probe_success == 0
|
||||
for: 60m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'A BlackBoxProbe timed out.'
|
||||
summary: 'Instance {{ $labels.instance }} does not respond as wished.'
|
||||
- alert: BlackBoxProbeTimeout
|
||||
expr: probe_success{instance!~".*vocalensemble.*"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: 'A BlackBoxProbe timed out.'
|
||||
summary: 'Instance {{ $labels.instance }} does not respond as wished.'
|
||||
- alert: node_down
|
||||
expr: (up{name!="apollo"} == 0)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{ $labels.name }} has been down for more than 5 minutes.'
|
||||
summary: '{{$labels.name}}: Node is down.'
|
||||
- alert: systemd_service_failed
|
||||
expr: node_systemd_unit_state{state="failed"} == 1
|
||||
for: 4m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.name}} failed to (re)start service {{$labels.exported_name}}.'
|
||||
summary: '{{$labels.name}}: Service {{$labels.exported_name}} failed.'
|
||||
- alert: systemd_service_flapping
|
||||
expr: changes(node_systemd_unit_state{state="failed"}[5m]) > 5 or (changes(node_systemd_unit_state{state="failed"}[1h]) > 15 unless changes(node_systemd_unit_state{state="failed"}[30m]) < 7)
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.name}}: Service {{$labels.exported_name}} changed its state more than 5x/5min or 15x/1h'
|
||||
summary: '{{$labels.name}}: Service {{$labels.exported_name}} is flapping.'
|
||||
- alert: node_filesystem_full_90percent
|
||||
expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024 ^ 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem.'
|
||||
summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
|
||||
- alert: node_filesystem_full_in_4h
|
||||
expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
|
||||
is running out of space of in approx. 4 hours'
|
||||
|
@ -60,8 +37,6 @@ groups:
|
|||
- alert: node_filedescriptors_full_in_3h
|
||||
expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
|
||||
for: 20m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is running out of available file descriptors
|
||||
in approx. 3 hours'
|
||||
|
@ -70,8 +45,6 @@ groups:
|
|||
- alert: node_load1_90percent
|
||||
expr: node_load1 / on(alias) count by(alias) (node_cpu{mode="system"}) >= 0.9
|
||||
for: 1h
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is running with > 90% total load for at least
|
||||
1h.'
|
||||
|
@ -79,8 +52,6 @@ groups:
|
|||
- alert: node_cpu_util_90percent
|
||||
expr: 100 - (avg by(alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90
|
||||
for: 1h
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} has total CPU utilization over 90% for at least
|
||||
1h.'
|
||||
|
@ -88,8 +59,6 @@ groups:
|
|||
- alert: node_ram_using_90percent
|
||||
expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
|
||||
for: 30m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is using at least 90% of its RAM for at least
|
||||
30 minutes now.'
|
||||
|
@ -97,8 +66,6 @@ groups:
|
|||
- alert: node_swap_using_80percent
|
||||
expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: page
|
||||
annotations:
|
||||
description: '{{$labels.alias}} is using 80% of its swap space for at least
|
||||
10 minutes now.'
|
||||
|
|
Loading…
Reference in a new issue