# Adapted from: https://gitlab.com/gitlab-com/runbooks/blob/master/rules/node.yml
groups:
- name: CPU rules
interval: 1m
rules:
# The count of CPUs per node, useful for getting CPU time as a percent of total.
- record: instance:node_cpus:count
expr: >
count without (cpu, mode) (
node_cpu_seconds_total{mode="idle"}
)
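# For example (a hypothetical ad-hoc query, not recorded here), total CPU
# usage as a fraction of all CPUs on an instance:
#   sum without (cpu) (instance_cpu:node_cpu_seconds_not_idle:rate1m)
#     / instance:node_cpus:count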
# CPU in use, per CPU.
- record: instance_cpu:node_cpu_seconds_not_idle:rate1m
expr: >
sum without (mode) (
1 - rate(node_cpu_seconds_total{mode="idle"}[1m])
)
# CPU in use, by mode.
# iowait is recorded separately: its counter can go backwards on some kernels,
# which breaks rate(), so deriv() is used and negative samples are dropped.
- record: instance_mode:node_cpu_seconds:rate1m
expr: >
sum without (cpu) (
rate(node_cpu_seconds_total{mode!="iowait"}[1m])
)
- record: instance_mode:node_cpu_seconds:deriv1m
expr: >
sum without (cpu) (
deriv(node_cpu_seconds_total{mode="iowait"}[1m]) > 0
)
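# To reassemble the full per-mode view (a hypothetical ad-hoc query), union
# the two series; their mode labels are disjoint by construction, though the
# deriv series has gaps wherever the > 0 filter dropped samples:
#   instance_mode:node_cpu_seconds:rate1m
#     or instance_mode:node_cpu_seconds:deriv1m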
# CPU in use ratio.
- record: instance:node_cpu_utilization:ratio
expr: >
avg without (cpu) (
instance_cpu:node_cpu_seconds_not_idle:rate1m
)
# CPU summaries
- record: job:node_cpu_utilization:min_ratio
expr: >
min without (fqdn,instance,node,pod) (
instance:node_cpu_utilization:ratio
)
- record: job:node_cpu_utilization:avg_ratio
expr: >
avg without (fqdn,instance,node,pod) (
instance:node_cpu_utilization:ratio
)
- record: job:node_cpu_utilization:max_ratio
expr: >
max without (fqdn,instance,node,pod) (
instance:node_cpu_utilization:ratio
)
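# These summaries can be ranked directly, e.g. (hypothetical ad-hoc query)
# to find the jobs whose busiest node is hottest:
#   sort_desc(job:node_cpu_utilization:max_ratio)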
# CPU Alerts
- alert: HighCPU
expr: instance:node_cpu_utilization:ratio > 0.95
for: 2h
labels:
pager: pagerduty
severity: s1
alert_type: cause
annotations:
runbook: docs/uncategorized/node_cpu.md
title: CPU use percent is extremely high on {{ if $labels.fqdn }}{{ $labels.fqdn
}}{{ else }}{{ $labels.instance }}{{ end }} for the past 2 hours.
# Rules for calculating and alerting on long-term node utilization issues.
- name: Utilization
interval: 300s
rules:
- record: instance:cpu_utilization:ratio_max
expr: max_over_time(instance:node_cpu_utilization:ratio[300s])
- record: instance:cpu_utilization:ratio_avg
expr: avg_over_time(instance:node_cpu_utilization:ratio[300s])
- record: instance:cpu_utilization:ratio_q95
expr: quantile_over_time(0.95, instance:node_cpu_utilization:ratio[300s])
- record: instance:memory_utilization:ratio_max
expr: max_over_time(instance:node_memory_utilization:ratio[300s])
- record: instance:memory_utilization:ratio_avg
expr: avg_over_time(instance:node_memory_utilization:ratio[300s])
- record: instance:memory_utilization:ratio_q95
expr: quantile_over_time(0.95, instance:node_memory_utilization:ratio[300s])
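# Note: the 300s lookback matches the 300s group interval above, so each
# recorded sample summarizes exactly one evaluation period with no gaps or
# double-counting.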
# TODO(bjk): This is a proposal for an alert. We should send this to a webhook that opens an issue.
# - alert: NodeUnderUtilized
# expr: >
# (quantile_over_time(0.95, instance:cpu_utilization:ratio_q95[1d]) * 100 < 10)
# and
# (quantile_over_time(0.95, instance:memory_utilization:ratio_q95[1d]) * 100 < 10)
# for: 7d
- name: Node memory
rules:
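# node_memory_MemAvailable_bytes requires kernel >= 3.14; on older kernels the
# `or` arm falls back to free + buffers + page cache + slab as an
# approximation of available memory.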
- record: instance:node_memory_available:ratio
expr: >
(
node_memory_MemAvailable_bytes or
(
node_memory_Buffers_bytes +
node_memory_Cached_bytes +
node_memory_MemFree_bytes +
node_memory_Slab_bytes
)
) /
node_memory_MemTotal_bytes
- record: instance:node_memory_utilization:ratio
expr: 1 - instance:node_memory_available:ratio
- name: Node filesystem rules
rules:
- record: instance:node_filesystem_avail:ratio
expr: node_filesystem_avail_bytes{device=~"(/dev/.+|tank/dataset)"} / node_filesystem_size_bytes{device=~"(/dev/.+|tank/dataset)"}
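# Example consumer (hypothetical, not recorded here): flag filesystems with
# less than 10% of their space left:
#   instance:node_filesystem_avail:ratio < 0.10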
- record: instance:node_disk_writes_completed:irate1m
expr: sum without (device) (irate(node_disk_writes_completed_total{device=~"sd.*"}[1m]))
- record: instance:node_disk_reads_completed:irate1m
expr: sum without (device) (irate(node_disk_reads_completed_total{device=~"sd.*"}[1m]))
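# Disk utilisation: the rate of io_time approximates the fraction of wall time
# the device was busy (0-1), joined to its node name via kube-state pod info.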
- record: node:node_disk_utilisation:avg_irate
expr: |-
avg by (node) (
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:node_disk_saturation:avg_irate
expr: |-
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
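# Disk saturation: io_time_weighted accumulates time multiplied by queue
# depth, so its rate approximates the average number of in-flight requests.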
- record: 'node:node_filesystem_usage:'
expr: |-
max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
- record: 'node:node_filesystem_avail:'
expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
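# predict_linear fits a linear regression over the last 6h of free-space
# samples and extrapolates 24h ahead; a negative prediction means the
# filesystem would be full within a day at the current trend. The same check
# can be run ad hoc (hypothetical query) with a shorter horizon, e.g. 4h:
#   predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 4 * 3600) < 0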
- alert: FilesystemFullSoon
expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
for: 2h
labels:
severity: s4
alert_type: cause
annotations:
title: 'Filesystem predicted to be full within 24 hours'
description: >
At the current growth rate, the filesystem is predicted to be full
within the next 24 hours.
runbook: docs/monitoring/filesystem_alerts.md
instance: '{{ $labels.instance }}'
device: '{{ $labels.device }}'
mountpoint: '{{ $labels.mountpoint }}'
- name: Misc rules
rules:
- record: instance:up:count
expr: count without (instance, fqdn) (up{job="node"} == 1)
- alert: FleetSizeChanged
expr: changes(instance:up:count[2m]) >= 1
for: 1m
labels:
severity: s4
alert_type: cause
annotations:
description: The size of the {{ $labels.type }} fleet has changed. Nodes may have
been added or removed; if the count dropped, nodes may have gone down silently.
title: The fleet size has changed in the last few minutes
- alert: HighMemoryPressure
expr: instance:node_memory_available:ratio * 100 < 10 and rate(node_vmstat_pgmajfault[1m]) > 1000
for: 1m
labels:
severity: s4
alert_type: cause
annotations:
description: The node is under heavy memory pressure. Available memory is under
10% and there is a high rate of major page faults.
runbook: docs/monitoring/node_memory_alerts.md
value: 'Available memory {{ $value | printf "%.2f" }}%'
title: Node is under heavy memory pressure
- alert: CPUStalls
expr: rate(syslog_rcu_sched_stalls_total[1m]) > 0 or rate(rcu_sched_stalls_total[1m]) > 0
for: 10m
labels:
severity: s4
alert_type: cause
annotations:
description: The node is encountering RCU CPU stall warnings, which may cause the node to lock up occasionally.
Check `/var/log/kern.log` for more details. You may need to contact the cloud provider and possibly redeploy the VM.
title: CPU stall warnings have been detected on {{ if $labels.fqdn }}{{ $labels.fqdn }}
{{ else }}{{ $labels.instance }}{{ end }} for the past 10 minutes.
# TODO: eventually, once all kernels are upgraded to a version that exports
# node_vmstat_oom_kill, we can deprecate the syslog_oom_kills_total metric and
# remove it from our mtail configuration.
- name: OOM kills detected
rules:
- alert: TooManyOOMKills
expr: >
increase(node_vmstat_oom_kill[1h]) > 2
or
increase(syslog_oom_kills_total[1h]) > 2
labels:
severity: s3
alert_type: cause
annotations:
title: Several OOM kills detected on {{ $labels.fqdn }} recently
description: >
Find out which process was killed by running `dmesg | grep -i oom`, then
continue debugging from there.