Reference: https://gitlab.com/gitlab-com/runbooks/blob/master/rules/node.yml
groups:
- name: CPU rules
  interval: 1m
  rules:
  # The count of CPUs per node, useful for getting CPU time as a percent of total.
  - record: instance:node_cpus:count
    expr: >
      count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})
  # CPU in use by CPU.
  - record: instance_cpu:node_cpu_seconds_not_idle:rate1m
    expr: >
      sum without (mode) (1 - rate(node_cpu_seconds_total{mode="idle"}[1m]))
  # CPU in use by mode.
  # Split recording for iowait to avoid reset bugs.
  - record: instance_mode:node_cpu_seconds:rate1m
    expr: >
      sum without (cpu) (rate(node_cpu_seconds_total{mode!="iowait"}[1m]))
  - record: instance_mode:node_cpu_seconds:deriv1m
    expr: >
      sum without (cpu) (deriv(node_cpu_seconds_total{mode="iowait"}[1m]) > 0)
  # CPU in use ratio.
  - record: instance:node_cpu_utilization:ratio
    expr: >
      avg without (cpu) (instance_cpu:node_cpu_seconds_not_idle:rate1m)
  # CPU summaries
  - record: job:node_cpu_utilization:min_ratio
    expr: >
      min without (fqdn, instance, node, pod) (instance:node_cpu_utilization:ratio)
  - record: job:node_cpu_utilization:avg_ratio
    expr: >
      avg without (fqdn, instance, node, pod) (instance:node_cpu_utilization:ratio)
  - record: job:node_cpu_utilization:max_ratio
    expr: >
      max without (fqdn, instance, node, pod) (instance:node_cpu_utilization:ratio)
  # CPU Alerts
  - alert: HighCPU
    expr: instance:node_cpu_utilization:ratio > 0.95
    for: 2h
    labels:
      pager: pagerduty
      severity: s1
      alert_type: cause
    annotations:
      runbook: docs/uncategorized/node_cpu.md
      title: CPU use percent is extremely high on {{ if $labels.fqdn }}{{ $labels.fqdn }}{{ else }}{{ $labels.instance }}{{ end }} for the past 2 hours.
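# An explanatory aside, not part of the upstream file: because the rules above
# only select mode="idle", the recorded chain reduces to "1 minus the average
# idle rate". Assuming a node-exporter target is being scraped, the recorded
# series can be spot-checked against the equivalent ad-hoc query:
#
#   1 - avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1m]))
#
# The result should match instance:node_cpu_utilization:ratio per instance.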
# Rules for calculating and alerting on long-term node utilization issues.
- name: Utilization
  interval: 300s
  rules:
  - record: instance:cpu_utilization:ratio_max
    expr: max_over_time(instance:node_cpu_utilization:ratio[300s])
  - record: instance:cpu_utilization:ratio_avg
    expr: avg_over_time(instance:node_cpu_utilization:ratio[300s])
  - record: instance:cpu_utilization:ratio_q95
    expr: quantile_over_time(0.95, instance:node_cpu_utilization:ratio[300s])
  - record: instance:memory_utilization:ratio_max
    expr: max_over_time(instance:node_memory_utilization:ratio[300s])
  - record: instance:memory_utilization:ratio_avg
    expr: avg_over_time(instance:node_memory_utilization:ratio[300s])
  - record: instance:memory_utilization:ratio_q95
    expr: quantile_over_time(0.95, instance:node_memory_utilization:ratio[300s])
  # TODO(bjk): This is a proposal for an alert. We should send this to a
  # webhook that opens an issue.
  # - alert: NodeUnderUtilized
  #   expr: >
  #     (quantile_over_time(0.95, instance:cpu_utilization:ratio_q95[1d]) * 100 < 10)
  #     and
  #     (quantile_over_time(0.95, instance:memory_utilization:ratio_q95[1d]) * 100 < 10)
  #   for: 7d
- name: Node memory
  rules:
  - record: instance:node_memory_available:ratio
    expr: >
      (
        node_memory_MemAvailable_bytes or
        (
          node_memory_Buffers_bytes +
          node_memory_Cached_bytes +
          node_memory_MemFree_bytes +
          node_memory_Slab_bytes
        )
      ) / node_memory_MemTotal_bytes
  - record: instance:node_memory_utilization:ratio
    expr: 1 - instance:node_memory_available:ratio
- name: Node filesystem rules
  rules:
  - record: instance:node_filesystem_avail:ratio
    expr: >
      node_filesystem_avail_bytes{device=~"(/dev/.+|tank/dataset)"} /
      node_filesystem_size_bytes{device=~"(/dev/.+|tank/dataset)"}
  - record: instance:node_disk_writes_completed:irate1m
    expr: sum without (device) (irate(node_disk_writes_completed_total{device=~"sd.*"}[1m]))
  - record: instance:node_disk_reads_completed:irate1m
    expr: sum without (device) (irate(node_disk_reads_completed_total{device=~"sd.*"}[1m]))
  - record: node:node_disk_utilisation:avg_irate
    expr: |-
      avg by (node) (
        irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
  - record: ':node_disk_saturation:avg_irate'
    expr: >
      avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
  - record: node:node_disk_saturation:avg_irate
    expr: |-
      avg by (node) (
        irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
  - record: 'node:node_filesystem_usage:'
    expr: |-
      max by (instance, namespace, pod, device) (
        (node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
      )
  - record: 'node:node_filesystem_avail:'
    expr: >
      max by (instance, namespace, pod, device)
      (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
  - alert: FilesystemFullSoon
    expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
    for: 2h
    labels:
      severity: s4
      alert_type: cause
    annotations:
      title: 'Filesystem will be full SOON'
      description: >
        The filesystem is predicted to be full in the next 24 hours.
      runbook: docs/monitoring/filesystem_alerts.md
      instance: '{{ $labels.instance }}'
      device: '{{ $labels.device }}'
      mountpoint: '{{ $labels.mountpoint }}'
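# An explanatory aside, not part of the upstream file: predict_linear() in the
# FilesystemFullSoon expression fits a simple linear regression over the last
# 6h of free-space samples and extrapolates 24h (86400s) ahead; the alert
# fires once the extrapolated value has stayed below zero for 2h. The
# prediction can be previewed for a single mount (the mountpoint below is just
# an example) with an ad-hoc query such as:
#
#   predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 24 * 3600)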
- name: Misc rules
  rules:
  - record: instance:up:count
    expr: count without (instance, fqdn) (up{job="node"} == 1)
  - alert: FleetSizeChanged
    expr: changes(instance:up:count[2m]) >= 1
    for: 1m
    labels:
      severity: s4
      alert_type: cause
    annotations:
      description: >
        The {{ $labels.type }} fleet size has changed. The fleet may have grown
        or shrunk; if it shrank, nodes may have gone down silently.
      title: The fleet size has changed in the last 2 minutes
  - alert: HighMemoryPressure
    expr: instance:node_memory_available:ratio * 100 < 10 and rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 1m
    labels:
      severity: s4
      alert_type: cause
    annotations:
      description: >
        The node is under heavy memory pressure. The available memory is under
        10% and there is a high rate of major page faults.
      runbook: docs/monitoring/node_memory_alerts.md
      value: 'Available memory {{ $value | printf "%.2f" }}%'
      title: Node is under heavy memory pressure
  - alert: CPUStalls
    expr: rate(syslog_rcu_sched_stalls_total[1m]) > 0 or rate(rcu_sched_stalls_total[1m]) > 0
    for: 10m
    labels:
      severity: s4
      alert_type: cause
    annotations:
      description: >
        The node is encountering RCU CPU stall warnings, which may cause it to
        lock up occasionally. Check `/var/log/kern.log` for more details. You
        may need to contact the cloud provider and possibly redeploy the VM.
      title: CPU stall warnings have been detected on {{ if $labels.fqdn }}{{ $labels.fqdn }}{{ else }}{{ $labels.instance }}{{ end }} for the past 10 minutes.
# TODO: eventually, once all kernels are upgraded to support
# node_vmstat_oom_kill, we can deprecate the syslog_oom_events metric and
# remove it from our mtail configuration.
- name: OOM kills detected
  rules:
  - alert: TooManyOOMKills
    expr: >
      increase(node_vmstat_oom_kill[1h]) > 2
      or
      increase(syslog_oom_kills_total[1h]) > 2
    labels:
      severity: s3
      alert_type: cause
    annotations:
      title: Several OOM kills detected on {{ $labels.fqdn }} recently
      description: >
        Find out which process by running `dmesg | grep -i oom`, and continue
        debugging.
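# A minimal validation sketch, not part of the upstream file. Assuming these
# rules are saved as node.yml and the unit test below as node_test.yml (both
# file names are hypothetical), they can be checked and tested with promtool:
#
#   promtool check rules node.yml
#   promtool test rules node_test.yml
#
# Contents of node_test.yml, exercising the instance:node_memory_available:ratio
# recording rule with 25% of memory available:
#
#   rule_files:
#     - node.yml
#   evaluation_interval: 1m
#   tests:
#     - interval: 1m
#       input_series:
#         - series: 'node_memory_MemAvailable_bytes{instance="node1"}'
#           values: '25x10'
#         - series: 'node_memory_MemTotal_bytes{instance="node1"}'
#           values: '100x10'
#       promql_expr_test:
#         - expr: instance:node_memory_available:ratio
#           eval_time: 5m
#           exp_samples:
#             - labels: 'instance:node_memory_available:ratio{instance="node1"}'
#               value: 0.25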
