Referenced from: https://gitlab.com/gitlab-com/runbooks/blob/master/rules/node.yml

    groups:
    - name: CPU rules
      interval: 1m
      rules:
      # The count of CPUs per node, useful for getting CPU time as a percent of total.
      - record: instance:node_cpus:count
        expr: >
          count without (cpu, mode) (
            node_cpu_seconds_total{mode="idle"}
          )
      # CPU in use by CPU.
      - record: instance_cpu:node_cpu_seconds_not_idle:rate1m
        expr: >
          sum without (mode) (
            1 - rate(node_cpu_seconds_total{mode="idle"}[1m])
          )
      # CPU in use by mode.
      # Split recording for iowait to avoid reset bugs.
      - record: instance_mode:node_cpu_seconds:rate1m
        expr: >
          sum without (cpu) (
            rate(node_cpu_seconds_total{mode!="iowait"}[1m])
          )
      - record: instance_mode:node_cpu_seconds:deriv1m
        expr: >
          sum without (cpu) (
            deriv(node_cpu_seconds_total{mode="iowait"}[1m]) > 0
          )
      # CPU in use ratio.
      - record: instance:node_cpu_utilization:ratio
        expr: >
          avg without (cpu) (
            instance_cpu:node_cpu_seconds_not_idle:rate1m
          )
      # CPU summaries
      - record: job:node_cpu_utilization:min_ratio
        expr: >
          min without (fqdn,instance,node,pod) (
            instance:node_cpu_utilization:ratio
          )
      - record: job:node_cpu_utilization:avg_ratio
        expr: >
          avg without (fqdn,instance,node,pod) (
            instance:node_cpu_utilization:ratio
          )
      - record: job:node_cpu_utilization:max_ratio
        expr: >
          max without (fqdn,instance,node,pod) (
            instance:node_cpu_utilization:ratio
          )
      # CPU Alerts
      - alert: HighCPU
        expr: instance:node_cpu_utilization:ratio > 0.95
        for: 2h
        labels:
          pager: pagerduty
          severity: s1
          alert_type: cause
        annotations:
          runbook: docs/uncategorized/node_cpu.md
          title: CPU use percent is extremely high on {{ if $labels.fqdn }}{{ $labels.fqdn }}{{ else }}{{ $labels.instance }}{{ end }} for the past 2 hours.
    # Rules for calculating and alerting on long-term node utilization issues.
    - name: Utilization
      interval: 300s
      rules:
      - record: instance:cpu_utilization:ratio_max
        expr: max_over_time(instance:node_cpu_utilization:ratio[300s])
      - record: instance:cpu_utilization:ratio_avg
        expr: avg_over_time(instance:node_cpu_utilization:ratio[300s])
      - record: instance:cpu_utilization:ratio_q95
        expr: quantile_over_time(0.95, instance:node_cpu_utilization:ratio[300s])
      - record: instance:memory_utilization:ratio_max
        expr: max_over_time(instance:node_memory_utilization:ratio[300s])
      - record: instance:memory_utilization:ratio_avg
        expr: avg_over_time(instance:node_memory_utilization:ratio[300s])
      - record: instance:memory_utilization:ratio_q95
        expr: quantile_over_time(0.95, instance:node_memory_utilization:ratio[300s])
      # TODO(bjk): This is a proposal for an alert. We should send this to a webhook that opens an issue.
      # - alert: NodeUnderUtilized
      #   expr: >
      #     (quantile_over_time(0.95, instance:cpu_utilization:ratio_q95[1d]) * 100 < 10)
      #     and
      #     (quantile_over_time(0.95, instance:memory_utilization:ratio_q95[1d]) * 100 < 10)
      #   for: 7d
    - name: Node memory
      rules:
      - record: instance:node_memory_available:ratio
        expr: >
          (
            node_memory_MemAvailable_bytes or
            (
              node_memory_Buffers_bytes +
              node_memory_Cached_bytes +
              node_memory_MemFree_bytes +
              node_memory_Slab_bytes
            )
          ) /
          node_memory_MemTotal_bytes
      - record: instance:node_memory_utilization:ratio
        expr: 1 - instance:node_memory_available:ratio
    - name: Node filesystem rules
      rules:
      - record: instance:node_filesystem_avail:ratio
        expr: node_filesystem_avail_bytes{device=~"(/dev/.+|tank/dataset)"} / node_filesystem_size_bytes{device=~"(/dev/.+|tank/dataset)"}
      - record: instance:node_disk_writes_completed:irate1m
        expr: sum(irate(node_disk_writes_completed_total{device=~"sd.*"}[1m])) WITHOUT (device)
      - record: instance:node_disk_reads_completed:irate1m
        expr: sum(irate(node_disk_reads_completed_total{device=~"sd.*"}[1m])) WITHOUT (device)
      - record: node:node_disk_utilisation:avg_irate
        expr: |-
          avg by (node) (
            irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
      - record: ':node_disk_saturation:avg_irate'
        expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
      - record: node:node_disk_saturation:avg_irate
        expr: |-
          avg by (node) (
            irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
      - record: 'node:node_filesystem_usage:'
        expr: |-
          max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
          - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
          / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      - record: 'node:node_filesystem_avail:'
        expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      - alert: FilesystemFullSoon
        expr: predict_linear(node_filesystem_avail_bytes{fstype=~"(ext.|xfs|zfs)"}[6h], 24 * 3600) < 0
        for: 2h
        labels:
          severity: s4
          alert_type: cause
        annotations:
          title: 'Filesystem will be full SOON'
          description: >
            The filesystem is predicted to be full within the next 24 hours.
          runbook: docs/monitoring/filesystem_alerts.md
          instance: '{{ $labels.instance }}'
          device: '{{ $labels.device }}'
          mountpoint: '{{ $labels.mountpoint }}'
    - name: Misc rules
      rules:
      - record: instance:up:count
        expr: count(up{job="node"} == 1) WITHOUT (instance, fqdn)
      - alert: FleetSizeChanged
        expr: changes(instance:up:count[2m]) >= 1
        for: 1m
        labels:
          severity: s4
          alert_type: cause
        annotations:
          description: The {{ $labels.type }} fleet size has changed. This can be because nodes
            were added or removed; if nodes were removed, it may be because they went down silently.
          title: The fleet size has changed in the last few minutes
      - alert: HighMemoryPressure
        expr: instance:node_memory_available:ratio * 100 < 10 and rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 1m
        labels:
          severity: s4
          alert_type: cause
        annotations:
          description: The node is under heavy memory pressure. The available memory is under 10% and
            there is a high rate of major page faults.
          runbook: docs/monitoring/node_memory_alerts.md
          value: 'Available memory {{ $value | printf "%.2f" }}%'
          title: Node is under heavy memory pressure
      - alert: CPUStalls
        expr: rate(syslog_rcu_sched_stalls_total[1m]) > 0 or rate(rcu_sched_stalls_total[1m]) > 0
        for: 10m
        labels:
          severity: s4
          alert_type: cause
        annotations:
          description: The node is encountering RCU CPU stall warnings, which may cause the node to lock up occasionally.
            Check `/var/log/kern.log` for more details. You may need to contact the cloud provider and possibly redeploy the VM.
          title: CPU stall warnings have been detected on {{ if $labels.fqdn }}{{ $labels.fqdn }}{{ else }}{{ $labels.instance }}{{ end }} for the past 10 minutes.
    # TODO eventually, once all kernels are upgraded to support
    # node_vmstat_oom_kill, we can deprecate the syslog_oom_events metric and remove
    # it from our mtail configuration.
    - name: OOM kills detected
      rules:
      - alert: TooManyOOMKills
        expr: >
          increase(node_vmstat_oom_kill[1h]) > 2
          or
          increase(syslog_oom_kills_total[1h]) > 2
        labels:
          severity: s3
          alert_type: cause
        annotations:
          title: Several OOM kills detected on {{$labels.fqdn}} recently
          description: >
            Find out which process was killed by running `dmesg | grep -i oom`, and continue
            debugging.
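
To put a file like this to use, it has to be listed under `rule_files` in the Prometheus server configuration. The fragment below is a minimal sketch of that wiring; the file path and scrape targets are chosen purely for illustration and are not part of the GitLab repository. Note that the rules above expect node_exporter data under `job="node"` (used by `instance:up:count`) and, for the disk utilisation/saturation rules, under `job="node-exporter"` together with the `node_namespace_pod:kube_pod_info:` series. Running `promtool check rules` against the file before reloading Prometheus catches syntax mistakes early.

    # prometheus.yml fragment (illustrative only; path and targets are assumptions)
    rule_files:
      - /etc/prometheus/rules/node.yml      # save the rule groups above as this file

    scrape_configs:
      - job_name: node                      # instance:up:count counts up{job="node"}
        static_configs:
          - targets: ['node01:9100', 'node02:9100']   # hypothetical node_exporter endpoints

With the rules loaded, dashboards and further alerts can query the recorded series directly, for example `instance:node_cpu_utilization:ratio`, instead of re-aggregating raw `node_cpu_seconds_total` on every query.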