编辑prometheus.yml
rule_files区域新增/opt/prometheus/rules/*.yml
rule_files:
- "/opt/prometheus/rules/*.yml"
网络/cpu/内存/分区监控规则
mkdir -p /opt/prometheus/rules/
vim /opt/prometheus/rules/linux.yml
groups:
- name: linux
rules:
- alert: "内存使用率过高"
expr: round(((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Buffers_bytes-node_memory_Cached_bytes-node_memory_SReclaimable_bytes)/node_memory_MemTotal_bytes)*100) > 90
for: 1m
labels:
severity: warning
annotations:
summary: "内存使用率过高"
description: "当前使用率{{ $value }}%"
- alert: "CPU使用率过高"
expr: round(100 - ((avg by (instance,job,hostname)(irate(node_cpu_seconds_total{mode="idle",instance!~'bac-.*'}[5m]))) *100)) > 95
for: 2m
labels:
severity: warning
annotations:
summary: "CPU使用率过高"
description: "当前使用率{{ $value }}%"
- alert: "磁盘使用率过高"
expr: round(100-100*(node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) > 90
for: 15s
labels:
severity: warning
annotations:
summary: "磁盘使用率过高"
description: "当前磁盘{{$labels.mountpoint}} 使用率{{ $value }}%"
- alert: "分区容量过低"
expr: round(node_filesystem_avail_bytes{fstype=~"ext4|xfs",instance!~"testnode",mountpoint!~"/boot.*"}/1024/1024/1024) < 5
for: 15s
labels:
severity: warning
annotations:
summary: "分区容量过低"
description: "当前分区{{$labels.mountpoint}} 容量{{ $value }}GB"
- alert: "网络流出速率过高"
expr: round(irate(node_network_receive_bytes_total{instance!~"data.*",device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*'}[1m])/1024) > 12040
for: 1m
labels:
severity: warning
annotations:
summary: "网络流出速率过高"
description: "当前速率{{ $value }}KB/s"
node监控检查规则
[root@bogon rules]# vim /opt/prometheus/rules/node.yml
groups:
- name: node_exporter
rules:
- alert: node_exporter is down
expr: up{job="node_resources"} == 0
for: 15s
labels:
severity: 1
team: node
annotations:
summary: "{{ $labels.instance }} 已停止运行超过 15s!"
systemctl reload prometheus
浏览器访问http://192.168.0.15:9090/rules
查看已经生效。