编辑prometheus.yml
    rule_files区域新增/opt/prometheus/rules/*.yml

    # Load every rule file under /opt/prometheus/rules/.
    # The glob is quoted so YAML does not try to read "*" as an alias.
    rule_files:
    - "/opt/prometheus/rules/*.yml"

    网络/cpu/内存/分区监控规则

    mkdir -p /opt/prometheus/rules/
    vim /opt/prometheus/rules/linux.yml
    # Rule group "linux": host-level alerts (memory, CPU, filesystem, network).
    groups:
    - name: linux
      rules:
      # Memory usage alert.
      # Used memory = MemTotal - MemFree - Buffers - Cached - SReclaimable,
      # expressed as a percentage of MemTotal and rounded to a whole number
      # before it is compared with the 90 threshold.
      - alert: '内存使用率过高'
        annotations:
          description: '当前使用率{{ $value }}%'
          summary: '内存使用率过高'
        labels:
          severity: warning
        # The condition has to hold for one minute before the alert fires.
        for: 1m
        expr: >-
          round(((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Buffers_bytes-node_memory_Cached_bytes-node_memory_SReclaimable_bytes)/node_memory_MemTotal_bytes)*100) > 90
    
      # CPU usage alert: 100 minus the average idle percentage, aggregated per
      # instance/job/hostname; instances matching bac-.* are excluded.
      # rate() is used instead of irate(): irate() only looks at the last two
      # samples in the window, so a momentary dip can reset the `for` timer and
      # keep a sustained overload from ever firing.
      - alert: "CPU使用率过高"
        expr: round(100 - ((avg by (instance,job,hostname)(rate(node_cpu_seconds_total{mode="idle",instance!~'bac-.*'}[5m]))) *100)) > 95
        # The condition has to hold for two minutes before the alert fires.
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高"
          description: "当前使用率{{ $value }}%"
    
      # Filesystem usage alert for ext4/xfs filesystems:
      # 100 - 100 * avail / size, rounded to a whole percent, compared with 90.
      - alert: '磁盘使用率过高'
        annotations:
          description: '当前磁盘{{$labels.mountpoint}} 使用率{{ $value }}%'
          summary: '磁盘使用率过高'
        labels:
          severity: warning
        # The condition has to hold for 15 seconds before the alert fires.
        for: 15s
        expr: >-
          round(100-100*(node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) > 90
    
      # Low free-space alert for ext4/xfs filesystems.
      # Available bytes are divided by 1024 three times, then rounded to a
      # whole number before being compared with 5. Instance "testnode" and
      # mountpoints starting with /boot are excluded.
      - alert: '分区容量过低'
        annotations:
          description: '当前分区{{$labels.mountpoint}} 容量{{ $value }}GB'
          summary: '分区容量过低'
        labels:
          severity: warning
        # The condition has to hold for 15 seconds before the alert fires.
        for: 15s
        expr: >-
          round(node_filesystem_avail_bytes{fstype=~"ext4|xfs",instance!~"testnode",mountpoint!~"/boot.*"}/1024/1024/1024) < 5
    
      # Outbound network throughput alert, in KB/s (bytes per second / 1024).
      # The alert name and summary describe *outbound* traffic, so the
      # expression reads node_network_transmit_bytes_total; the receive counter
      # would measure inbound traffic instead.
      # rate() is used instead of irate() so that a brief dip between two
      # samples does not reset the `for` timer.
      # Instances matching data.* and virtual/loopback devices are excluded.
      # NOTE(review): a 1m range needs a scrape interval well below 1m to
      # contain at least two samples; confirm against prometheus.yml.
      - alert: "网络流出速率过高"
        expr: round(rate(node_network_transmit_bytes_total{instance!~"data.*",device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*'}[1m])/1024) > 12040
        # The condition has to hold for one minute before the alert fires.
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "网络流出速率过高"
          description: "当前速率{{ $value }}KB/s"
    

    node监控检查规则

    [root@bogon rules]# vim /opt/prometheus/rules/node.yml
    # Rule group "node_exporter": availability check for scrape targets.
    groups:
    - name: node_exporter
      rules:
      # Fires when a target of job "node_resources" has had up == 0 for 15s.
      # NOTE(review): the job name must match a job_name under scrape_configs
      # in prometheus.yml; confirm, since that file is not shown here.
      - alert: node_exporter is down
        expr: up{job="node_resources"} == 0
        for: 15s
        labels:
          # Quoted so YAML yields the string "1" instead of an integer;
          # Prometheus label values are strings.
          severity: "1"
          team: node
        annotations:
          summary: "{{ $labels.instance }} 已停止运行超过 15s!"
    
    systemctl reload prometheus
    

    浏览器访问 http://192.168.0.15:9090/rules，
    确认上述规则已加载并生效。
    image.png