# Key alerting rules for Prometheus (关键报警规则 - 《Prometheus》)

# Rule group: read-IOPS alert for the "vda" disk.
- name: IOPS告警
  rules:
  # Fires when the instantaneous per-second rate of completed reads on
  # device "vda" (series labelled group="linux") is above 400.
  - alert: 系统磁盘读IOPS告警
    expr: irate(node_disk_reads_completed_total{group="linux",device="vda"}[1m]) > 400
    # Effectively fires on the first evaluation that crosses the threshold.
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads $labels.name; confirm the scraped
      # series actually carry a "name" label, otherwise it renders empty.
      summary: "{{ $labels.name }}: 系统磁盘读IOPS告警"
      description: "{{ $labels.instance }},系统磁盘读IOPS大于400 当前值为: {{ $value }}"
# Rule group: disk I/O utilization alert.
- name: IO使用率过高
  rules:
  # irate(node_disk_io_time_seconds_total) is seconds spent doing I/O per
  # second; multiplied by 100 it is a busy percentage, averaged over all
  # devices of an instance.
  #
  # The previous expression was `100 - busy% < 60`, which fired at >40%
  # utilization and reported the *idle* share as $value, contradicting the
  # description ("IO above 60%, current value ..."). The comparison is now
  # made directly on the busy percentage so threshold, $value and message
  # all agree.
  - alert: IO性能
    expr: avg by (instance) (irate(node_disk_io_time_seconds_total[1m])) * 100 > 60
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads $labels.name, but the aggregation
      # keeps only the "instance" label, so this renders empty — confirm
      # whether $labels.instance was intended.
      summary: "{{ $labels.name }}: 流入磁盘IO使用率过高！"
      description: "{{ $labels.instance }},流入磁盘IO大于60% 当前值为: {{ $value }}"
# Rule group: system load alert, normalised by CPU count.
- name: 负载使用率告警
  rules:
  # Load ratio = 5-minute load average / number of distinct CPUs per
  # instance, expressed as a percentage and rounded up. Fires above 150%.
  # The inner count(...) by (cpu, instance) collapses the per-mode series to
  # one per CPU; the outer count by (instance) yields the CPU count.
  - alert: "node_cpu"
    expr: ceil(sum(node_load5 ) by (instance) / count(count(node_cpu_seconds_total) by(cpu,instance)) by (instance)*100) >150
    for: 1s
    labels:
      severity: warning
    annotations:
      summary: "负载使用过高"
      # The message previously said the load exceeded 100 while the rule
      # only fires above 150; the text now states the real threshold.
      description: "{{$labels.instance}}在5分钟平均负载率超过150%,当前负载率为{{ printf \"%.0f\" $value }}%"
      value: "{{ $value }}"
# Rule group: established TCP connection count alert.
- name: TCP会话告警
  rules:
  # Fires when the number of currently established TCP connections
  # reported by node_netstat_Tcp_CurrEstab is above 1000.
  - alert: TCP会话
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 1s
    labels:
      severity: warning
    annotations:
      summary: "TCP_ESTABLISHED过高！"
      # The value is a connection count, not a percentage; the stray "%"
      # signs after the threshold and the current value were removed.
      description: "{{ $labels.instance }},TCP_ESTABLISHED大于1000(目前使用:{{$value}})"
# Rule group: MySQL query-rate alert.
- name: Mysql_High_QPS告警
  rules:
  # Fires when the 5-minute average per-second rate of
  # mysql_global_status_questions stays above 500 for two minutes.
  - alert: Mysql_High_QPS
    expr: >-
      rate(mysql_global_status_questions[5m]) > 500
    for: 2m
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads $labels.name; confirm the series
      # carry a "name" label.
      summary: "{{$labels.name}}: Mysql_High_QPS 检测"
      description: "{{$labels.instance}}: Mysql操作速度超过每秒500次 ,(当前值: {{ $value }})"
# Rule group: MySQL replication SQL-thread alert.
- name: SQL 线程告警
  rules:
  # Fires when mysql_slave_status_slave_sql_running has been anything other
  # than 1 for one minute.
  - alert: "SQL thread stopped"
    expr: 'mysql_slave_status_slave_sql_running != 1'
    for: 1m
    labels:
      severity: warning
    annotations:
      description: " This is usually because it cannot apply a SQL statement received from the master."
      summary: "Instance {{ $labels.instance }} SQL 线程已停止"
# Rule group: MySQL open-connection alert.
- name: mysql连接数告警
  rules:
  # mysql_global_status_threads_connected is a gauge (the current number of
  # open connections). The rule previously wrapped it in rate(), which is
  # only meaningful for counters and measured how fast the gauge was
  # changing instead of how many connections exist. The gauge is now
  # compared to the threshold directly; `for: 2m` still filters short spikes.
  - alert: Mysql_Too_Many_Connections
    expr: mysql_global_status_threads_connected > 200
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}: 检测到Mysql连接太多"
      # "per second" was dropped: the value is a connection count, not a rate.
      description: "{{$labels.instance}}: Mysql Connections is more than 200 ,(current value is: {{ $value }})"
# Rule group: Elasticsearch cluster health alert.
- name: ES集群状态告警
  rules:
  # Fires when the color="green" health-status series (group="es") is not 1.
  # NOTE(review): presumably the exporter sets the series for the current
  # colour to 1 and the others to 0 — confirm against the exporter in use.
  - alert: ES集群状态
    expr: elasticsearch_cluster_health_status{color="green",group="es"} != 1
    for: 1s
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.name }}:分片异常"
      description: "{{ $labels.instance }},主分片和副本分片异常 (目前使用:{{$value}})"
# Rule group: Elasticsearch unassigned-shard alert.
- name: ES副本告警
  rules:
  # Fires when the cluster reports any unassigned shards (group="es").
  - alert: ES副本分片丢失
    expr: elasticsearch_cluster_health_unassigned_shards{group="es"} != 0
    for: 1s
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.name }}:ES副本分片丢失"
      # The previous text described *relocating* shards (shards migrating
      # between nodes), which is a different metric. It now describes the
      # unassigned-shard count that the expression actually evaluates.
      description: "{{ $labels.instance }},集群中存在未分配的分片，正常情况下该值为0 (当前未分配分片数:{{$value}})"
# Rule group: Elasticsearch process CPU alert.
- name: ES CPU使用率告警
  rules:
  # Fires when elasticsearch_process_cpu_percent (group="es") is above 50.
  - alert: ES CPU使用率
    expr: elasticsearch_process_cpu_percent{group="es"} > 50
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads $labels.name; confirm the series
      # carry a "name" label.
      summary: "{{ $labels.name }}:ES CPU使用率异常"
      description: "{{ $labels.instance }},ES CPU使用率大于百分之50% (目前使用:{{$value}})"