# Rule group: read-IOPS alert for the system disk.
- name: IOPS告警
  rules:
  # Fires when the per-second rate of completed reads (taken from the last two
  # samples in a 1m window) on device "vda" of series labelled group="linux"
  # is above 400.
  - alert: 系统磁盘读IOPS告警
    expr: 'irate(node_disk_reads_completed_total{group="linux",device="vda"}[1m]) > 400'
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads a `name` label; confirm the scraped
      # series actually carry it, otherwise it renders as an empty string.
      summary: '{{ $labels.name }}: 系统磁盘读IOPS告警'
      description: '{{ $labels.instance }},系统磁盘读IOPS大于400 当前值为: {{ $value }}'
# Rule group: disk IO utilisation alert, averaged per instance.
- name: IO使用率过高
  rules:
  - alert: IO性能
    # node_disk_io_time_seconds_total accumulates seconds spent doing IO, so
    # its per-second rate is the busy fraction (0-1); x100 gives utilisation %.
    # The busy percentage is compared directly against 60 so that both the
    # trigger point and {{ $value }} agree with the "大于60%" wording in the
    # description (a "100 - busy < 60" form would fire at 40% utilisation and
    # report the idle percentage instead).
    expr: avg by (instance) (irate(node_disk_io_time_seconds_total[1m])) * 100 > 60
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      # Only the `instance` label survives `avg by (instance)`, so the summary
      # uses it rather than a label that the aggregation has dropped.
      summary: "{{ $labels.instance }}: 流入磁盘IO使用率过高!"
      description: "{{ $labels.instance }},流入磁盘IO大于60% 当前值为: {{ $value }}"
# Rule group: 5-minute load average relative to CPU count, per instance.
- name: 负载使用率告警
  rules:
  - alert: "node_cpu"
    # Load ratio in percent: node_load5 divided by the number of CPUs on the
    # instance (the inner count collapses node_cpu_seconds_total to one series
    # per cpu, the outer count tallies them), times 100, rounded up.
    # Fires above 150%.
    expr: >-
      ceil(
      sum by (instance) (node_load5)
      / count by (instance) (count by (cpu, instance) (node_cpu_seconds_total))
      * 100
      ) > 150
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      summary: "负载使用过高"
      # The threshold quoted here must match the 150 used in expr.
      description: '{{$labels.instance}}在5分钟平均负载超过150%,当前负载率为{{ printf "%.0f" $value }}%'
      value: "{{ $value }}"
# Rule group: number of established TCP connections.
- name: TCP会话告警
  rules:
  # Fires when node_netstat_Tcp_CurrEstab is above 1000.
  - alert: TCP会话
    expr: node_netstat_Tcp_CurrEstab > 1000
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      summary: "TCP_ESTABLISHED过高!"
      # The compared value is a raw connection count, so the message carries
      # no percent sign.
      description: "{{ $labels.instance }},TCP_ESTABLISHED大于1000(目前使用:{{$value}})"
# Rule group: MySQL query throughput.
- name: Mysql_High_QPS告警
  rules:
  # Fires when the per-second rate of mysql_global_status_questions, averaged
  # over a 5m window, has stayed above 500 for two minutes.
  - alert: Mysql_High_QPS
    expr: 'rate(mysql_global_status_questions[5m]) > 500'
    for: 2m
    annotations:
      # NOTE(review): the summary reads a `name` label; confirm the scraped
      # series actually carry it, otherwise it renders as an empty string.
      summary: '{{$labels.name}}: Mysql_High_QPS 检测'
      description: '{{$labels.instance}}: Mysql操作速度超过每秒500次 ,(当前值: {{ $value }})'
    labels:
      severity: warning
# Rule group: MySQL replication SQL thread state.
- name: SQL 线程告警
  rules:
  # Fires when mysql_slave_status_slave_sql_running has been anything other
  # than 1 for one minute.
  - alert: SQL thread stopped
    expr: 'mysql_slave_status_slave_sql_running != 1'
    for: 1m
    annotations:
      summary: 'Instance {{ $labels.instance }} SQL 线程已停止'
      description: ' This is usually because it cannot apply a SQL statement received from the master.'
    labels:
      severity: warning
# Rule group: MySQL open-connection count.
- name: mysql连接数告警
  rules:
  - alert: Mysql_Too_Many_Connections
    # mysql_global_status_threads_connected is a gauge (the current number of
    # open connections). rate() is only meaningful on counters: on a gauge it
    # treats every decrease as a counter reset and yields a meaningless
    # per-second figure. Compare the gauge itself against the limit instead.
    expr: mysql_global_status_threads_connected > 200
    # The connection count must stay above the limit for two minutes.
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}: 检测到Mysql连接太多"
      # $value is a connection count, not a per-second rate.
      description: "{{$labels.instance}}: Mysql Connections is more than 200 ,(current value is: {{ $value }})"
# Rule group: Elasticsearch cluster health colour.
- name: ES集群状态告警
  rules:
  # Fires when the color="green" health-status series (group="es") is not 1.
  - alert: ES集群状态
    expr: 'elasticsearch_cluster_health_status{color="green",group="es"} != 1'
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads a `name` label; confirm the scraped
      # series actually carry it, otherwise it renders as an empty string.
      summary: '{{ $labels.name }}:分片异常'
      description: '{{ $labels.instance }},主分片和副本分片异常 (目前使用:{{$value}})'
# Rule group: Elasticsearch unassigned shards.
- name: ES副本告警
  rules:
  # Fires when elasticsearch_cluster_health_unassigned_shards (group="es") is
  # non-zero.
  - alert: ES副本分片丢失
    expr: elasticsearch_cluster_health_unassigned_shards{group="es"} != 0
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads a `name` label; confirm the scraped
      # series actually carry it, otherwise it renders as an empty string.
      summary: "{{ $labels.name }}:ES副本分片丢失"
      # The text describes unassigned shards, which is what expr measures
      # (not relocating shards).
      description: "{{ $labels.instance }},集群中存在未分配的分片,该值通常为0,有节点退出或副本无法分配时会增加 (目前使用:{{$value}})"
# Rule group: Elasticsearch process CPU usage.
- name: ES CPU使用率告警
  rules:
  # Fires when elasticsearch_process_cpu_percent (group="es") is above 50.
  - alert: ES CPU使用率
    expr: 'elasticsearch_process_cpu_percent{group="es"} > 50'
    # Pending duration before the alert moves to firing.
    for: 1s
    labels:
      severity: warning
    annotations:
      # NOTE(review): the summary reads a `name` label; confirm the scraped
      # series actually carry it, otherwise it renders as an empty string.
      summary: '{{ $labels.name }}:ES CPU使用率异常'
      description: '{{ $labels.instance }},ES CPU使用率大于百分之50% (目前使用:{{$value}})'