# Prometheus alerting rule groups.
# NOTE(review): these entries are assumed to sit under a `groups:` key that is
# not visible in this chunk - confirm the header exists in the full file.
# NOTE(review): several rules use `for: 1s`; confirm that such a short hold
# time is intended, since it gives almost no damping against brief spikes.

# Read IOPS on the system disk (device vda) of hosts labelled group="linux".
- name: IOPS告警
  rules:
    - alert: 系统磁盘读IOPS告警
      expr: irate(node_disk_reads_completed_total{group="linux",device="vda"}[1m]) > 400
      for: 1s
      annotations:
        summary: "{{ $labels.name }}: 系统磁盘读IOPS告警"
        description: "{{ $labels.instance }},系统磁盘读IOPS大于400 当前值为: {{ $value }}"
      labels:
        severity: warning

# Disk I/O utilisation per instance, averaged over all devices.
- name: IO使用率过高
  rules:
    - alert: IO性能
      # Fires when utilisation exceeds 60%. The previous expression computed
      # `100 - utilisation` and compared it with `< 60`, which fired at >40%
      # and reported the idle share as the "current value".
      expr: avg by (instance) (irate(node_disk_io_time_seconds_total[1m])) * 100 > 60
      for: 1s
      annotations:
        summary: "{{ $labels.name }}: 流入磁盘IO使用率过高!"
        description: "{{ $labels.instance }},流入磁盘IO大于60% 当前值为: {{ $value }}"
      labels:
        severity: warning

# 5-minute load average as a percentage of the number of CPUs per instance.
- name: 负载使用率告警
  rules:
    - alert: "node_cpu"
      expr: ceil(sum(node_load5 ) by (instance) / count(count(node_cpu_seconds_total) by(cpu,instance)) by (instance)*100) >150
      for: 1s
      annotations:
        summary: "负载使用过高"
        # Text now states the 150% threshold that the expression enforces.
        description: "{{$labels.instance}}在5分钟平均负载率超过150%,当前负载率为{{ printf \"%.0f\" $value }}%"
        value: "{{ $value }}"
      labels:
        severity: warning

# Number of currently established TCP connections.
- name: TCP会话告警
  rules:
    - alert: TCP会话
      expr: node_netstat_Tcp_CurrEstab > 1000
      for: 1s
      annotations:
        summary: "TCP_ESTABLISHED过高!"
        # The metric is a connection count, so no percent signs in the text.
        description: "{{ $labels.instance }},TCP_ESTABLISHED大于1000(目前使用:{{$value}})"
      labels:
        severity: warning

# MySQL statement rate (questions per second over 5 minutes).
- name: Mysql_High_QPS告警
  rules:
    - alert: Mysql_High_QPS
      expr: rate(mysql_global_status_questions[5m]) > 500
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "{{$labels.name}}: Mysql_High_QPS 检测"
        description: "{{$labels.instance}}: Mysql操作速度超过每秒500次 ,(当前值: {{ $value }})"

# MySQL replication: SQL thread not running on a replica.
- name: SQL 线程告警
  rules:
    - alert: SQL thread stopped
      expr: mysql_slave_status_slave_sql_running != 1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} SQL 线程已停止"
        description: " This is usually because it cannot apply a SQL statement received from the master."

# MySQL open client connections.
- name: mysql连接数告警
  rules:
    - alert: Mysql_Too_Many_Connections
      # threads_connected is a gauge (current connection count), so it is
      # compared directly; rate() is only meaningful on counters.
      expr: mysql_global_status_threads_connected > 200
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "{{$labels.instance}}: 检测到Mysql连接太多"
        description: "{{$labels.instance}}: Mysql Connections is more than 200 ,(current value is: {{ $value }})"

# Elasticsearch cluster health: the color="green" series is not 1.
- name: ES集群状态告警
  rules:
    - alert: ES集群状态
      expr: elasticsearch_cluster_health_status{color="green",group="es"} != 1
      for: 1s
      annotations:
        summary: "{{ $labels.name }}:分片异常"
        description: "{{ $labels.instance }},主分片和副本分片异常 (目前使用:{{$value}})"
      labels:
        severity: warning

# Elasticsearch unassigned shards.
- name: ES副本告警
  rules:
    - alert: ES副本分片丢失
      expr: elasticsearch_cluster_health_unassigned_shards{group="es"} != 0
      for: 1s
      annotations:
        summary: "{{ $labels.name }}:ES副本分片丢失"
        # NOTE(review): the description talks about shards being migrated to
        # other nodes, while the metric is the unassigned-shard count - confirm
        # which wording is intended.
        description: "{{ $labels.instance }},当前节点正在迁移到其他节点的分片数量,通常为0,集群中有节点新加入或者退出时该值会增加 (目前使用:{{$value}})"
      labels:
        severity: warning

# Elasticsearch process CPU usage.
- name: ES CPU使用率告警
  rules:
    - alert: ES CPU使用率
      expr: elasticsearch_process_cpu_percent{group="es"} > 50
      for: 1s
      annotations:
        summary: "{{ $labels.name }}:ES CPU使用率异常"
        description: "{{ $labels.instance }},ES CPU使用率大于百分之50% (目前使用:{{$value}})"
      labels:
        severity: warning