# Rule group: read IOPS on the system disk.
- name: IOPS告警
  rules:
    # Fires when the per-second rate of completed reads on device "vda"
    # (series labelled group="linux"), taken over a 1m window, exceeds 400.
    - alert: 系统磁盘读IOPS告警
      expr: >-
        irate(node_disk_reads_completed_total{group="linux",device="vda"}[1m]) > 400
      # The condition only has to hold for one second before the alert fires.
      for: 1s
      labels:
        severity: warning
      annotations:
        # NOTE(review): the summary reads $labels.name; confirm the series
        # really carry a `name` label, otherwise it renders empty.
        summary: '{{ $labels.name }}: 系统磁盘读IOPS告警'
        description: '{{ $labels.instance }},系统磁盘读IOPS大于400 当前值为: {{ $value }}'
    # Rule group: disk I/O utilisation per instance.
    - name: IO使用率过高
      rules:
        # Fires when the average fraction of time the disks of an instance
        # spend doing I/O, expressed as a percentage, is above 60.
        #
        # The previous expression was `100 - utilisation < 60`, which fired
        # at utilisation > 40% and reported the *idle* share as $value,
        # contradicting the "大于60%" wording of the description below.
        - alert: IO性能
          expr: >-
            avg by (instance) (irate(node_disk_io_time_seconds_total[1m])) * 100 > 60
          # The condition only has to hold for one second before firing.
          for: 1s
          labels:
            severity: warning
          annotations:
            # NOTE(review): the summary reads $labels.name, but the expression
            # aggregates by (instance) only, so `name` is not present on the
            # resulting series and will render empty.
            summary: "{{ $labels.name }}: 流入磁盘IO使用率过高!"
            description: "{{ $labels.instance }},流入磁盘IO大于60% 当前值为: {{ $value }}"
    # Rule group: system load relative to CPU count.
    - name: 负载使用率告警
      rules:
        # Load ratio = 5-minute load average divided by the number of CPUs
        # of the instance, times 100 and rounded up. Fires above 150.
        - alert: "node_cpu"
          expr: >-
            ceil(
              sum by (instance) (node_load5)
              /
              count by (instance) (count by (cpu, instance) (node_cpu_seconds_total))
              * 100
            ) > 150
          # The condition only has to hold for one second before firing.
          for: 1s
          labels:
            severity: warning
          annotations:
            summary: "负载使用过高"
            # The message previously said "超过100" although the expression
            # threshold is 150; the text now matches the expression.
            description: '{{$labels.instance}}在5分钟平均负载率超过150%,当前负载率为{{ printf "%.0f" $value }}%'
            value: "{{ $value }}"
    # Rule group: established TCP sessions.
    - name: TCP会话告警
      rules:
        # Fires when node_netstat_Tcp_CurrEstab is above 1000.
        - alert: TCP会话
          expr: node_netstat_Tcp_CurrEstab > 1000
          # The condition only has to hold for one second before firing.
          for: 1s
          labels:
            severity: warning
          annotations:
            summary: "TCP_ESTABLISHED过高!"
            # The expression compares an absolute value with 1000, so the
            # "%" signs the message used to carry were the wrong unit.
            description: "{{ $labels.instance }},TCP_ESTABLISHED大于1000(当前连接数:{{$value}})"
    # Rule group: MySQL query throughput.
    - name: Mysql_High_QPS告警
      rules:
        # Fires when the 5m per-second rate of mysql_global_status_questions
        # stays above 500 for two minutes.
        - alert: Mysql_High_QPS
          expr: 'rate(mysql_global_status_questions[5m]) > 500'
          for: 2m
          annotations:
            # NOTE(review): the summary reads $labels.name; confirm the series
            # really carry a `name` label, otherwise it renders empty.
            summary: '{{$labels.name}}: Mysql_High_QPS 检测'
            description: '{{$labels.instance}}: Mysql操作速度超过每秒500次 ,(当前值: {{ $value }})'
          labels:
            severity: warning
    # Rule group: MySQL replication SQL thread.
    - name: SQL 线程告警
      rules:
        # Fires when mysql_slave_status_slave_sql_running has been anything
        # other than 1 for one minute.
        - alert: SQL thread stopped
          expr: 'mysql_slave_status_slave_sql_running != 1'
          for: 1m
          annotations:
            summary: 'Instance {{ $labels.instance }} SQL 线程已停止'
            # The leading space is part of the original message and is kept.
            description: ' This is usually because it cannot apply a SQL statement received from the master.'
          labels:
            severity: warning
    # Rule group: MySQL connection count.
    - name: mysql连接数告警
      rules:
        # Fires when the number of currently connected threads stays above
        # 200 for two minutes.
        #
        # mysql_global_status_threads_connected is a gauge (the current
        # number of open connections), so it is compared directly. The
        # previous rate(...[5m]) measured how fast the gauge was changing,
        # not how many connections exist, and rate() is meant for counters.
        - alert: Mysql_Too_Many_Connections
          expr: mysql_global_status_threads_connected > 200
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "{{$labels.instance}}: 检测到Mysql连接太多"
            # "per second" removed: the value is a connection count, not a rate.
            description: "{{$labels.instance}}: Mysql Connections is more than 200 ,(current value is: {{ $value }})"
    # Rule group: Elasticsearch cluster health colour.
    - name: ES集群状态告警
      rules:
        # Fires when the color="green" health-status series (group="es")
        # is not equal to 1.
        # NOTE(review): presumably the exporter sets the series of the active
        # colour to 1 and the others to 0 -- confirm against the exporter.
        - alert: ES集群状态
          expr: 'elasticsearch_cluster_health_status{color="green",group="es"} != 1'
          # The condition only has to hold for one second before firing.
          for: 1s
          labels:
            severity: warning
          annotations:
            # NOTE(review): the summary reads $labels.name; confirm the series
            # really carry a `name` label, otherwise it renders empty.
            summary: '{{ $labels.name }}:分片异常'
            description: '{{ $labels.instance }},主分片和副本分片异常 (目前使用:{{$value}})'
    # Rule group: Elasticsearch unassigned shards.
    - name: ES副本告警
      rules:
        # Fires when the cluster (group="es") reports any unassigned shards.
        - alert: ES副本分片丢失
          expr: elasticsearch_cluster_health_unassigned_shards{group="es"} != 0
          # The condition only has to hold for one second before firing.
          for: 1s
          labels:
            severity: warning
          annotations:
            # NOTE(review): the summary reads $labels.name; confirm the series
            # really carry a `name` label, otherwise it renders empty.
            summary: "{{ $labels.name }}:ES副本分片丢失"
            # The previous text described shards being relocated between
            # nodes, but the expression watches *unassigned* shards; the
            # message now describes the metric that is actually evaluated.
            description: "{{ $labels.instance }},集群中存在未分配的分片,通常为0 (当前未分配分片数:{{$value}})"
    # Rule group: Elasticsearch process CPU usage.
    - name: ES CPU使用率告警
      rules:
        # Fires when elasticsearch_process_cpu_percent (group="es") is above 50.
        - alert: ES CPU使用率
          expr: 'elasticsearch_process_cpu_percent{group="es"} > 50'
          # The condition only has to hold for one second before firing.
          for: 1s
          labels:
            severity: warning
          annotations:
            # NOTE(review): the summary reads $labels.name; confirm the series
            # really carry a `name` label, otherwise it renders empty.
            summary: '{{ $labels.name }}:ES CPU使用率异常'
            description: '{{ $labels.instance }},ES CPU使用率大于百分之50% (目前使用:{{$value}})'