主机:

    # Host-level alerting rules in Prometheus rule-file format.
    # Every rule attaches the label level=3; summary/description are Go templates
    # rendered by Prometheus when the alert fires.
    groups:
    - name: Instance
      rules:
      # Targets labelled type=~"machine" whose `up` series has been 0 for 1m.
      - alert: instance连接超时
        expr: up{type=~"machine"} == 0
        for: 1m
        labels:
          level: 3
        annotations:
          summary: "Instance连接超时"
          description: "{{ $labels.instance }} exporter连接超时一分钟"
      # Busy percentage derived from the idle-mode irate (5m window) of each
      # node_cpu_seconds_total series in group=~"VAS"; fires above 75.
      - alert: CPU告警
        expr: round(100 - ((irate(node_cpu_seconds_total{group=~"VAS",mode="idle"}[5m])) * 100)) > 75
        for: 1m
        labels:
          level: 3
        annotations:
          summary: "CPU使用率告警"
          description: "CPU当前使用率:{{ $value }}%"
      # Context switches per second divided by the number of idle-mode CPU series
      # (one per CPU); fires above 5000.
      - alert: CPU告警
        expr: round((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}))) > 5000
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "CPU上下文切换告警"
          description: "CPU上下文切换超过5000/s:当前{{ $value }}/s"
      # Used-memory percentage, (1 - MemAvailable / MemTotal) * 100, for group DMP.
      - alert: 内存告警
        expr: round((1 - (node_memory_MemAvailable_bytes{group=~"DMP"} / (node_memory_MemTotal_bytes{group="DMP"})))* 100) > 85
        for: 30s
        labels:
          level: 3
        annotations:
          summary: "内存使用率告警"
          description: "内存当前占用:{{ $value }}%"
      # Per-second rate of node_vmstat_pgmajfault over 1m above 1000.
      - alert: 内存告警
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          level: 3
        annotations:
          summary: "内存页面错误告警"
          description: "一级页面的平均错误数 {{ $value }}"
      # Summed receive byte rate per instance/group/project/cloud, divided by
      # 1024*1024; fires above 100.
      - alert: 网络告警
        expr: sum by (instance,group,project,cloud)(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "网络接口流入流量异常"
          description: "网络接口流入流量异常 流量 {{ $value }}"
      # Same computation as the rule above, for transmitted bytes.
      - alert: 网络告警
        expr: sum by (instance,group,project,cloud)(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "网络接口流出流量异常"
          description: "网络接口流出流量异常 流量 {{ $value }}"
      # Any increase of the receive-error counter within a 5m window.
      - alert: 网络告警
        expr: increase(node_network_receive_errs_total[5m]) > 0
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "网络接收错误"
          description: "网络接收错误 {{ $value }}"
      # Summed disk read byte rate per instance/group/project/cloud, divided by
      # 1024*1024; fires above 50.
      - alert: 磁盘告警
        expr: sum by (instance,group,project,cloud) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "磁盘读取异常"
          description: "磁盘每秒读取数据 > 50MB/S 当前{{ $value }}/s"
      # Same computation as the rule above, for written bytes.
      - alert: 磁盘告警
        expr: sum by (instance,group,project,cloud) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "磁盘写入异常"
          description: "磁盘每秒写入数据 > 50MB/S 当前{{ $value }}/s"
      # Linear extrapolation of free bytes (1h of samples, 4 * 3600 s ahead) on
      # non-tmpfs filesystems drops below zero.
      - alert: 磁盘告警
        expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
        for: 2m
        labels:
          level: 3
        annotations:
          summary: "磁盘空间告警"
          description: "磁盘空间预计将在4小时内写满"
      # Inode usage percentage on ext3/ext4/xfs filesystems above 85.
      - alert: 磁盘告警
        expr: (1 - (node_filesystem_files_free{fstype=~"ext3|ext4|xfs"} / node_filesystem_files{fstype=~"ext3|ext4|xfs"})) * 100 > 85
        for: 2m
        labels:
          level: 3
        annotations:
          summary: "磁盘inodes告警"
          description: "磁盘inodes空间不足,当前使用率{{ $value }}%"
      # Average time per completed read, in seconds (0.1 == 100ms); the `and`
      # clause drops devices with no completed reads in the window.
      - alert: 磁盘告警
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "磁盘延迟告警"
          # $value is in seconds, so humanizeDuration prints it with the right unit.
          description: "磁盘读取延迟超过100ms 当前{{ $value | humanizeDuration }}"
      # Average time per completed write, in seconds. node_exporter names the
      # completion counter node_disk_writes_completed_total (plural "writes").
      - alert: 磁盘告警
        expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
        for: 5m
        labels:
          level: 3
        annotations:
          summary: "磁盘延迟告警"
          # $value is in seconds, so humanizeDuration prints it with the right unit.
          description: "磁盘写入延迟超过100ms 当前{{ $value | humanizeDuration }}"

    Redis

    # Redis alerting rules in Prometheus rule-file format.
    # Every rule attaches the label level=3.
    groups:
      - name: redis告警
        rules:
        # Fires immediately (for: 0m) when redis_up is 0.
        - alert: Redis is Down
          expr: redis_up == 0
          for: 0m
          labels:
            level: 3
          annotations:
            summary: Redis is down
            description: Redis instance is down
        # Fewer than one series with role="master" per
        # cloud/instance/group/project/env; `or vector(0)` supplies a 0 sample
        # when the count returns nothing at all.
        - alert: redis缺少主节点
          expr: (count by(cloud,instance,group,project,env) (redis_instance_info{role="master",type="redis"}) or vector(0)) < 1
          for: 0m
          labels:
            level: 3
          annotations:
            summary: redis缺少主节点
            description: Redis cluster has no node marked as master
        # More than one series with role="master" in the same grouping.
        - alert: RedisTooManyMasters
          expr: count by(cloud,instance,group,project,env)(redis_instance_info{role="master",type="redis"}) > 1
          for: 0m
          labels:
            level: 3
          annotations:
            summary: Redis too many masters
            description: Redis cluster has too many nodes marked as master
        # redis_connected_slaves decreased over the last minute.
        # NOTE(review): the negative 1m delta has to hold for the whole 5m `for`
        # window before this fires — confirm that duration is intended.
        - alert: Redis丢失Slave
          expr: delta(redis_connected_slaves[1m]) < 0
          for: 5m
          labels:
            level: 3
          annotations:
            summary: Redis丢失Slave
            description: Redis instance lost a slave
        # redis_connected_slaves changed value more than twice within 5m.
        - alert: RedisClusterFlapping
          expr: changes(redis_connected_slaves[5m]) > 2
          for: 5m
          labels:
            level: 3
          annotations:
            summary: Redis cluster flapping
            description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)
        # The rejected-connections counter increased within the last minute.
        - alert: Redis拒绝连接
          expr: increase(redis_rejected_connections_total[1m]) > 0
          for: 5m
          labels:
            level: 3
          annotations:
            summary: Redis拒绝连接
            description: Some connections to Redis has been rejected
    

    Elasticsearch

    # Elasticsearch alerting rules in Prometheus rule-file format.
    # Every rule attaches the label level=3.
    groups:
    - name: elasticsearch告警
      rules:
      # JVM heap used / max, as a percentage, above 90 for 2m.
      - alert: ElasticsearchHeapUsageTooHigh
        expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
        for: 2m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})
          description: "The heap usage is over 90%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Same ratio with the lower threshold of 80; uses the `level` label key
      # like every other rule in this file.
      - alert: ElasticsearchHeapUsageWarning
        expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
        for: 2m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch Heap Usage warning (instance {{ $labels.instance }})
          description: "The heap usage is over 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Available / total data-filesystem bytes, as a percentage, below 10.
      - alert: ElasticsearchDiskOutOfSpace
        expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch disk out of space (instance {{ $labels.instance }})
          description: "The disk usage is over 90%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Health-status series with color="red" equals 1.
      - alert: ElasticsearchClusterRed
        expr: elasticsearch_cluster_health_status{color="red"} == 1
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch Cluster Red (instance {{ $labels.instance }})
          description: "Elastic Cluster Red status\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Health-status series with color="yellow" equals 1.
      - alert: ElasticsearchClusterYellow
        expr: elasticsearch_cluster_health_status{color="yellow"} == 1
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
          description: "Elastic Cluster Yellow status\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Relocating-shards gauge above 0; fires immediately.
      - alert: ElasticsearchRelocatingShards
        expr: elasticsearch_cluster_health_relocating_shards > 0
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch relocating shards (instance {{ $labels.instance }})
          description: "Elasticsearch is relocating shards\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Same condition as the rule above, sustained for 15m.
      - alert: ElasticsearchRelocatingShardsTooLong
        expr: elasticsearch_cluster_health_relocating_shards > 0
        for: 15m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch relocating shards too long (instance {{ $labels.instance }})
          description: "Elasticsearch has been relocating shards for 15min\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Initializing-shards gauge above 0; fires immediately.
      - alert: ElasticsearchInitializingShards
        expr: elasticsearch_cluster_health_initializing_shards > 0
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch initializing shards (instance {{ $labels.instance }})
          description: "Elasticsearch is initializing shards\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Same condition as the rule above, sustained for 15m.
      - alert: ElasticsearchInitializingShardsTooLong
        expr: elasticsearch_cluster_health_initializing_shards > 0
        for: 15m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch initializing shards too long (instance {{ $labels.instance }})
          description: "Elasticsearch has been initializing shards for 15 min  VALUE = {{ $value }} LABELS: {{ $labels }}"
      # Unassigned-shards gauge above 0; fires immediately.
      - alert: ElasticsearchUnassignedShards
        expr: elasticsearch_cluster_health_unassigned_shards > 0
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch unassigned shards (instance {{ $labels.instance }})
          description: "Elasticsearch has unassigned shards\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Pending-tasks gauge above 0 for 15m.
      - alert: ElasticsearchPendingTasks
        expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
        for: 15m
        labels:
          level: 3
        annotations:
          summary: Elasticsearch pending tasks (instance {{ $labels.instance }})
          description: "Elasticsearch has pending tasks. Cluster works slowly.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    

    Kubernetes

    # Kubernetes alerting rules in Prometheus rule-file format.
    # Every rule attaches the label level=3.
    groups:
    - name: Kubernetes
      rules:
      # The Ready/status="true" condition series of a node is 0 for 10m.
      - alert: KubernetesNodeReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 10m
        labels:
          level: 3
        annotations:
          summary: Kubernetes Node ready (instance {{ $labels.instance }})
          description: Node {{ $labels.node }} has been unready for a long time
      # An OutOfDisk / MemoryPressure / DiskPressure condition series whose status
      # is not "false" equals 1.
      - alert: 集群节点内存或磁盘资源短缺
        expr: kube_node_status_condition{condition=~"OutOfDisk|MemoryPressure|DiskPressure",status!="false"} == 1
        for: 2m
        labels:
          level: 3
        annotations:
          summary: 集群节点内存或磁盘资源短缺 ({{ $labels.node }})
          description: 集群节点内存或磁盘资源短缺 ({{ $labels.node }})
      # Pods per node as a percentage of the node's allocatable pods, above 90.
      - alert: KubernetesNoode容量不足
        expr: sum(kube_pod_info) by (node,group,cloud,project,env) / sum(kube_node_status_allocatable_pods) by (node,group,cloud,project,env) * 100 > 90
        for: 2m
        labels:
          level: 3
        annotations:
          summary: KubernetesNoode容量不足
          description: KubernetesNoode容量不足 {{ $labels.node }}
      # Ready / current replica ratio of a StatefulSet in group=~"DMP" is not 1.
      - alert: 集群StatefulSet down
        expr: (kube_statefulset_status_replicas_ready{group=~"DMP"} / kube_statefulset_status_replicas_current{group=~"DMP"}) != 1
        for: 1m
        labels:
          level: 3
        annotations:
          summary: 集群StatefulSet down
          # kube_statefulset_* series identify the object through the
          # `statefulset` label; they carry no `node` label.
          description: 集群StatefulSet down {{ $labels.statefulset }}
      # Subquery over 1h: the pod has been in Pending/Unknown/Failed at every
      # evaluated step, i.e. the minimum of the per-pod sum stays above 0.
      - alert: KubernetesPodNotHealthy
        expr: min_over_time(sum by (namespace, pod,group,env,cloud,project) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
        for: 0m
        labels:
          level: 3
        annotations:
          summary: Kubernetes Pod not healthy
          description: Kubernetes Pod not healthy (instance {{ $labels.pod }})
      # Container restart counter increased by more than 3 within 1m.
      - alert: KubernetesPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
        for: 2m
        labels:
          level: 3
        annotations:
          summary: Kubernetes pod crash looping (pod {{ $labels.pod }})
          description: Pod {{ $labels.pod }} is crash looping
      # ReplicaSet spec replicas differ from ready replicas for 10m.
      - alert: KubernetesReplicassetMismatch
        expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
        for: 10m
        labels:
          level: 3
        annotations:
          summary: Kubernetes ReplicasSet mismatch
          description: Deployment Replicas mismatch {{ $labels }}
      # Deployment spec replicas differ from available replicas for 10m.
      - alert: KubernetesDeploymentReplicasMismatch
        expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        for: 10m
        labels:
          level: 3
        annotations:
          summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
          description: Deployment Replicas mismatch {{ $labels }}
      # StatefulSet ready replicas differ from its replica count for 10m.
      - alert: KubernetesStatefulsetReplicasMismatch
        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
        for: 10m
        labels:
          level: 3
        annotations:
          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
          description: A StatefulSet does not match the expected number of replicas VALUE = {{ $value }}
      # Observed generation differs from metadata generation for 10m.
      - alert: KubernetesDeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 10m
        labels:
          level: 3
        annotations:
          summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
          description: A Deployment has failed but has not been rolled back VALUE = {{ $value }} {{ $labels }}
      # Share of REST client requests with a 4xx/5xx code, per instance and job,
      # above 1 percent.
      - alert: KubernetesApiClientErrors
        expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
        for: 2m
        labels:
          level: 3
        annotations:
          summary: Kubernetes API client errors (instance {{ $labels.instance }})
          # Double-quoted so that \n is parsed as a newline rather than two literal characters.
          description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}"