Prometheus - Prometheus告警规则 - 《运维笔记》

主机：

# Host-level alerting rules.
groups:
- name: Instance
  rules:
  # Fires when a scrape target labelled type=~"machine" has reported up == 0
  # for one minute.
  - alert: instance连接超时
    expr: 'up{type=~"machine"} == 0'
    for: 1m
    labels:
      level: 3
    annotations:
      summary: 'Instance连接超时'
      description: '{{ $labels.instance }} exporter连接超时一分钟'
  # Host CPU utilisation in percent.
  # The idle rate is averaged over all cores of a host
  # (avg without (cpu, mode)), so the alert reflects whole-host usage and
  # fires once per host rather than once per busy core.
  # rate() is used instead of irate(): irate() only looks at the last two
  # samples, so a brief dip can reset the `for` timer of an alert.
  - alert: CPU告警
    expr: round(100 - (avg without (cpu, mode) (rate(node_cpu_seconds_total{group=~"VAS",mode="idle"}[5m])) * 100)) > 75
    for: 1m
    labels:
      level: 3
    annotations:
      summary: "CPU使用率告警"
      description: "CPU当前使用率：{{ $value }}%"
  # Context switches per second, normalised by the number of idle-mode CPU
  # series on the host (presumably one series per core), above 5000 for 5m.
  - alert: CPU告警
    expr: 'round((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}))) > 5000'
    for: 5m
    labels:
      level: 3
    annotations:
      summary: 'CPU上下文切换告警'
      description: 'CPU上下文切换超过5000/s：当前{{ $value }}/s'
  # Memory utilisation in percent, computed as 1 - MemAvailable / MemTotal
  # for series labelled group "DMP"; threshold 85% held for 30s.
  - alert: 内存告警
    expr: 'round((1 - (node_memory_MemAvailable_bytes{group=~"DMP"} / (node_memory_MemTotal_bytes{group="DMP"})))* 100) > 85'
    for: 30s
    labels:
      level: 3
    annotations:
      summary: '内存使用率告警'
      description: '内存当前占用：{{ $value }}%'
  # Per-second rate of node_vmstat_pgmajfault over the last minute above 1000,
  # held for 2m.
  - alert: 内存告警
    expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
    for: 2m
    labels:
      level: 3
    annotations:
      summary: '内存页面错误告警'
      description: '一级页面的平均错误数 {{ $value }}'
  # Inbound traffic summed per (instance, group, project, cloud); the byte
  # rate is divided by 1024 twice before being compared with 100.
  - alert: 网络告警
    expr: 'sum by (instance,group,project,cloud)(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
    for: 5m
    labels:
      level: 3
    annotations:
      summary: '网络接口流入流量异常'
      description: '网络接口流入流量异常 流量 {{ $value }}'
  # Outbound counterpart of the rule above, same grouping and threshold.
  - alert: 网络告警
    expr: 'sum by (instance,group,project,cloud)(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
    for: 5m
    labels:
      level: 3
    annotations:
      summary: '网络接口流出流量异常'
      description: '网络接口流出流量异常 流量 {{ $value }}'
  # Any increase of the receive-error counter within 5m, sustained for 5m.
  - alert: 网络告警
    expr: 'increase(node_network_receive_errs_total[5m]) > 0'
    for: 5m
    labels:
      level: 3
    annotations:
      summary: '网络接收错误'
      description: '网络接收错误 {{ $value }}'
  # Disk read throughput summed per (instance, group, project, cloud); the
  # byte rate is divided by 1024 twice before being compared with 50.
  - alert: 磁盘告警
    expr: 'sum by (instance,group,project,cloud) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
    for: 5m
    labels:
      level: 3
    annotations:
      summary: '磁盘读取异常'
      description: '磁盘每秒读取数据 > 50MB/S 当前{{ $value }}/s'
  # Disk write throughput, same grouping and threshold as the read rule.
  - alert: 磁盘告警
    expr: 'sum by (instance,group,project,cloud) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'
    for: 5m
    labels:
      level: 3
    annotations:
      summary: '磁盘写入异常'
      description: '磁盘每秒写入数据 > 50MB/S 当前{{ $value }}/s'
  # Linear extrapolation of free bytes over the last hour, projected
  # 4 * 3600 seconds ahead; tmpfs filesystems are excluded.
  - alert: 磁盘告警
    expr: 'predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0'
    for: 2m
    labels:
      level: 3
    annotations:
      summary: '磁盘空间告警'
      description: '磁盘空间预计将在4小时内写满'
  # Inode utilisation in percent on ext3/ext4/xfs filesystems.
  - alert: 磁盘告警
    expr: '(1 - (node_filesystem_files_free{fstype=~"ext3|ext4|xfs"} / node_filesystem_files{fstype=~"ext3|ext4|xfs"})) * 100 > 85'
    for: 2m
    labels:
      level: 3
    annotations:
      summary: '磁盘inodes告警'
      description: '磁盘inodes空间不足，当前使用率{{ $value }}%'
  # Average time per completed read: read time (seconds) divided by completed
  # reads, above 0.1 s (100 ms). The `and ... > 0` clause restricts the alert
  # to devices that actually completed reads in the window.
  # $value is in seconds, so it is rendered through humanizeDuration (e.g.
  # 0.15 -> "150ms") instead of being suffixed with a hard-coded "ms".
  - alert: 磁盘告警
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 5m
    labels:
      level: 3
    annotations:
      summary: "磁盘延迟告警"
      description: "磁盘读取延迟超过100ms 当前{{ $value | humanizeDuration }}"
  # Average time per completed write: write time (seconds) divided by
  # completed writes, above 0.1 s (100 ms).
  # The counter is node_disk_writes_completed_total ("writes", plural); the
  # previous spelling node_disk_write_completed_total matches no series, so
  # the rule could never fire.
  # $value is in seconds, so it is rendered through humanizeDuration (e.g.
  # 0.15 -> "150ms") instead of being suffixed with a hard-coded "ms".
  - alert: 磁盘告警
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
    for: 5m
    labels:
      level: 3
    annotations:
      summary: "磁盘延迟告警"
      description: "磁盘写入延迟超过100ms 当前{{ $value | humanizeDuration }}"

Redis

# Redis alerting rules.
groups:
  - name: redis告警
    rules:
    # Fires immediately (for: 0m) when redis_up is 0.
    - alert: Redis is Down
      expr: 'redis_up == 0'
      for: 0m
      labels:
        level: 3
      annotations:
        summary: 'Redis is down'
        description: 'Redis instance is down'
    # Fires for every (cloud, group, project, env) that has Redis instances
    # but none reporting role="master".
    # The previous form `(count by(...) (...) or vector(0)) < 1` always fired:
    # `or` adds the label-less vector(0) sample whenever no left-hand sample
    # has an identical (empty) label set, which is always the case once the
    # count is grouped by labels. `unless` expresses "group exists but has no
    # master" without that artefact.
    # `instance` is intentionally not part of the grouping; otherwise every
    # replica would count as its own master-less group.
    # NOTE(review): a group whose Redis targets are all unreachable produces
    # no redis_instance_info series and is not reported by this rule; the
    # redis_up based rule is assumed to cover that case.
    - alert: redis缺少主节点
      expr: >-
        count by (cloud,group,project,env) (redis_instance_info{type="redis"})
        unless
        count by (cloud,group,project,env) (redis_instance_info{role="master",type="redis"})
      for: 0m
      labels:
        level: 3
      annotations:
        summary: redis缺少主节点
        description: Redis cluster has no node marked as master
    # Fires when more than one instance in the same (cloud, group, project,
    # env) reports role="master".
    # `instance` was removed from the grouping: grouped per instance, the
    # count can never exceed 1, so the `> 1` condition was unreachable.
    # NOTE(review): assumes the remaining labels identify one Redis
    # replication group -- confirm against the target labelling.
    - alert: RedisTooManyMasters
      expr: count by(cloud,group,project,env)(redis_instance_info{role="master",type="redis"}) > 1
      for: 0m
      labels:
        level: 3
      annotations:
        summary: Redis too many masters
        description: Redis cluster has too many nodes marked as master
    # Fires when the number of connected replicas dropped within the last
    # minute.
    # delta(...[1m]) is negative only while the 1m window spans the drop, i.e.
    # for about one minute. With `for: 5m` the condition had to stay true for
    # five consecutive minutes, so a single lost replica never alerted.
    - alert: Redis丢失Slave
      expr: delta(redis_connected_slaves[1m]) < 0
      for: 0m
      labels:
        level: 3
      annotations:
        summary: Redis丢失Slave
        description: Redis instance lost a slave
    # More than two changes of redis_connected_slaves within 5m, held for 5m.
    - alert: RedisClusterFlapping
      expr: 'changes(redis_connected_slaves[5m]) > 2'
      for: 5m
      labels:
        level: 3
      annotations:
        summary: 'Redis cluster flapping'
        description: 'Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)'
    # Rejected-connection counter increased within the last minute, held
    # for 5m.
    - alert: Redis拒绝连接
      expr: 'increase(redis_rejected_connections_total[1m]) > 0'
      for: 5m
      labels:
        level: 3
      annotations:
        summary: 'Redis拒绝连接'
        description: 'Some connections to Redis has been rejected'

Elasticsearch

# Elasticsearch alerting rules.
groups:
- name: elasticsearch告警
  rules:
  # JVM heap used / heap max, in percent, above 90 for 2m.
  - alert: ElasticsearchHeapUsageTooHigh
    expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90'
    for: 2m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})'
      # Double quotes are required here so that \n is read as a newline.
      description: "The heap usage is over 90%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # JVM heap used / heap max, in percent, above 80 for 2m.
  # The label key is `level`, as on every other rule in this file; the
  # previous key `severity` meant that anything matching on `level` did not
  # see this alert.
  - alert: ElasticsearchHeapUsageWarning
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
    for: 2m
    labels:
      level: 3
    annotations:
      summary: Elasticsearch Heap Usage warning (instance {{ $labels.instance }})
      description: "The heap usage is over 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Available data-filesystem bytes as a percentage of total size, below 10.
  - alert: ElasticsearchDiskOutOfSpace
    expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch disk out of space (instance {{ $labels.instance }})'
      description: "The disk usage is over 90%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Cluster health series with color="red" equals 1.
  - alert: ElasticsearchClusterRed
    expr: 'elasticsearch_cluster_health_status{color="red"} == 1'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch Cluster Red (instance {{ $labels.instance }})'
      description: "Elastic Cluster Red status\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Cluster health series with color="yellow" equals 1.
  - alert: ElasticsearchClusterYellow
    expr: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch Cluster Yellow (instance {{ $labels.instance }})'
      description: "Elastic Cluster Yellow status\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Any relocating shard, reported immediately.
  - alert: ElasticsearchRelocatingShards
    expr: 'elasticsearch_cluster_health_relocating_shards > 0'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch relocating shards (instance {{ $labels.instance }})'
      description: "Elasticsearch is relocating shards\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Same condition as above, but only after it has held for 15m.
  - alert: ElasticsearchRelocatingShardsTooLong
    expr: 'elasticsearch_cluster_health_relocating_shards > 0'
    for: 15m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch relocating shards too long (instance {{ $labels.instance }})'
      description: "Elasticsearch has been relocating shards for 15min\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Any initializing shard, reported immediately.
  - alert: ElasticsearchInitializingShards
    expr: 'elasticsearch_cluster_health_initializing_shards > 0'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch initializing shards (instance {{ $labels.instance }})'
      description: "Elasticsearch is initializing shards\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Same condition as above, but only after it has held for 15m.
  - alert: ElasticsearchInitializingShardsTooLong
    expr: 'elasticsearch_cluster_health_initializing_shards > 0'
    for: 15m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch initializing shards too long (instance {{ $labels.instance }})'
      description: 'Elasticsearch has been initializing shards for 15 min  VALUE = {{ $value }} LABELS: {{ $labels }}'
  # Any unassigned shard, reported immediately.
  - alert: ElasticsearchUnassignedShards
    expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch unassigned shards (instance {{ $labels.instance }})'
      description: "Elasticsearch has unassigned shards\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  # Pending cluster tasks present for 15m.
  - alert: ElasticsearchPendingTasks
    expr: 'elasticsearch_cluster_health_number_of_pending_tasks > 0'
    for: 15m
    labels:
      level: 3
    annotations:
      summary: 'Elasticsearch pending tasks (instance {{ $labels.instance }})'
      description: "Elasticsearch has pending tasks. Cluster works slowly.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

Kubernetes

# Kubernetes alerting rules.
groups:
- name: Kubernetes
  rules:
  # The Ready/true condition series of a node has been 0 for 10m.
  - alert: KubernetesNodeReady
    expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
    for: 10m
    labels:
      level: 3
    annotations:
      summary: 'Kubernetes Node ready (instance {{ $labels.instance }})'
      description: 'Node {{ $labels.node }} has been unready for a long time'
  # A node reports OutOfDisk, MemoryPressure or DiskPressure with a status
  # other than "false" for 2m.
  # The annotations previously ended in a stray ")" with no opening "(";
  # the node name is now wrapped in a balanced pair of parentheses.
  - alert: 集群节点内存或磁盘资源短缺
    expr: kube_node_status_condition{condition=~"OutOfDisk|MemoryPressure|DiskPressure",status!="false"} == 1
    for: 2m
    labels:
      level: 3
    annotations:
      summary: 集群节点内存或磁盘资源短缺 ({{ $labels.node }})
      description: 集群节点内存或磁盘资源短缺 ({{ $labels.node }})
  # Pods scheduled on a node as a percentage of the node's allocatable pod
  # count, above 90 for 2m.
  # The alert name keeps its historical "Noode" spelling because routes and
  # silences may match on it; only the human-readable annotations are
  # corrected to "Node".
  # NOTE(review): kube_node_status_allocatable_pods may not be exported by
  # newer kube-state-metrics releases -- confirm the metric exists.
  - alert: KubernetesNoode容量不足
    expr: sum(kube_pod_info) by (node,group,cloud,project,env) / sum(kube_node_status_allocatable_pods) by (node,group,cloud,project,env) * 100 > 90
    for: 2m
    labels:
      level: 3
    annotations:
      summary: KubernetesNode容量不足
      description: KubernetesNode容量不足 {{ $labels.node }}
  # Ready replicas differ from current replicas for StatefulSets in group
  # "DMP" for 1m.
  # The two series are compared directly instead of testing `ratio != 1`:
  # for a StatefulSet scaled to zero the ratio is 0/0 = NaN, and NaN != 1 is
  # true, which raised a false alert.
  # The description identifies the object by namespace/statefulset; the
  # previously used `node` label is not present on StatefulSet metrics and
  # rendered as an empty string.
  - alert: 集群StatefulSet down
    expr: kube_statefulset_status_replicas_ready{group=~"DMP"} != kube_statefulset_status_replicas_current{group=~"DMP"}
    for: 1m
    labels:
      level: 3
    annotations:
      summary: 集群StatefulSet down
      description: 集群StatefulSet down {{ $labels.namespace }}/{{ $labels.statefulset }}
  # Subquery over the last hour: the per-pod sum of Pending/Unknown/Failed
  # phase series never dropped to 0 during that window.
  - alert: KubernetesPodNotHealthy
    expr: 'min_over_time(sum by (namespace, pod,group,env,cloud,project) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0'
    for: 0m
    labels:
      level: 3
    annotations:
      summary: 'Kubernetes Pod not healthy'
      description: 'Kubernetes Pod not healthy (instance {{ $labels.pod }})'
  # Container restart counter grew by more than 3 within one minute, held
  # for 2m.
  - alert: KubernetesPodCrashLooping
    expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
    for: 2m
    labels:
      level: 3
    annotations:
      summary: 'Kubernetes pod crash looping (pod {{ $labels.pod }})'
      description: 'Pod {{ $labels.pod }} is crash looping'
  # Desired and ready replica counts of a ReplicaSet differ for 10m.
  # The annotations now name the ReplicaSet: the description previously said
  # "Deployment" (copied from the Deployment rule) and the summary read
  # "ReplicasSet". The alert name is unchanged.
  - alert: KubernetesReplicassetMismatch
    expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
    for: 10m
    labels:
      level: 3
    annotations:
      summary: Kubernetes ReplicaSet mismatch
      description: ReplicaSet replicas mismatch {{ $labels }}
  # Desired and available replica counts of a Deployment differ for 10m.
  # The description template was missing one opening brace ("{ $labels }}"),
  # so the placeholder was emitted literally instead of being expanded.
  - alert: KubernetesDeploymentReplicasMismatch
    expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
    for: 10m
    labels:
      level: 3
    annotations:
      summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
      description: Deployment Replicas mismatch {{ $labels }}
  # Ready replicas differ from the StatefulSet's replica count for 10m.
  - alert: KubernetesStatefulsetReplicasMismatch
    expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
    for: 10m
    labels:
      level: 3
    annotations:
      summary: 'Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})'
      description: 'A StatefulSet does not match the expected number of replicas VALUE = {{ $value }}'
  # Observed generation differs from the metadata generation for 10m.
  - alert: KubernetesDeploymentGenerationMismatch
    expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
    for: 10m
    labels:
      level: 3
    annotations:
      summary: 'Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})'
      description: 'A Deployment has failed but has not been rolled back VALUE = {{ $value }} {{ $labels }}'
  # Share of 4xx/5xx responses among REST client requests, per
  # (instance, job), above 1 percent for 2m.
  # The description is double-quoted so that \n is parsed as a newline; in an
  # unquoted scalar YAML keeps the backslash and the "n" as literal text.
  - alert: KubernetesApiClientErrors
    expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
    for: 2m
    labels:
      level: 3
    annotations:
      summary: Kubernetes API client errors (instance {{ $labels.instance }})
      description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}"