---
# Host (主机) alerting rules
groups:
  # Host-level alerts built on the `up` series and node_exporter-style
  # `node_*` metrics. Every rule attaches the label `level`; the value is
  # quoted so YAML keeps it a string instead of typing it as an integer.
  - name: Instance
    rules:
      # A target labelled type="machine" has reported up == 0 for 1 minute.
      - alert: instance连接超时
        expr: up{type=~"machine"} == 0
        for: 1m
        labels:
          level: "3"
        annotations:
          summary: "Instance连接超时"
          description: "{{ $labels.instance }} exporter连接超时一分钟"

      # Non-idle CPU percentage above 75, restricted to group=~"VAS".
      # NOTE(review): there is no aggregation, so this is evaluated once per
      # idle-mode series (i.e. per `cpu` label) - confirm that per-core
      # alerting is intended rather than a per-instance average.
      - alert: CPU告警
        expr: round(100 - ((irate(node_cpu_seconds_total{group=~"VAS",mode="idle"}[5m])) * 100)) > 75
        for: 1m
        labels:
          level: "3"
        annotations:
          summary: "CPU使用率告警"
          description: "CPU当前使用率:{{ $value }}%"

      # Context-switch rate divided by the number of idle-mode CPU series
      # (one per core), i.e. switches per second per core, above 5000.
      - alert: CPU告警
        expr: round((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}))) > 5000
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "CPU上下文切换告警"
          description: "CPU上下文切换超过5000/s:当前{{ $value }}/s"

      # Memory in use (1 - MemAvailable / MemTotal) above 85%, group "DMP".
      - alert: 内存告警
        expr: round((1 - (node_memory_MemAvailable_bytes{group=~"DMP"} / (node_memory_MemTotal_bytes{group="DMP"})))* 100) > 85
        for: 30s
        labels:
          level: "3"
        annotations:
          summary: "内存使用率告警"
          description: "内存当前占用:{{ $value }}%"

      # Major page-fault rate above 1000 per second.
      - alert: 内存告警
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: "内存页面错误告警"
          description: "一级页面的平均错误数 {{ $value }}"

      # Summed receive throughput above 100 (bytes/s divided by 1024 twice).
      - alert: 网络告警
        expr: sum by (instance,group,project,cloud)(rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "网络接口流入流量异常"
          description: "网络接口流入流量异常 流量 {{ $value }}"

      # Summed transmit throughput above 100 (bytes/s divided by 1024 twice).
      - alert: 网络告警
        expr: sum by (instance,group,project,cloud)(rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "网络接口流出流量异常"
          description: "网络接口流出流量异常 流量 {{ $value }}"

      # Any increase of the receive-error counter within 5 minutes.
      - alert: 网络告警
        expr: increase(node_network_receive_errs_total[5m]) > 0
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "网络接收错误"
          description: "网络接收错误 {{ $value }}"

      # Summed disk read throughput above 50 (bytes/s divided by 1024 twice).
      - alert: 磁盘告警
        expr: sum by (instance,group,project,cloud) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "磁盘读取异常"
          description: "磁盘每秒读取数据 > 50MB/S 当前{{ $value }}/s"

      # Summed disk write throughput above 50 (bytes/s divided by 1024 twice).
      - alert: 磁盘告警
        expr: sum by (instance,group,project,cloud) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "磁盘写入异常"
          description: "磁盘每秒写入数据 > 50MB/S 当前{{ $value }}/s"

      # Linear extrapolation of the last hour of free bytes reaches zero
      # within 4 hours (4 * 3600 s); tmpfs file systems are excluded.
      - alert: 磁盘告警
        expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: "磁盘空间告警"
          description: "磁盘空间预计将在4小时内写满"

      # Inode usage above 85% on ext3/ext4/xfs file systems.
      - alert: 磁盘告警
        expr: (1 - (node_filesystem_files_free{fstype=~"ext3|ext4|xfs"} / node_filesystem_files{fstype=~"ext3|ext4|xfs"})) * 100 > 85
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: "磁盘inodes告警"
          description: "磁盘inodes空间不足,当前使用率{{ $value }}%"

      # Average time per completed read above 0.1 s; the `and` clause drops
      # devices with no completed reads in the window.
      # The expression yields seconds, so the description formats the value
      # with humanizeDuration instead of appending a fixed "ms" suffix.
      - alert: 磁盘告警
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "磁盘延迟告警"
          description: "磁盘读取延迟超过100ms 当前{{ $value | humanizeDuration }}"

      # Average time per completed write above 0.1 s; the `and` clause drops
      # devices with no completed writes in the window.
      # The counter is `node_disk_writes_completed_total` (plural "writes",
      # mirroring `reads_completed` above); the singular spelling matches no
      # series, which left this rule unable to fire.
      - alert: 磁盘告警
        expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: "磁盘延迟告警"
          description: "磁盘写入延迟超过100ms 当前{{ $value | humanizeDuration }}"
---
# Redis alerting rules
groups:
  # Redis alerts built on `redis_*` exporter metrics. Every rule attaches
  # the label `level`; the value is quoted so YAML keeps it a string.
  - name: redis告警
    rules:
      # The exporter reports redis_up == 0; fires immediately.
      - alert: Redis is Down
        expr: redis_up == 0
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Redis is down
          description: Redis instance is down

      # Fires for each (cloud, group, project, env) combination that has
      # Redis instances but none with role="master".
      # The earlier form, `count by (...) (...) or vector(0)`, always added
      # the label-less 0 sample (no grouped sample has an empty label set),
      # so `< 1` was permanently true. Grouping by `instance` is dropped as
      # well, because a per-instance count can never see a missing master.
      # NOTE(review): assumes these four labels identify one Redis
      # deployment - confirm against the scrape configuration.
      - alert: redis缺少主节点
        expr: >-
          count by (cloud, group, project, env) (redis_instance_info{type="redis"})
          unless
          count by (cloud, group, project, env) (redis_instance_info{role="master",type="redis"})
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: redis缺少主节点
          description: Redis cluster has no node marked as master

      # More than one role="master" series within a single label group.
      # NOTE(review): the grouping includes `instance`, so this only fires
      # when one instance exposes several master series - confirm whether
      # the count should be taken across instances instead.
      - alert: RedisTooManyMasters
        expr: count by(cloud,instance,group,project,env)(redis_instance_info{role="master",type="redis"}) > 1
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Redis too many masters
          description: Redis cluster has too many nodes marked as master

      # The connected-replica gauge dropped within the last minute.
      # `for` is 0m because the delta is only negative while the drop is
      # still inside the 1m window, so a longer hold time could not be met.
      - alert: Redis丢失Slave
        expr: delta(redis_connected_slaves[1m]) < 0
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Redis丢失Slave
          description: Redis instance lost a slave

      # The connected-replica gauge changed more than twice in 5 minutes.
      - alert: RedisClusterFlapping
        expr: changes(redis_connected_slaves[5m]) > 2
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: Redis cluster flapping
          description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)

      # The rejected-connections counter keeps increasing for 5 minutes.
      - alert: Redis拒绝连接
        expr: increase(redis_rejected_connections_total[1m]) > 0
        for: 5m
        labels:
          level: "3"
        annotations:
          summary: Redis拒绝连接
          description: Some connections to Redis has been rejected
---
# Elasticsearch alerting rules
groups:
  # Elasticsearch alerts built on `elasticsearch_*` exporter metrics.
  # Every rule attaches the label `level`; the value is quoted so YAML
  # keeps it a string instead of typing it as an integer.
  - name: elasticsearch告警
    rules:
      # JVM heap used / max above 90% for 2 minutes.
      - alert: ElasticsearchHeapUsageTooHigh
        expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})
          description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # JVM heap used / max above 80% for 2 minutes.
      # The label key is `level`, like every other rule here; it was
      # previously `severity`, so this alert carried no `level` label.
      - alert: ElasticsearchHeapUsageWarning
        expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch Heap Usage warning (instance {{ $labels.instance }})
          description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Available data-path space below 10% of its size; $value is the
      # percentage still available.
      - alert: ElasticsearchDiskOutOfSpace
        expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch disk out of space (instance {{ $labels.instance }})
          description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Cluster health series with color="red" equals 1.
      - alert: ElasticsearchClusterRed
        expr: elasticsearch_cluster_health_status{color="red"} == 1
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch Cluster Red (instance {{ $labels.instance }})
          description: "Elastic Cluster Red status\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Cluster health series with color="yellow" equals 1.
      - alert: ElasticsearchClusterYellow
        expr: elasticsearch_cluster_health_status{color="yellow"} == 1
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
          description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # At least one shard is relocating; fires immediately.
      - alert: ElasticsearchRelocatingShards
        expr: elasticsearch_cluster_health_relocating_shards > 0
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch relocating shards (instance {{ $labels.instance }})
          description: "Elasticsearch is relocating shards\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Shards have been relocating continuously for 15 minutes.
      - alert: ElasticsearchRelocatingShardsTooLong
        expr: elasticsearch_cluster_health_relocating_shards > 0
        for: 15m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch relocating shards too long (instance {{ $labels.instance }})
          description: "Elasticsearch has been relocating shards for 15min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # At least one shard is initializing; fires immediately.
      - alert: ElasticsearchInitializingShards
        expr: elasticsearch_cluster_health_initializing_shards > 0
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch initializing shards (instance {{ $labels.instance }})
          description: "Elasticsearch is initializing shards\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Shards have been initializing continuously for 15 minutes.
      - alert: ElasticsearchInitializingShardsTooLong
        expr: elasticsearch_cluster_health_initializing_shards > 0
        for: 15m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch initializing shards too long (instance {{ $labels.instance }})
          description: "Elasticsearch has been initializing shards for 15 min VALUE = {{ $value }} LABELS: {{ $labels }}"

      # At least one shard is unassigned; fires immediately.
      - alert: ElasticsearchUnassignedShards
        expr: elasticsearch_cluster_health_unassigned_shards > 0
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch unassigned shards (instance {{ $labels.instance }})
          description: "Elasticsearch has unassigned shards\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # The pending-task count has stayed above zero for 15 minutes.
      - alert: ElasticsearchPendingTasks
        expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
        for: 15m
        labels:
          level: "3"
        annotations:
          summary: Elasticsearch pending tasks (instance {{ $labels.instance }})
          description: "Elasticsearch has pending tasks. Cluster works slowly.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
---
# Kubernetes alerting rules
groups:
  # Kubernetes alerts built on `kube_*` metrics and the API client request
  # counter. Every rule attaches the label `level`; the value is quoted so
  # YAML keeps it a string. Alert names are kept exactly as they were
  # (including spelling) because they are external identifiers.
  - name: Kubernetes
    rules:
      # The Ready condition's status="true" series has been 0 for 10 minutes.
      - alert: KubernetesNodeReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 10m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes Node ready (instance {{ $labels.instance }})
          description: Node {{ $labels.node }} has been unready for a long time

      # An OutOfDisk / MemoryPressure / DiskPressure condition is set with
      # any status other than "false". The node name is wrapped in a
      # balanced pair of parentheses (the opening one was missing).
      - alert: 集群节点内存或磁盘资源短缺
        expr: kube_node_status_condition{condition=~"OutOfDisk|MemoryPressure|DiskPressure",status!="false"} == 1
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: 集群节点内存或磁盘资源短缺({{ $labels.node }})
          description: 集群节点内存或磁盘资源短缺({{ $labels.node }})

      # Pods scheduled on a node exceed 90% of its allocatable pod count.
      # NOTE(review): `kube_node_status_allocatable_pods` may not exist in
      # newer kube-state-metrics releases - confirm the deployed version.
      - alert: KubernetesNoode容量不足
        expr: sum(kube_pod_info) by (node,group,cloud,project,env) / sum(kube_node_status_allocatable_pods) by (node,group,cloud,project,env) * 100 > 90
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: KubernetesNoode容量不足
          description: KubernetesNoode容量不足 {{ $labels.node }}

      # Ready replicas differ from current replicas for group "DMP".
      # The description names the StatefulSet through its `namespace` and
      # `statefulset` labels instead of `node`, since the expression queries
      # only StatefulSet metrics.
      # NOTE(review): confirm these label names against the
      # kube-state-metrics StatefulSet metric documentation.
      - alert: 集群StatefulSet down
        expr: (kube_statefulset_status_replicas_ready{group=~"DMP"} / kube_statefulset_status_replicas_current{group=~"DMP"}) != 1
        for: 1m
        labels:
          level: "3"
        annotations:
          summary: 集群StatefulSet down
          description: 集群StatefulSet down {{ $labels.namespace }}/{{ $labels.statefulset }}

      # A pod has been in Pending/Unknown/Failed for the whole last hour:
      # the minimum over the 1h subquery stays above 0.
      - alert: KubernetesPodNotHealthy
        expr: min_over_time(sum by (namespace, pod,group,env,cloud,project) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
        for: 0m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes Pod not healthy
          description: Kubernetes Pod not healthy (instance {{ $labels.pod }})

      # More than 3 container restarts within one minute, held for 2 minutes.
      - alert: KubernetesPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes pod crash looping (pod {{ $labels.pod }})
          description: Pod {{ $labels.pod }} is crash looping

      # ReplicaSet desired replicas differ from ready replicas for 10
      # minutes. The description now says ReplicaSet, matching the metrics
      # the rule evaluates (it previously said Deployment).
      - alert: KubernetesReplicassetMismatch
        expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
        for: 10m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes ReplicasSet mismatch
          description: ReplicaSet replicas mismatch {{ $labels }}

      # Deployment desired replicas differ from available replicas for 10
      # minutes. The template delimiter is `{{ ... }}`; with a single
      # opening brace the label set was emitted as literal text.
      - alert: KubernetesDeploymentReplicasMismatch
        expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        for: 10m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
          description: Deployment Replicas mismatch {{ $labels }}

      # StatefulSet ready replicas differ from total replicas for 10 minutes.
      - alert: KubernetesStatefulsetReplicasMismatch
        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
        for: 10m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
          description: A StatefulSet does not match the expected number of replicas VALUE = {{ $value }}

      # Observed generation differs from metadata generation for 10 minutes.
      - alert: KubernetesDeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 10m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
          description: A Deployment has failed but has not been rolled back VALUE = {{ $value }} {{ $labels }}

      # Share of 4xx/5xx API client requests above 1% per (instance, job).
      # The description is double-quoted so that `\n` is a real newline; in
      # a plain scalar it is a literal backslash followed by "n".
      - alert: KubernetesApiClientErrors
        expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
        for: 2m
        labels:
          level: "3"
        annotations:
          summary: Kubernetes API client errors (instance {{ $labels.instance }})
          description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}"