https://github.com/prometheus-operator

https://prometheus.fuckcloudnative.io
https://yunlzheng.gitbook.io/prometheus-book

helm
https://github.com/prometheus-community/helm-charts
kube-prometheus
https://github.com/prometheus-operator/kube-prometheus

告警规则
https://github.com/junotx/mixin

CRD组件:
prometheus-operator
prometheus-operated
alertmanager-operated
thanos-ruler-operated
notification-manager-operator
image.png

kubesphere prometheus operator - 图2

kubesphere prometheus operator - 图3

  1. [root@UR-20210425NAMA ~]# kubectl -n kubesphere-monitoring-system get all
  2. NAME READY STATUS RESTARTS AGE
  3. pod/alertmanager-main-0 2/2 Running 0 16d
  4. pod/kube-state-metrics-7f65879cfd-txvh8 3/3 Running 0 4d14h
  5. pod/node-exporter-c44m7 2/2 Running 0 19m
  6. pod/node-exporter-l9sws 2/2 Running 0 20m
  7. pod/node-exporter-lk7b9 2/2 Running 0 20m
  8. pod/node-exporter-wshw5 2/2 Running 0 19m
  9. pod/node-exporter-zk466 2/2 Running 0 20m
  10. pod/node-exporter-znd6l 2/2 Running 0 20m
  11. pod/notification-manager-deployment-674dddcbd9-cwwx6 1/1 Running 1 16d
  12. pod/notification-manager-deployment-674dddcbd9-z8f2j 1/1 Running 0 4d14h
  13. pod/notification-manager-operator-7877c6574f-ns68t 2/2 Running 3 4d14h
  14. pod/prometheus-k8s-0 3/3 Running 1 16d
  15. pod/prometheus-operator-7d7684fc68-chjl2 2/2 Running 1 16d
  16. pod/thanos-ruler-kubesphere-0 2/2 Running 0 4d13h
  17. pod/thanos-ruler-kubesphere-1 2/2 Running 0 34d
  18. NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
  19. service/alertmanager-main ClusterIP 172.26.94.130 <none> 9093/TCP 125d
  20. service/alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 125d
  21. service/kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 125d
  22. service/node-exporter ClusterIP None <none> 9100/TCP 125d
  23. service/notification-manager-controller-metrics ClusterIP 172.26.201.224 <none> 8443/TCP
  24. 96d
  25. service/notification-manager-svc ClusterIP 172.26.224.139 <none> 19093/TCP
  26. 96d
  27. service/prometheus-k8s NodePort 172.26.128.137 <none> 9090:30890/TCP
  28. 125d
  29. service/prometheus-operated ClusterIP None <none> 9090/TCP
  30. 125d
  31. service/prometheus-operator ClusterIP None <none> 8443/TCP
  32. 125d
  33. service/thanos-ruler-operated ClusterIP None <none> 10902/TCP,10901/TCP 96d
  34. NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
  35. daemonset.apps/node-exporter 6 6 6 6 6 kubernetes.io/os=linux 125d
  36. NAME READY UP-TO-DATE AVAILABLE AGE
  37. deployment.apps/kube-state-metrics 1/1 1 1 125d
  38. deployment.apps/notification-manager-deployment 2/2 2 2 96d
  39. deployment.apps/notification-manager-operator 1/1 1 1 96d
  40. deployment.apps/prometheus-operator 1/1 1 1 125d
  41. NAME DESIRED CURRENT READY AGE
  42. replicaset.apps/kube-state-metrics-7f65879cfd 1 1 1 96d
  43. replicaset.apps/kube-state-metrics-95c974544 0 0 0 125d
  44. replicaset.apps/notification-manager-deployment-674dddcbd9 2 2 2 96d
  45. replicaset.apps/notification-manager-operator-7877c6574f 1 1 1 96d
  46. replicaset.apps/prometheus-operator-7d7684fc68 1 1 1 96d
  47. replicaset.apps/prometheus-operator-84d58bf775 0 0 0 125d
  48. NAME READY AGE
  49. statefulset.apps/alertmanager-main 1/1 125d
  50. statefulset.apps/prometheus-k8s 1/1 125d
  51. statefulset.apps/thanos-ruler-kubesphere 2/2 96d
  52. NAME COMPLETIONS DURATION AGE
  53. job.batch/prometheus-pvc-cleanup-1624634100 0/1 89d 89d
  54. NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
  55. cronjob.batch/prometheus-pvc-cleanup 15 23 25 6 * False 0 89d 125d
  56. [root@UR-20210425NAMA ~]#
  57. [root@UR-20210425NAMA ~]# kubectl -n kubesphere-monitoring-system get all
  58. NAME READY STATUS RESTARTS AGE
  59. pod/alertmanager-main-0 2/2 Running 0 16d
  60. pod/kube-state-metrics-7f65879cfd-txvh8 3/3 Running 0 4d14h
  61. pod/node-exporter-c44m7 2/2 Running 0 24m
  62. pod/node-exporter-l9sws 2/2 Running 0 25m
  63. pod/node-exporter-lk7b9 2/2 Running 0 25m
  64. pod/node-exporter-wshw5 2/2 Running 0 24m
  65. pod/node-exporter-zk466 2/2 Running 0 25m
  66. pod/node-exporter-znd6l 2/2 Running 0 25m
  67. pod/notification-manager-deployment-674dddcbd9-cwwx6 1/1 Running 1 16d
  68. pod/notification-manager-deployment-674dddcbd9-z8f2j 1/1 Running 0 4d14h
  69. pod/notification-manager-operator-7877c6574f-ns68t 2/2 Running 3 4d14h
  70. pod/prometheus-k8s-0 3/3 Running 1 16d
  71. pod/prometheus-operator-7d7684fc68-chjl2 2/2 Running 1 16d
  72. pod/thanos-ruler-kubesphere-0 2/2 Running 0 4d14h
  73. pod/thanos-ruler-kubesphere-1 2/2 Running 0 34d
  74. NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
  75. service/alertmanager-main ClusterIP 172.26.94.130 <none> 9093/TCP 125d
  76. service/alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 125d
  77. service/kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 125d
  78. service/node-exporter ClusterIP None <none> 9100/TCP 125d
  79. service/notification-manager-controller-metrics ClusterIP 172.26.201.224 <none> 8443/TCP
  80. 96d
  81. service/notification-manager-svc ClusterIP 172.26.224.139 <none> 19093/TCP
  82. 96d
  83. service/prometheus-k8s NodePort 172.26.128.137 <none> 9090:30890/TCP
  84. 125d
  85. service/prometheus-operated ClusterIP None <none> 9090/TCP
  86. 125d
  87. service/prometheus-operator ClusterIP None <none> 8443/TCP
  88. 125d
  89. service/thanos-ruler-operated ClusterIP None <none> 10902/TCP,10901/TCP 96d
  90. NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
  91. daemonset.apps/node-exporter 6 6 6 6 6 kubernetes.io/os=linux 125d
  92. NAME READY UP-TO-DATE AVAILABLE AGE
  93. deployment.apps/kube-state-metrics 1/1 1 1 125d
  94. deployment.apps/notification-manager-deployment 2/2 2 2 96d
  95. deployment.apps/notification-manager-operator 1/1 1 1 96d
  96. deployment.apps/prometheus-operator 1/1 1 1 125d
  97. NAME DESIRED CURRENT READY AGE
  98. replicaset.apps/kube-state-metrics-7f65879cfd 1 1 1 96d
  99. replicaset.apps/kube-state-metrics-95c974544 0 0 0 125d
  100. replicaset.apps/notification-manager-deployment-674dddcbd9 2 2 2 96d
  101. replicaset.apps/notification-manager-operator-7877c6574f 1 1 1 96d
  102. replicaset.apps/prometheus-operator-7d7684fc68 1 1 1 96d
  103. replicaset.apps/prometheus-operator-84d58bf775 0 0 0 125d
  104. NAME READY AGE
  105. statefulset.apps/alertmanager-main 1/1 125d
  106. statefulset.apps/prometheus-k8s 1/1 125d
  107. statefulset.apps/thanos-ruler-kubesphere 2/2 96d
  108. NAME COMPLETIONS DURATION AGE
  109. job.batch/prometheus-pvc-cleanup-1624634100 0/1 89d 89d
  110. NAME SCHEDULE SUSPEND ACTIVE LAST SCHEDULE AGE
  111. cronjob.batch/prometheus-pvc-cleanup 15 23 25 6 * False 0 89d 125d
  112. [root@UR-20210425NAMA ~]#
  1. [root@UR-20210425NAMA ~]# kubectl -n kubesphere-monitoring-system get pod
  2. NAME READY STATUS RESTARTS AGE
  3. alertmanager-main-0 2/2 Running 0 16d
  4. kube-state-metrics-7f65879cfd-txvh8 3/3 Running 0 4d14h
  5. node-exporter-c44m7 2/2 Running 0 25m
  6. node-exporter-l9sws 2/2 Running 0 25m
  7. node-exporter-lk7b9 2/2 Running 0 25m
  8. node-exporter-wshw5 2/2 Running 0 25m
  9. node-exporter-zk466 2/2 Running 0 25m
  10. node-exporter-znd6l 2/2 Running 0 25m
  11. notification-manager-deployment-674dddcbd9-cwwx6 1/1 Running 1 16d
  12. notification-manager-deployment-674dddcbd9-z8f2j 1/1 Running 0 4d14h
  13. notification-manager-operator-7877c6574f-ns68t 2/2 Running 3 4d14h
  14. prometheus-k8s-0 3/3 Running 1 16d
  15. prometheus-operator-7d7684fc68-chjl2 2/2 Running 1 16d
  16. thanos-ruler-kubesphere-0 2/2 Running 0 4d14h
  17. thanos-ruler-kubesphere-1 2/2 Running 0 34d
  18. [root@UR-20210425NAMA ~]#
  1. [root@UR-20210425NAMA ~]# kubectl -n kubesphere-monitoring-system get statefulset
  2. NAME READY AGE
  3. alertmanager-main 1/1 125d
  4. prometheus-k8s 1/1 125d
  5. thanos-ruler-kubesphere 2/2 96d
  6. [root@UR-20210425NAMA ~]# kubectl -n kubesphere-monitoring-system get deployment
  7. NAME READY UP-TO-DATE AVAILABLE AGE
  8. kube-state-metrics 1/1 1 1 125d
  9. notification-manager-deployment 2/2 2 2 96d
  10. notification-manager-operator 1/1 1 1 96d
  11. prometheus-operator 1/1 1 1 125d

CRD 资源

  1. [root@UR-20210425NAMA ~]# kubectl get crd | grep coreos
  2. alertmanagers.monitoring.coreos.com 2021-05-20T10:26:22Z
  3. podmonitors.monitoring.coreos.com 2021-05-20T10:26:22Z
  4. probes.monitoring.coreos.com 2021-06-18T14:55:50Z
  5. prometheuses.monitoring.coreos.com 2021-05-20T10:26:23Z
  6. prometheusrules.monitoring.coreos.com 2021-05-20T10:26:23Z
  7. servicemonitors.monitoring.coreos.com 2021-05-20T10:26:24Z
  8. thanosrulers.monitoring.coreos.com 2021-05-20T10:26:24Z
  9. [root@UR-20210425NAMA ~]#

image.png

ServiceMonitor —/metrics —-service

  1. apiVersion: monitoring.coreos.com/v1
  2. kind: ServiceMonitor
  3. metadata:
  4. labels:
  5. k8s-app: kube-scheduler
  6. name: kube-scheduler
  7. namespace: monitoring
  8. spec:
  9. endpoints:
  10. - interval: 30s # 每30s获取一次信息
  11. port: http-metrics # 对应service的端口名
  12. jobLabel: k8s-app
  13. namespaceSelector: # 表示去匹配某一命名空间中的service,如果想从所有的namespace中匹配用any: true
  14. matchNames:
  15. - kube-system
  16. selector: # 匹配的 Service 的labels,如果使用matchLabels,则下面的所有标签都匹配时才会匹配该service,如果使用matchExpressions,则需要满足其中列出的所有表达式(单个 In 表达式可以列出多个候选值)的service才会被选择
  17. matchLabels:
  18. k8s-app: kube-scheduler
  1. [root@UR-20210425NAMA ~]# kubectl get servicemonitor -A
  2. NAMESPACE NAME AGE
  3. kubesphere-monitoring-system alertmanager 126d
  4. kubesphere-monitoring-system coredns 126d
  5. kubesphere-monitoring-system etcd 126d
  6. kubesphere-monitoring-system grafana 7h15m
  7. kubesphere-monitoring-system kube-apiserver 126d
  8. kubesphere-monitoring-system kube-controller-manager 126d
  9. kubesphere-monitoring-system kube-scheduler 126d
  10. kubesphere-monitoring-system kube-state-metrics 126d
  11. kubesphere-monitoring-system kubelet 126d
  12. kubesphere-monitoring-system node-exporter 126d
  13. kubesphere-monitoring-system prometheus 126d
  14. kubesphere-monitoring-system prometheus-operator 126d
  15. kubesphere-monitoring-system s2i-operator 126d
  16. ur-api-dev-test-project api-dev-test 8h
  17. ur-api-dev-test-project ur-api-gateway-test-project 8h
  18. ur-api-esb-test-project api-esb-test 29d
  19. ur-api-gateway-test-project api-gateway-private-test 27d
  20. ur-api-gateway-test-project api-gateway-public-test 8h
  21. [root@UR-20210425NAMA ~]#
  1. [root@UR-20210425NAMA ~]# kubectl -n kubesphere-monitoring-system get prometheusrules
  2. NAME AGE
  3. custom-alerting-rule-zpnmw 99d
  4. prometheus-k8s-etcd-rules 128d
  5. prometheus-k8s-rules 128d
  6. [root@UR-20210425NAMA ~]#

PromQL数据类型

即时向量 instant vector
区间向量 range vector
标量 scalar
字符串 string

Metric类型

Counter
Gauge
Histogram
Summary

PromQL聚合操作

  • sum (求和)
  • min (最小值)
  • max (最大值)
  • avg (平均值)
  • stddev (标准差)
  • stdvar (标准方差)
  • count (计数)
  • count_values (对value进行计数)
  • bottomk (后n条时序)
  • topk (前n条时序)
  • quantile (分布统计)

PromQL操作符

一、二元操作符

Prometheus的查询语言支持基本的逻辑运算和算术运算。对于两个瞬时向量, 匹配行为可以被改变。

1.1 算术二元运算符 在Prometheus支持下面的二元算术操作符:

  • + 加法
  • - 减法
  • * 乘法
  • / 除法
  • % 模
  • ^ 幂运算

二元运算操作符定义在scalar/scalar(标量/标量)、vector/scalar(向量/标量)、和vector/vector(向量/向量)之间。

在两个标量之间:评估另一个标量,这是运算符应用于两个标量操作数的结果。 在瞬时向量和标量之间:将运算符应用于向量中的每个数据样本的值。 如果时间序列即时向量乘以2,则结果是另一个向量,其中原始向量的每个样本值乘以2。 在两个瞬时向量之间:应用于左侧向量中的每个条目及其右侧向量中的匹配元素。 结果将传播到结果向量中。 右侧向量中(没有匹配条目)不是结果的一部分。 1.2 比较二元操作符 在Prometheus系统中,比较二元操作符有:

== 等于 != 不等于

> 大于 < 小于 >= 大于等于 <= 小于等于 比较二元操作符定义在scalar/scalar(标量/标量)、vector/scalar(向量/标量),和vector/vector(向量/向量)。默认情况下它们执行过滤。 可以通过在运算符之后提供bool来修改它们的行为,这将为值返回0或1而不是过滤。

在两个标量之间:必须提供bool修饰符,并且这些运算符会产生另一个标量,即0(假)或1(真),具体取决于比较结果。 在瞬时向量和标量之间:将这些运算符应用于向量中的每个数据样本的值,并且从结果向量中删除比较结果为假的向量元素。 如果提供了bool修饰符,则将被删除的向量元素的值为0,而将保留的向量元素的值为1。 在两个瞬时向量之间:这些运算符默认表现为过滤器,应用于匹配条目。 表达式不正确或在表达式的另一侧找不到匹配项的向量元素将从结果中删除,而其他元素将传播到具有其原始(左侧)度量标准名称的结果向量中 标签值。 如果提供了bool修饰符,则已经删除的向量元素的值为0,而保留的向量元素的值为1,左侧标签值为1。 如:

3 > 2

报错 “comparisons between scalars must use BOOL modifier”

3 > bool 2

返回 scalar 1

1 > bool 2

返回 scalar 0

1.3 逻辑/集合二元操作符 逻辑/集合二元操作符只能作用在即时向量, 包括:

and 交集 or 并集 unless 补集 vector1 and vector2: 得到一个由vector1元素组成的向量,其中vector2中的元素具有完全匹配的标签集,其他元素被删除。

vector1 or vector2:得到包含vector1的所有原始元素(标签集+值)的向量以及vector2中vector1中没有匹配标签集的所有元素。

vector1 unless vector2:得到一个由vector1元素组成的向量,其中vector2中没有元素,具有完全匹配的标签集。 两个向量中的所有匹配元素都被删除。

二、向量匹配 向量之间的操作尝试在左侧的每个条目的右侧向量中找到匹配元素。 匹配行为有两种基本类型:一对一和多对一/一对多。

一对一从操作的每一侧找到一对唯一条目。 在默认情况下,这是格式为vector1 <operator> vector2的操作。 如果两个条目具有完全相同的标签集和相应的值,则它们匹配。 ignoring关键字允许在匹配时忽略某些标签,而on关键字允许将所考虑的标签集减少到提供的列表: [vector expr] [bin-op] ignoring([label list]) [vector expr]

[vector expr] [bin-op] on([label list]) [vector expr] 例如样本数据:

method_code:http_errors:rate5m{method="get", code="500"} 24
method_code:http_errors:rate5m{method="get", code="404"} 30
method_code:http_errors:rate5m{method="put", code="501"} 3
method_code:http_errors:rate5m{method="post", code="500"} 6
method_code:http_errors:rate5m{method="post", code="404"} 21

method:http_requests:rate5m{method="get"} 600
method:http_requests:rate5m{method="delete"} 34
method:http_requests:rate5m{method="post"} 120

查询例子:

method_code:http_errors:rate5m{code="500"} / ignoring(code) method:http_requests:rate5m

这将返回一个结果向量,其中包含每个方法的状态代码为500的HTTP请求部分,在过去的5分钟内进行测量。 没有ignoring(code)就没有匹配,因为度量标准不共享同一组标签。 方法put和delete的条目没有匹配,并且不会显示在结果中:

{method="get"} 0.04 // 24 / 600
{method="post"} 0.05 // 6 / 120

2.2 多对一和一对多向量匹配 多对一和一对多匹配指的是“一”侧的每个向量元素可以与“多”侧的多个元素匹配的情况。 必须使用group_left或group_right修饰符明确请求,其中left/right确定哪个向量具有更高的基数。

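[vector expr] [bin-op] ignoring([label list]) group_left([label list]) [vector expr]

[vector expr] [bin-op] ignoring([label list]) group_right([label list]) [vector expr]

[vector expr] [bin-op] on([label list]) group_left([label list]) [vector expr]

[vector expr] [bin-op] on([label list]) group_right([label list]) [vector expr]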

method_code:http_errors:rate5m / ignoring(code) group_left method:http_requests:rate5m 在这种情况下,左向量每个method标签值包含多个条目。 因此,我们使用group_left表明这一点。 右侧的元素现在与多个元素匹配,左侧具有相同的method标签:

{method="get", code="500"} 0.04 // 24 / 600

{method="get", code="404"} 0.05 // 30 / 600

{method="post", code="500"} 0.05 // 6 / 120

{method="post", code="404"} 0.175 // 21 / 120

多对一和一对多匹配是高级用例,应该仔细考虑。 通常正确使用忽略ignoring()可提供所需的结果。

三、聚合操作符 Prometheus支持以下内置聚合运算符,这些运算符可用于聚合单个即时向量的元素,从而生成具有聚合值的较少元素的新向量:

sum (在维度上求和) max (在维度上求最大值) min (在维度上求最小值) avg (在维度上求平均值) stddev (求标准差) stdvar (求方差) count (统计向量元素的个数) count_values (统计相同数据值的元素数量) bottomk (样本值第k个最小值) topk (样本值第k个最大值) quantile (统计分位数) 这些运算符可以用于聚合所有标签维度,也可以通过包含without或by子句来保留不同的维度。

<aggr-op>([parameter,] <vector expression>) [without | by (<label list>)]

count_values输出每个唯一样本值的一个时间序列。每个序列都有一个额外的标签。该标签的名称由聚合参数给出,标签值是唯一的样本值。每个时间序列的值是样本值存在的次数。

topk和bottomk与其他聚合器的不同之处在于,输入样本的子集(包括原始标签)在结果向量中返回。 by和without仅用于对输入向量进行分组(分桶)。

例:如果度量标准http_requests_total具有按应用程序,实例和组标签扇出的时间序列,我们可以通过以下方式计算每个应用程序和组在所有实例上看到的HTTP请求总数:

sum(http_requests_total) without (instance) 等价于:

sum(http_requests_total) by (application, group)

要计算运行每个构建版本的二进制文件的数量,我们可以编写:

count_values(“version”, build_version) 要在所有实例中获取5个最大的HTTP请求计数,我们可以编写:

topk(5, http_requests_total) 四、二元运算符优先级 以下列表显示了Prometheus中二进制运算符的优先级,从最高到最低。

  1. ^
  2. *, /, %
  3. +, -
  4. ==, !=, <=, <, >=, >
  5. and, unless
  6. or

具有相同优先级的运算符是左关联的。 例如,2 * 3 % 2相当于(2 * 3) % 2。但是^是右关联的,因此2 ^ 3 ^ 2相当于2 ^ (3 ^ 2)。

PromQL函数 一些函数有默认的参数,例如:year(v=vector(time()) instant-vector)。意思是有一个参数v是一个瞬时向量,如果没有提供,它将默认为表达式vector(time())的值。可参考:Prometheus监控学习笔记之PromQL 内置函数 一、abs() abs(v instant-vector)返回输入向量,所有样本值都转换为其绝对值。

二、absent() absent(v instant-vector)如果传递给它的向量具有任何元素,则返回空向量;如果传递给它的向量没有元素,则返回为1的值。这对于在给定度量标准名称和标签组合不存在时间序列时发出警报非常有用。

absent(nonexistent{job=”myjob”})

=> {job=”myjob”}

absent(nonexistent{job=”myjob”, instance=~”.*”})

=> {job=”myjob”}

absent(sum(nonexistent{job=”myjob”}))

=> {}

在第二个例子中,absent()试图从输入向量中导出1元素输出向量的标签。

三、ceil() ceil(v instant-vector) 将v中所有元素的样本值向上取整到最接近的整数。如:

node_load5{instance=”192.168.1.75:9100”} # 结果为 2.79 ceil(node_load5{instance=”192.168.1.75:9100”}) # 结果为 3 四、changes() 输入一个区间向量, 返回这个区间向量内每个样本数据值变化的次数(瞬时向量)。如:

如果样本数据值没有发生变化,则返回结果为 1 changes(node_load5{instance=”192.168.1.75:9100”}[1m]) # 结果为 1 五、clamp_max() clamp_max(v instant-vector, max scalar) 函数,输入一个瞬时向量和最大值,样本数据值若大于 max,则改为 max,否则不变。如:

node_load5{instance=”192.168.1.75:9100”} # 结果为 2.79 clamp_max(node_load5{instance=”192.168.1.75:9100”}, 2) # 结果为 2 六、clamp_min() clamp_min(v instant-vector, min scalar) 函数,输入一个瞬时向量和最小值,样本数据值若小于 min,则改为 min,否则不变。如:

node_load5{instance=”192.168.1.75:9100”} # 结果为 2.79 clamp_min(node_load5{instance=”192.168.1.75:9100”}, 3) # 结果为 3 七、day_of_month() day_of_month(v=vector(time()) instant-vector)返回UTC中每个给定时间的月中的某天。 返回值为1到31。

八、day_of_week() day_of_week(v=vector(time()) instant-vector)返回UTC中每个给定时间的星期几。 返回值为0到6,其中0表示星期日等。

九、days_in_month() days_in_month(v=vector(time()) instant-vector)返回UTC中每个给定时间的月中天数。 返回值为28到31。

十、delta() delta(v range-vector) 的参数是一个区间向量,返回一个瞬时向量。它计算一个区间向量 v 的第一个元素和最后一个元素之间的差值。由于这个值被外推到指定的整个时间范围,所以即使样本值都是整数,你仍然可能会得到一个非整数值。 如以下示例表达式返回现在和2小时之前CPU温度的差异:

delta(cpu_temp_celsius{host=”zeus”}[2h]) 这个函数一般只用在 Gauge 类型的时间序列上。

十一、deriv() deriv(v range-vector) 的参数是一个区间向量,返回一个瞬时向量。它使用简单的线性回归计算区间向量 v 中各个时间序列的导数。这个函数一般只用在 Gauge 类型的时间序列上。

十二、exp() exp(v instant-vector) 函数,输入一个瞬时向量,返回各个样本值的 e 的指数值,即 e 的 N 次方。当 N 的值足够大时会返回 +Inf。特殊情况为:

Exp(+inf) = +Inf Exp(NaN) = NaN 十三、floor() floor(v instant-vector)函数与 ceil() 函数相反,将 v 中所有元素的样本值向下四舍五入到最接近的整数。

十四、histogram_quantile() histogram_quantile(φ float, b instant-vector) 计算b向量的φ-分位数 (0 ≤ φ ≤ 1) 。(有关φ-分位数的详细解释和直方图度量类型的使用,请参见直方图和摘要。)b中的样本是每个桶中的观察计数。 每个样本必须具有标签le,其中标签值表示桶的包含上限。 (没有这种标签的样本会被忽略。)直方图度量标准类型自动提供带有_bucket后缀和相应标签的时间序列。使用rate()函数指定分位数计算的时间窗口。

示例:直方图度量标准称为http_request_duration_seconds。 要计算过去10m内请求持续时间的第90个百分位数,请使用以下表达式:

histogram_quantile(0.9, rate(http_request_duration_seconds_bucket[10m])) 在http_request_duration_seconds中为每个标签组合计算分位数。 要聚合,请在rate()函数周围使用sum()聚合器。 由于histogram_quantile()需要le标签,因此必须将其包含在by子句中。 以下表达式按作业聚合第90个百分点: histogram_quantile(0.9, sum(rate(http_request_duration_seconds_bucket[10m])) by (job, le)) 要聚合所有内容,请仅指定le标签:

histogram_quantile(0.9, sum(rate(http_request_duration_seconds_bucket[10m])) by (le)) histogram_quantile()函数通过假设桶内的线性分布来插值分位数值。 最高桶必须具有+Inf的上限。 (否则,返回NaN。)如果分位数位于最高桶中,则返回第二个最高桶的上限。 如果该桶的上限大于0,则假设最低桶的下限为0.在这种情况下,在该桶内应用通常的线性插值。 否则,对于位于最低桶中的分位数,返回最低桶的上限。

如果b包含少于两个桶,则返回NaN。 对于φ<0,返回-Inf。 对于φ> 1,返回+Inf。

十五、holt_winters() holt_winters(v range-vector, sf scalar, tf scalar) 函数基于区间向量 v,生成时间序列数据平滑值。平滑因子 sf 越低, 对旧数据的重视程度越高。趋势因子 tf 越高,对数据的趋势的考虑就越多。其中,0< sf, tf <=1。仅适用于 Gauge 类型的时间序列。

十六、hour() hour(v=vector(time()) instant-vector)返回UTC中每个给定时间的一天中的小时。 返回值为0到23。

十七、idelta() idelta(v range-vector) 的参数是一个区间向量, 返回一个瞬时向量。它计算最新的 2 个样本值之间的差值。这个函数一般只用在 Gauge 类型的时间序列上。

十八、increase() increase(v range-vector) 函数获取区间向量中的第一个和最后一个样本并返回其增长量, 它会在单调性发生变化时(如由于采样目标重启引起的计数器复位)自动中断。由于这个值被外推到指定的整个时间范围,所以即使样本值都是整数,你仍然可能会得到一个非整数值。如以下表达式返回区间向量中每个时间序列过去 5 分钟内 HTTP 请求数的增长数:

increase(http_requests_total{job=”api-server”}[5m]) increase 的返回值类型只能是计数器类型,主要作用是增加图表和数据的可读性。使用 rate 函数记录规则的使用率,以便持续跟踪数据样本值的变化。

十九、irate irate(v range-vector) 函数用于计算区间向量的增长率,但是其反应出的是瞬时增长率。irate 函数是通过区间向量中最后两个样本数据来计算区间向量的增长速率,它会在单调性发生变化时(如由于采样目标重启引起的计数器复位)自动中断。这种方式可以避免在时间窗口范围内的“长尾问题”,并且体现出更好的灵敏度,通过irate函数绘制的图标能够更好的反应样本数据的瞬时变化状态。如,以下表达式返回区间向量中每个时间序列过去 5 分钟内最后两个样本数据的 HTTP 请求数的增长率:

irate(http_requests_total{job=”api-server”}[5m]) irate 只能用于绘制快速变化的计数器,在长期趋势分析或者告警中更推荐使用 rate 函数。因为使用 irate 函数时,速率的简短变化会重置 FOR 语句,形成的图形有很多波峰,难以阅读。

注意,当将irate()与聚合运算符(例如sum())或随时间聚合的函数(任何以_over_time结尾的函数)组合时,请始终首先采用irate(),然后进行聚合。 否则,当目标重新启动时,irate()无法检测计数器重置。

二十、label_join() 函数可以将时间序列 v 中多个标签 src_label 的值,通过 separator 作为连接符写入到一个新的标签 dst_label 中。可以有多个 src_label 标签。如,以下表达式返回的时间序列多了一个 foo 标签,标签值为 etcd,etcd-k8s:

up{endpoint=”api”,instance=”192.168.123.248:2379”,job=”etcd”,namespace=”monitoring”,service=”etcd-k8s”} => up{endpoint=”api”,instance=”192.168.123.248:2379”,job=”etcd”,namespace=”monitoring”,service=”etcd-k8s”} 1

label_join(up{endpoint=”api”,instance=”192.168.123.248:2379”,job=”etcd”,namespace=”monitoring”,service=”etcd-k8s”}, “foo”, “,”, “job”, “service”) => up{endpoint=”api”,foo=”etcd,etcd-k8s”,instance=”192.168.123.248:2379”,job=”etcd”,namespace=”monitoring”,service=”etcd-k8s”} 1 label_replace() 二十一、label_replace() 为了能够让客户端的图标更具有可读性,可以通过 label_replace 函数为时间序列添加额外的标签。label_replace 的具体参数如下:

label_replace(v instant-vector, dst_label string, replacement string, src_label string, regex string) 该函数会依次对 v 中的每一条时间序列进行处理,通过 regex 匹配 src_label 的值,并将匹配部分 replacement 写入到 dst_label 标签中。如下所示:

label_replace(up, "host", "$1", "instance", "(.*):.*") 函数处理后,时间序列将包含一个 host 标签,host 标签的值为 Exporter 实例的 IP 地址:

up{host=”localhost”,instance=”localhost:8080”,job=”cadvisor”} 1 up{host=”localhost”,instance=”localhost:9090”,job=”prometheus”} 1 up{host=”localhost”,instance=”localhost:9100”,job=”node”} 1 二十二、ln() 计算瞬时向量 v 中所有样本数据的自然对数。特殊情况:

ln(+Inf) = +Inf ln(0) = -Inf ln(x<0) = NaN ln(NaN) = NaN 二十三、log2() log2(v instant-vector)计算v中所有元素的二进制对数。特殊情况等同于ln中的特殊情况。

二十四、log10() log10(v instant-vector)计算v中所有元素的10进制对数。特殊情况等同于ln中的特殊情况。

二十五、minute() minute(v=vector(time()) instant-vector)以UTC为单位返回每个给定时间的分钟。 返回值为0到59。

二十六、month() month(v=vector(time()) instant-vector)返回UTC中每个给定时间的一年中的月份。 返回值为1到12,其中1表示1月等。

二十七、predict_linear() predict_linear(v range-vector, t scalar) 函数可以预测时间序列 v 在 t 秒后的值。它基于简单线性回归的方式,对时间窗口内的样本数据进行统计,从而可以对时间序列的变化趋势做出预测。该函数的返回结果不带有度量指标,只有标签列表。如,基于 2 小时的样本数据,来预测主机可用磁盘空间的是否在 4 个小时候被占满,可以使用如下表达式:

predict_linear(node_filesystem_free{job=”node”}[2h], 4 * 3600) < 0 通过下面的例子来观察返回值:

predict_linear(http_requests_total{code=”200”,instance=”120.77.65.193:9090”,job=”prometheus”,method=”get”}[5m], 5) 结果: {code=”200”,handler=”query_range”,instance=”120.77.65.193:9090”,job=”prometheus”,method=”get”} 1 {code=”200”,handler=”prometheus”,instance=”120.77.65.193:9090”,job=”prometheus”,method=”get”} 4283.449995397104 {code=”200”,handler=”static”,instance=”120.77.65.193:9090”,job=”prometheus”,method=”get”} 22.99999999999999 … 二十八、rate() rate(v range-vector) 函数可以直接计算区间向量 v 在时间窗口内平均增长速率,它会在单调性发生变化时(如由于采样目标重启引起的计数器复位)自动中断。该函数的返回结果不带有度量指标,只有标签列表。

例如,以下表达式返回区间向量中每个时间序列过去 5 分钟内 HTTP 请求数的每秒增长率:

rate(http_requests_total{job=”api-server”}[5m]) rate() 函数返回值类型只能用计数器,在长期趋势分析或者告警中推荐使用这个函数。

注意,当将 rate() 函数与聚合运算符(例如 sum())或随时间聚合的函数(任何以 _over_time 结尾的函数)一起使用时,必须先执行 rate 函数,然后再进行聚合操作,否则当采样目标重新启动时 rate() 无法检测到计数器是否被重置。

二十九、resets() resets(v range-vector) 的参数是一个区间向量。对于每个时间序列,它都返回一个计数器重置的次数。两个连续样本之间的值的减少被认为是一次计数器重置。

这个函数一般只用在计数器类型的时间序列上。

三十、round() round(v instant-vector, to_nearest=1 scalar) 函数与 ceil 和 floor 函数类似,返回向量中所有样本值的最接近的整数。to_nearest 参数是可选的,默认为 1,表示样本返回的是最接近 1 的整数倍的值。你也可以将该参数指定为任意值(也可以是小数),表示样本返回的是最接近它的整数倍的值。

三十一、scalar() scalar(v instant-vector) 函数的参数是一个单元素的瞬时向量,它返回其唯一的时间序列的值作为一个标量。如果度量指标的样本数量大于 1 或者等于 0, 则返回 NaN。

三十二、sort() sort(v instant-vector) 函数对向量按元素的值进行升序排序,返回结果:key: value = 度量指标:样本值[升序排列]。

三十三、sort_desc() sort_desc(v instant-vector) 函数对向量按元素的值进行降序排序,返回结果:key: value = 度量指标:样本值[降序排列]。

三十四、sqrt() sqrt(v instant-vector) 函数计算向量 v 中所有元素的平方根。

三十五、time() time() 函数返回从 1970-01-01 到现在的秒数。注意:它不是直接返回当前时间,而是时间戳。

三十六、timestamp() timestamp(v instant-vector) 函数返回向量 v 中每个样本的时间戳(从 1970-01-01 到现在的秒数)。

三十七、vector() vector(s scalar) 函数将标量 s 作为没有标签的向量返回,即返回结果为:key: value= {}, s。

三十八、year() year(v=vector(time()) instant-vector)以UTC格式返回每个给定时间的年份。

三十九、_over_time() year(v=vector(time()) instant-vector) 函数返回被给定 UTC 时间的当前年份。

_over_time() 下面的函数列表允许传入一个区间向量,它们会聚合每个时间序列的范围,并返回一个瞬时向量:

avg_over_time(range-vector) : 区间向量内每个度量指标的平均值。

min_over_time(range-vector) : 区间向量内每个度量指标的最小值。

max_over_time(range-vector) : 区间向量内每个度量指标的最大值。

sum_over_time(range-vector) : 区间向量内每个度量指标的求和。

count_over_time(range-vector) : 区间向量内每个度量指标的样本数据个数。

quantile_over_time(scalar, range-vector) : 区间向量内每个度量指标的样本数据值分位数,φ-quantile (0 ≤ φ ≤ 1)。

stddev_over_time(range-vector) : 区间向量内每个度量指标的总体标准差。

stdvar_over_time(range-vector) : 区间向量内每个度量指标的总体标准方差。

请注意,即使值在整个时间间隔内的间隔不均匀,指定时间间隔内的所有值在聚合中都具有相同的权重。注意:即使区间向量内的值分布不均匀,它们在聚合时的权重也是相同的。

PromQL例子 一、简单的时间序列选择 使用度量标准http_requests_total返回所有时间序列:

http_requests_total 使用度量标准http_requests_total以及给定的job和handler标签返回所有时间系列:

http_requests_total{job=”apiserver”, handler=”/api/comments”} 返回相同向量的整个时间范围(在本例中为5分钟),使其成为范围向量:

http_requests_total{job=”apiserver”, handler=”/api/comments”}[5m] 请注意,导致范围向量的表达式不能直接绘制,而是在表达式浏览器的表格(”Console”)视图中查看。

使用正则表达式,您只能为名称与特定模式匹配的作业选择时间序列,在本例中为所有以server结尾的作业。 请注意,Prometheus 中的正则匹配是完全锚定的(必须匹配整个字符串,而不是子字符串),因此模式需要以 .* 开头:

http_requests_total{job=~".*server"} 要选择除4xx之外的所有HTTP状态代码,您可以运行:

http_requests_total{status!~”^4..$”} 二、子查询 此查询返回过去30分钟的5分钟http_requests_total指标率,分辨率为1分钟:

rate(http_requests_total[5m])[30m:1m] 这是嵌套子查询的示例。 deriv函数的子查询使用默认分辨率。 请注意,不必要地使用子查询是不明智的。

max_over_time(deriv(rate(distance_covered_total[5s])[30s:5s])[10m:]) 三、使用函数,操作符等 使用http_requests_total指标名称返回所有时间序列的每秒速率,在过去5分钟内的增长率:

rate(http_requests_total[5m])

假设http_requests_total时间序列都有标签job(按作业名称扇出)和instance(按作业实例扇出),我们可能想要总结所有实例的速率,因此我们得到的输出时间序列更少,但仍然保留job维度:

sum(rate(http_requests_total[5m])) by (job)

如果我们有两个具有相同维度标签的不同指标,我们可以对它们应用二元运算符,并且两侧具有相同标签集的元素将匹配并传播到输出。 例如,此表达式为每个实例返回MiB中未使用的内存(在虚构的群集调度程序上公开它运行的实例的这些度量标准):

(instance_memory_limit_bytes - instance_memory_usage_bytes) / 1024 / 1024

相同的表达式,但由应用程序总结,可以这样写:

sum( instance_memory_limit_bytes - instance_memory_usage_bytes) by (app, proc) / 1024 / 1024 如果相同的虚构集群调度程序为每个实例公开了如下所示的CPU使用率指标:

instance_cpu_time_ns{app="lion", proc="web", rev="34d0f99", env="prod", job="cluster-manager"}
instance_cpu_time_ns{app="elephant", proc="worker", rev="34d0f99", env="prod", job="cluster-manager"}
instance_cpu_time_ns{app="turtle", proc="api", rev="4d3a513", env="prod", job="cluster-manager"}
…

我们可以按应用程序(app)和进程类型(proc)分组排名前3位的CPU用户:

topk(3, sum(rate(instance_cpu_time_ns[5m])) by(app, proc)) 假设此度量标准包含每个运行实例的一个时间系列,您可以计算每个应用程序运行实例的数量,如下所示:

count(instance_cpu_time_ns) by (app)

  1. <a name="Kx3LA"></a>
  2. ### PromQL内置函数
  3. [https://www.cnblogs.com/JetpropelledSnake/p/10446878.html#top](https://www.cnblogs.com/JetpropelledSnake/p/10446878.html#top)
  4. ```yaml
  5. 概述
  6. Prometheus 提供了其它大量的内置函数,可以对时序数据进行丰富的处理。某些函数有默认的参数,例如:year(v=vector(time()) instant-vector)。其中参数 v 是一个瞬时向量,如果不提供该参数,将使用默认值 vector(time())。instant-vector 表示参数类型。
  7. abs()
  8. abs(v instant-vector) 返回输入向量的所有样本的绝对值。
  9. absent()
  10. absent(v instant-vector),如果传递给它的向量参数具有样本数据,则返回空向量;如果传递的向量参数没有样本数据,则返回不带度量指标名称且带有标签的时间序列,且样本值为1。
  11. 当监控度量指标时,如果获取到的样本数据是空的, 使用 absent 方法对告警是非常有用的。例如:
  12. 复制代码
  13. # 这里提供的向量有样本数据
  14. absent(http_requests_total{method="get"}) => no data
  15. absent(sum(http_requests_total{method="get"})) => no data
  16. # 由于不存在度量指标 nonexistent,所以 返回不带度量指标名称且带有标签的时间序列,且样本值为1
  17. absent(nonexistent{job="myjob"}) => {job="myjob"} 1
  18. # 正则匹配的 instance 不作为返回 labels 中的一部分
  19. absent(nonexistent{job="myjob",instance=~".*"}) => {job="myjob"} 1
  20. # sum 函数返回的时间序列不带有标签,且没有样本数据
  21. absent(sum(nonexistent{job="myjob"})) => {} 1
  22. 复制代码
  23. ceil()
  24. ceil(v instant-vector) 将 v 中所有元素的样本值向上四舍五入到最接近的整数。例如:
  25. node_load5{instance="192.168.1.75:9100"} # 结果为 2.79
  26. ceil(node_load5{instance="192.168.1.75:9100"}) # 结果为 3
  27. changes()
  28. changes(v range-vector) 输入一个区间向量, 返回这个区间向量内每个样本数据值变化的次数(瞬时向量)。例如
  29. # 如果样本数据值没有发生变化,则返回结果为 1
  30. changes(node_load5{instance="192.168.1.75:9100"}[1m]) # 结果为 1
  31. clamp_max()
  32. clamp_max(v instant-vector, max scalar) 函数,输入一个瞬时向量和最大值,样本数据值若大于 max,则改为 max,否则不变。例如:
  33. node_load5{instance="192.168.1.75:9100"} # 结果为 2.79
  34. clamp_max(node_load5{instance="192.168.1.75:9100"}, 2) # 结果为 2
  35. clamp_min()
  36. clamp_min(v instant-vector, min scalar) 函数,输入一个瞬时向量和最小值,样本数据值若小于 min,则改为 min,否则不变。例如:
  37. node_load5{instance="192.168.1.75:9100"} # 结果为 2.79
  38. clamp_min(node_load5{instance="192.168.1.75:9100"}, 3) # 结果为 3
  39. day_of_month()
  40. day_of_month(v=vector(time()) instant-vector) 函数,返回被给定 UTC 时间所在月的第几天。返回值范围:1~31。
  41. day_of_week()
  42. day_of_week(v=vector(time()) instant-vector) 函数,返回被给定 UTC 时间所在周的第几天。返回值范围:0~6,0 表示星期天。
  43. days_in_month()
  44. days_in_month(v=vector(time()) instant-vector) 函数,返回当月一共有多少天。返回值范围:28~31。
  45. delta()
  46. delta(v range-vector) 的参数是一个区间向量,返回一个瞬时向量。它计算一个区间向量 v 的第一个元素和最后一个元素之间的差值。由于这个值被外推到指定的整个时间范围,所以即使样本值都是整数,你仍然可能会得到一个非整数值。
  47. 例如,下面的例子返回过去两小时的 CPU 温度差:
  48. delta(cpu_temp_celsius{host="zeus"}[2h])
  49. 这个函数一般只用在 Gauge 类型的时间序列上。
  50. deriv()
  51. deriv(v range-vector) 的参数是一个区间向量,返回一个瞬时向量。它使用简单的线性回归计算区间向量 v 中各个时间序列的导数。
  52. 这个函数一般只用在 Gauge 类型的时间序列上。
  53. exp()
  54. exp(v instant-vector) 函数,输入一个瞬时向量,返回各个样本值的 e 的指数值,即 e 的 N 次方。当 N 的值足够大时会返回 +Inf。特殊情况为:
  55. Exp(+Inf) = +Inf
  56. Exp(NaN) = NaN
  57. floor()
  58. floor(v instant-vector) 函数与 ceil() 函数相反,将 v 中所有元素的样本值向下四舍五入到最接近的整数。
  59. histogram_quantile()
  60. histogram_quantile(φ float, b instant-vector) 从 bucket 类型的向量 b 中计算 φ (0 ≤ φ ≤ 1) 分位数(百分位数的一般形式)的样本的最大值。(有关 φ 分位数的详细说明以及直方图指标类型的使用,请参阅直方图和摘要)。向量 b 中的样本是每个 bucket 的采样点数量。每个样本的 labels 中必须要有 le 这个 label 来表示每个 bucket 的上边界,没有 le 标签的样本会被忽略。直方图指标类型自动提供带有 _bucket 后缀和相应标签的时间序列。
  61. 可以使用 rate() 函数来指定分位数计算的时间窗口。
  62. 例如,一个直方图指标名称为 employee_age_bucket_bucket,要计算过去 10 分钟内 第 90 个百分位数,请使用以下表达式:
  63. histogram_quantile(0.9, rate(employee_age_bucket_bucket[10m]))
  64. 返回:
  65. {instance="10.0.86.71:8080",job="prometheus"} 35.714285714285715
  66. 这表示最近 10 分钟之内 90% 的样本的最大值为 35.714285714285715。
  67. 这个计算结果是每组标签组合成一个时间序列。我们可能不会对所有这些维度(如 job、instance 和 method)感兴趣,并希望将其中的一些维度进行聚合,则可以使用 sum() 函数。例如,以下表达式根据 job 标签来对第 90 个百分位数进行聚合:
  68. # histogram_quantile() 函数必须包含 le 标签
  69. histogram_quantile(0.9, sum(rate(employee_age_bucket_bucket[10m])) by (job, le))
  70. 如果要聚合所有的标签,则使用如下表达式:
  71. histogram_quantile(0.9,sum(rate(employee_age_bucket_bucket[10m])) by (le))
  72. [info] 注意
  73. histogram_quantile 这个函数是根据假定每个区间内的样本分布是线性分布来计算结果值的(也就是说它的结果未必准确),最高的 bucket 必须是 le="+Inf" (否则就返回 NaN)。
  74. 如果分位数位于最高的 bucket(+Inf) 中,则返回第二个最高的 bucket 的上边界。如果该 bucket 的上边界大于 0,则假设最低的 bucket 的的下边界为 0,这种情况下在该 bucket 内使用常规的线性插值。
  75. 如果分位数位于最低的 bucket 中,则返回最低 bucket 的上边界。
  76. 如果 b 含有少于 2 个 buckets,那么会返回 NaN,如果 φ < 0 会返回 -Inf,如果 φ > 1 会返回 +Inf。
  77. holt_winters()
  78. holt_winters(v range-vector, sf scalar, tf scalar) 函数基于区间向量 v,生成时间序列数据平滑值。平滑因子 sf 越低, 对旧数据的重视程度越高。趋势因子 tf 越高,对数据的趋势的考虑就越多。其中,0< sf, tf <=1。
  79. holt_winters 仅适用于 Gauge 类型的时间序列。
  80. hour()
  81. hour(v=vector(time()) instant-vector) 函数返回被给定 UTC 时间的当前第几个小时,时间范围:0~23。
  82. idelta()
  83. idelta(v range-vector) 的参数是一个区间向量, 返回一个瞬时向量。它计算最新的 2 个样本值之间的差值。
  84. 这个函数一般只用在 Gauge 类型的时间序列上。
  85. increase()
  86. increase(v range-vector) 函数获取区间向量中的第一个和最后一个样本并返回其增长量, 它会在单调性发生变化时(如由于采样目标重启引起的计数器复位)自动中断。由于这个值被外推到指定的整个时间范围,所以即使样本值都是整数,你仍然可能会得到一个非整数值。
  87. 例如,以下表达式返回区间向量中每个时间序列过去 5 分钟内 HTTP 请求数的增长数:
  88. increase(http_requests_total{job="apiserver"}[5m])
  89. increase 的返回值类型只能是计数器类型,主要作用是增加图表和数据的可读性。使用 rate 函数记录规则的使用率,以便持续跟踪数据样本值的变化。
  90. irate()
  91. irate(v range-vector) 函数用于计算区间向量的增长率,但是其反映出的是瞬时增长率。irate 函数是通过区间向量中最后两个样本数据来计算区间向量的增长速率,它会在单调性发生变化时(如由于采样目标重启引起的计数器复位)自动中断。这种方式可以避免在时间窗口范围内的“长尾问题”,并且体现出更好的灵敏度,通过irate函数绘制的图表能够更好的反映样本数据的瞬时变化状态。
  92. 例如,以下表达式返回区间向量中每个时间序列过去 5 分钟内最后两个样本数据的 HTTP 请求数的增长率:
  93. irate(http_requests_total{job="api-server"}[5m])
  94. irate 只能用于绘制快速变化的计数器,在长期趋势分析或者告警中更推荐使用 rate 函数。因为使用 irate 函数时,速率的简短变化会重置 FOR 语句,形成的图形有很多波峰,难以阅读。
  95. [info] 注意
  96. 当将 irate() 函数与聚合运算符(例如 sum())或随时间聚合的函数(任何以 _over_time 结尾的函数)一起使用时,必须先执行 irate 函数,然后再进行聚合操作,否则当采样目标重新启动时 irate() 无法检测到计数器是否被重置。
  97. label_join()
  98. label_join(v instant-vector, dst_label string, separator string, src_label_1 string, src_label_2 string, ...)
  99. 函数可以将时间序列 v 中多个标签 src_label 的值,通过 separator 作为连接符写入到一个新的标签 dst_label 中。可以有多个 src_label 标签。
  100. 例如,以下表达式返回的时间序列多了一个 foo 标签,标签值为 etcd,etcd-k8s:
  101. 复制代码
  102. up{endpoint="api",instance="192.168.123.248:2379",job="etcd",namespace="monitoring",service="etcd-k8s"}
  103. => up{endpoint="api",instance="192.168.123.248:2379",job="etcd",namespace="monitoring",service="etcd-k8s"} 1
  104. label_join(up{endpoint="api",instance="192.168.123.248:2379",job="etcd",namespace="monitoring",service="etcd-k8s"}, "foo", ",", "job", "service")
  105. => up{endpoint="api",foo="etcd,etcd-k8s",instance="192.168.123.248:2379",job="etcd",namespace="monitoring",service="etcd-k8s"} 1
  106. label_replace()
  107. 复制代码
  108. 为了能够让客户端的图标更具有可读性,可以通过 label_replace 函数为时间序列添加额外的标签。label_replace 的具体参数如下:
  109. label_replace(v instant-vector, dst_label string, replacement string, src_label string, regex string)
  110. 该函数会依次对 v 中的每一条时间序列进行处理,通过 regex 匹配 src_label 的值,并将匹配部分 replacement 写入到 dst_label 标签中。如下所示:
  111. label_replace(up, "host", "$1", "instance", "(.*):.*")
  112. 函数处理后,时间序列将包含一个 host 标签,host 标签的值为 Exporter 实例的 IP 地址:
  113. up{host="localhost",instance="localhost:8080",job="cadvisor"} 1
  114. up{host="localhost",instance="localhost:9090",job="prometheus"} 1
  115. up{host="localhost",instance="localhost:9100",job="node"} 1
  116. ln()
  117. ln(v instant-vector) 计算瞬时向量 v 中所有样本数据的自然对数。特殊情况:
  118. ln(+Inf) = +Inf
  119. ln(0) = -Inf
  120. ln(x < 0) = NaN
  121. ln(NaN) = NaN
  122. log2()
  123. log2(v instant-vector) 函数计算瞬时向量 v 中所有样本数据的二进制对数。特殊情况同上。
  124. log10()
  125. log10(v instant-vector) 计算瞬时向量 v 中所有样本数据的十进制对数。特殊情况同上。
  126. minute()
  127. minute(v=vector(time()) instant-vector) 函数返回给定 UTC 时间当前小时的第多少分钟。结果范围:0~59。
  128. month()
  129. month(v=vector(time()) instant-vector) 函数返回给定 UTC 时间当前属于第几个月,结果范围:1~12。
  130. predict_linear()
  131. predict_linear(v range-vector, t scalar) 函数可以预测时间序列 v 在 t 秒后的值。它基于简单线性回归的方式,对时间窗口内的样本数据进行统计,从而可以对时间序列的变化趋势做出预测。该函数的返回结果不带有度量指标,只有标签列表。
  132. 例如,基于 2 小时的样本数据,来预测主机可用磁盘空间是否会在 4 个小时后被占满,可以使用如下表达式:
  133. predict_linear(node_filesystem_free{job="node"}[2h], 4 * 3600) < 0
  134. 通过下面的例子来观察返回值:
  135. 复制代码
  136. predict_linear(http_requests_total{code="200",instance="120.77.65.193:9090",job="prometheus",method="get"}[5m], 5)
  137. 结果:
  138. {code="200",handler="query_range",instance="120.77.65.193:9090",job="prometheus",method="get"} 1
  139. {code="200",handler="prometheus",instance="120.77.65.193:9090",job="prometheus",method="get"} 4283.449995397104
  140. {code="200",handler="static",instance="120.77.65.193:9090",job="prometheus",method="get"} 22.99999999999999
  141. ...
  142. 复制代码
  143. 这个函数一般只用在 Gauge 类型的时间序列上。
  144. rate()
  145. rate(v range-vector) 函数可以直接计算区间向量 v 在时间窗口内平均增长速率,它会在单调性发生变化时(如由于采样目标重启引起的计数器复位)自动中断。该函数的返回结果不带有度量指标,只有标签列表。
  146. 例如,以下表达式返回区间向量中每个时间序列过去 5 分钟内 HTTP 请求数的每秒增长率:
  147. 复制代码
  148. rate(http_requests_total[5m])
  149. 结果:
  150. {code="200",handler="label_values",instance="120.77.65.193:9090",job="prometheus",method="get"} 0
  151. {code="200",handler="query_range",instance="120.77.65.193:9090",job="prometheus",method="get"} 0
  152. {code="200",handler="prometheus",instance="120.77.65.193:9090",job="prometheus",method="get"} 0.2
  153. ...
  154. 复制代码
  155. rate() 函数只能用于计数器类型的指标,在长期趋势分析或者告警中推荐使用这个函数。
  156. [info] 注意
  157. 当将 rate() 函数与聚合运算符(例如 sum())或随时间聚合的函数(任何以 _over_time 结尾的函数)一起使用时,必须先执行 rate 函数,然后再进行聚合操作,否则当采样目标重新启动时 rate() 无法检测到计数器是否被重置。
  158. resets()
  159. resets(v range-vector) 的参数是一个区间向量。对于每个时间序列,它都返回一个计数器重置的次数。两个连续样本之间的值的减少被认为是一次计数器重置。
  160. 这个函数一般只用在计数器类型的时间序列上。
  161. round()
  162. round(v instant-vector, to_nearest=1 scalar) 函数与 ceil 和 floor 函数类似,返回向量中所有样本值的最接近的整数。to_nearest 参数是可选的,默认为 1,表示样本返回的是最接近 1 的整数倍的值。你也可以将该参数指定为任意值(也可以是小数),表示样本返回的是最接近它的整数倍的值。
  163. scalar()
  164. scalar(v instant-vector) 函数的参数是一个单元素的瞬时向量,它返回其唯一的时间序列的值作为一个标量。如果度量指标的样本数量大于 1 或者等于 0, 则返回 NaN。
  165. sort()
  166. sort(v instant-vector) 函数对向量按元素的值进行升序排序,返回结果:key: value = 度量指标:样本值[升序排列]。
  167. sort_desc()
  168. sort_desc(v instant-vector) 函数对向量按元素的值进行降序排序,返回结果:key: value = 度量指标:样本值[降序排列]。
  169. sqrt()
  170. sqrt(v instant-vector) 函数计算向量 v 中所有元素的平方根。
  171. time()
  172. time() 函数返回从 1970-01-01 到现在的秒数。注意:它不是直接返回当前时间,而是时间戳
  173. timestamp()
  174. timestamp(v instant-vector) 函数返回向量 v 中的每个样本的时间戳(从 1970-01-01 到现在的秒数)。
  175. 该函数从 Prometheus 2.0 版本开始引入。
  176. vector()
  177. vector(s scalar) 函数将标量 s 作为没有标签的向量返回,即返回结果为:key: value= {}, s。
  178. year()
  179. year(v=vector(time()) instant-vector) 函数返回被给定 UTC 时间的当前年份。
  180. <aggregation>_over_time()
  181. 下面的函数列表允许传入一个区间向量,它们会聚合每个时间序列的范围,并返回一个瞬时向量:
  182. avg_over_time(range-vector) : 区间向量内每个度量指标的平均值。
  183. min_over_time(range-vector) : 区间向量内每个度量指标的最小值。
  184. max_over_time(range-vector) : 区间向量内每个度量指标的最大值。
  185. sum_over_time(range-vector) : 区间向量内每个度量指标的求和。
  186. count_over_time(range-vector) : 区间向量内每个度量指标的样本数据个数。
  187. quantile_over_time(scalar, range-vector) : 区间向量内每个度量指标的样本数据值分位数,φ-quantile (0 ≤ φ ≤ 1)。
  188. stddev_over_time(range-vector) : 区间向量内每个度量指标的总体标准差。
  189. stdvar_over_time(range-vector) : 区间向量内每个度量指标的总体标准方差。
  190. [info] 注意
  191. 即使区间向量内的值分布不均匀,它们在聚合时的权重也是相同的。

Prometheus Operator 自动发现

Alertmanager

Alertmanager主要负责对Prometheus产生的告警进行统一处理,因此在Alertmanager配置中一般会包含以下几个主要部分:

  • 全局配置(global):用于定义一些全局的公共参数,如全局的SMTP配置,Slack配置等内容;
  • 模板(templates):用于定义告警通知时的模板,如HTML模板,邮件模板等;
  • 告警路由(route):根据标签匹配,确定当前告警应该如何处理;
  • 接收人(receivers):接收人是一个抽象的概念,它可以是一个邮箱也可以是微信,Slack或者Webhook等,接收人一般配合告警路由使用;
  • 抑制规则(inhibit_rules):合理设置抑制规则可以减少垃圾告警的产生

alertmanager

  1. # global块配置下的配置选项在本配置文件内的所有配置项下可见
  2. global:
  3. # 在Alertmanager内管理的每一条告警均有两种状态: "resolved"或者"firing". 在alertmanager首次发送告警通知后, 该告警会一直处于firing状态,设置resolve_timeout可以指定处于firing状态的告警间隔多长时间会被设置为resolved状态, 在设置为resolved状态的告警后,alertmanager不会再发送firing的告警通知.
  4. resolve_timeout: 1h
  5. # 邮件告警配置
  6. smtp_smarthost: 'smtp.exmail.qq.com:25'
  7. smtp_from: 'dukuan@xxx.com'
  8. smtp_auth_username: 'dukuan@xxx.com'
  9. smtp_auth_password: 'DKxxx'
  10. # HipChat告警配置
  11. # hipchat_auth_token: '123456789'
  12. # hipchat_auth_url: 'https://hipchat.foobar.org/'
  13. # wechat
  14. wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  15. wechat_api_secret: 'JJ'
  16. wechat_api_corp_id: 'ww'
  17. # 告警通知模板
  18. templates:
  19. - '/etc/alertmanager/config/*.tmpl'
  20. # route: 根路由,该模块用于该根路由下的节点及子路由routes的定义. 子树节点如果不对相关配置进行配置,则默认会从父路由树继承该配置选项。每一条告警都要进入route,即要求配置选项group_by的值能够匹配到每一条告警的至少一个labelkey(即通过POST请求向alertmanager服务接口所发送告警的labels项所携带的<labelname>),告警进入到route后,将会根据子路由routes节点中的配置项match_re或者match来确定能进入该子路由节点的告警(由在match_re或者match下配置的labelkey: labelvalue是否为告警labels的子集决定,是的话则会进入该子路由节点,否则不能接收进入该子路由节点).
  21. route:
  22. # 例如所有labelkey:labelvalue含cluster=A及alertname=LatencyHigh labelkey的告警都会被归入单一组中
  23. group_by: ['job', 'alertname', 'cluster', 'service','severity']
  24. # 若一组新的告警产生,则会等group_wait后再发送通知,该功能主要用于当告警在很短时间内接连产生时,在group_wait内合并为单一的告警后再发送
  25. group_wait: 30s
  26. # 再次告警时间间隔
  27. group_interval: 5m
  28. # 如果一条告警通知已成功发送,且在间隔repeat_interval后,该告警仍然未被设置为resolved,则会再次发送该告警通知
  29. repeat_interval: 12h
  30. # 默认告警通知接收者,凡未被匹配进入各子路由节点的告警均被发送到此接收者
  31. receiver: 'wechat'
  32. # 上述route的配置会被传递给子路由节点,子路由节点进行重新配置才会被覆盖
  33. # 子路由树
  34. routes:
  35. # 该配置选项使用正则表达式来匹配告警的labels,以确定能否进入该子路由树
  36. # match_re和match均用于匹配labelkey为service,labelvalue分别为指定值的告警,被匹配到的告警会将通知发送到对应的receiver
  37. - match_re:
  38. service: ^(foo1|foo2|baz)$
  39. receiver: 'wechat'
  40. # 在带有service标签的告警同时有severity标签时,他可以有自己的子路由,同时具有severity != critical的告警则被发送给接收者team-ops-mails,对severity == critical的告警则被发送到对应的接收者即team-ops-pager
  41. routes:
  42. - match:
  43. severity: critical
  44. receiver: 'wechat'
  45. # 比如关于数据库服务的告警,如果子路由没有匹配到相应的owner标签,则都默认由team-DB-pager接收
  46. - match:
  47. service: database
  48. receiver: 'wechat'
  49. # 我们也可以先根据标签service:database将数据库服务告警过滤出来,然后进一步将所有同时带labelkey为database
  50. - match:
  51. severity: critical
  52. receiver: 'wechat'
  53. # 抑制规则,当出现critical告警时 忽略warning
  54. inhibit_rules:
  55. - source_match:
  56. severity: 'critical'
  57. target_match:
  58. severity: 'warning'
  59. # Apply inhibition if the alertname is the same.
  60. # equal: ['alertname', 'cluster', 'service']
  61. #
  62. # 收件人配置
  63. receivers:
  64. - name: 'team-ops-mails'
  65. email_configs:
  66. - to: 'dukuan@xxx.com'
  67. - name: 'wechat'
  68. wechat_configs:
  69. - send_resolved: true
  70. corp_id: 'ww'
  71. api_secret: 'JJ'
  72. to_tag: '1'
  73. agent_id: '1000002'
  74. api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  75. message: '{{ template "wechat.default.message" . }}'
  76. #- name: 'team-X-pager'
  77. # email_configs:
  78. # - to: 'team-X+alerts-critical@example.org'
  79. # pagerduty_configs:
  80. # - service_key: <team-X-key>
  81. #
  82. #- name: 'team-Y-mails'
  83. # email_configs:
  84. # - to: 'team-Y+alerts@example.org'
  85. #
  86. #- name: 'team-Y-pager'
  87. # pagerduty_configs:
  88. # - service_key: <team-Y-key>
  89. #
  90. #- name: 'team-DB-pager'
  91. # pagerduty_configs:
  92. # - service_key: <team-DB-key>
  93. #
  94. #- name: 'team-X-hipchat'
  95. # hipchat_configs:
  96. # - auth_token: <auth_token>
  97. # room_id: 85
  98. # message_format: html
  99. # notify: true

内置规则调整

  1. # 内置告警规则调整参考
  2. ## 默认调整
  3. 这里对KubeSphere的内置告警规则进行了适应性调整,指标类告警规则保留了平台资源和配额、节点资源类告警规则,kube-apiserver、kubelet、kube-scheduler、kube-controller-manager、prometheus等平台组件的告警规则,以及k8s应用类的告警规则。事件告警规则仅保留启用了集群关键事件的告警规则。
  4. 请参考以下步骤更新到集群。
  5. 1. 内置指标告警规则
  6. 如果k8s版本大于等于v1.16,使用如下命令更新:
  7. kubectl apply -f https://raw.githubusercontent.com/junotx/mixin/main/ks/ee/kuais/rules/prometheus-rules-v1.16+.yaml
  8. 否则,请使用下列命令:
  9. kubectl apply -f https://raw.githubusercontent.com/junotx/mixin/main/ks/ee/kuais/rules/prometheus-rules.yaml
  10. 2. 内置事件告警规则
  11. kubectl apply -f https://raw.githubusercontent.com/junotx/mixin/main/prom/rules/kuais/ks-events-cluster-rules-default.yaml
  12. ## 自定义调整
  13. https://github.com/junotx/mixin/blob/main/ks/ee/kuais/builtin_rules_refer.md

KubeSphere内置规则参考

https://github.com/junotx/mixin/blob/main/ks/ee/kuais/doc.md

1 指标规则

指标规则用来对平台物理资源、应用资源和关键性组件的各类指标(Prometheus格式的指标)进行评估和告警。

1.1 规则结构

指标规则即Prometheus规则,在KubeSphere平台存储在prometheusrules.monitoring.coreos.com所定义的资源中,其Spec结构如下:

  1. spec
  2. └──groups
  3. |
  4. | ┌──name (group name)
  5. 0──|
  6. | └──rules
  7. | |
  8. | | ┌──expr
  9. | 0——|──labels (recording rule)
  10. | | └──record
  11. | |
  12. | 1,2...
  13. |
  14. |
  15. | ┌──name (group name)
  16. 1——|
  17. | └──rules
  18. | |
  19. | | ┌──alert
  20. | | |──annotations
  21. | 0——|——expr (alerting rule)
  22. | | |——for
  23. | | └──labels
  24. | |
  25. | 1,2...
  26. |
  27. 2,3...

一个规则组中的规则通常只包括recording rules或只包括alerting rules

1.2 内置规则更新

这里仅针对用于告警目的的内置指标规则进行操作

KubeSphere内置了一些必要的指标告警规则,对平台物理资源、应用资源、关键性组件的各类指标进行告警。各内置告警规则的含义请参考附录中的内置指标告警规则

规则主要位于kubesphere-monitoring-system项目下的prometheus-k8s-rules资源中,通过以下命令可修改其中的规则:

  1. kubectl -n kubesphere-monitoring-system edit prometheusrules.monitoring.coreos.com prometheus-k8s-rules

该命令会进入到资源的编辑界面,编辑用法与linux中编辑文件的vim命令类似。

请参考前文的指标规则结构,对需要调整的告警规则进行操作,比如更新、删除等,然后保存后(同vim命令的保存操作)即可自动同步更新至Prometheus组件。

当只针对个别的告警规则进行删除操作时,可以参考使用以下删除单个告警规则的快捷命令:

  1. # 这里将删除prometheus-k8s-rules资源中名称为KubePodCrashLooping、级别为warning的告警规则
  2. # 若要删除其他规则,请调整命令中相应位置处的规则名称和规则级别
  3. kubectl -n kubesphere-monitoring-system get prometheusrules.monitoring.coreos.com prometheus-k8s-rules -ojson | jq 'delpaths([path(..|select(.alert?=="KubePodCrashLooping" and .labels.severity?=="warning"))])' | kubectl apply -f -

2 事件规则

事件规则用来对K8S中的Event资源进行评估和告警

2.1 规则结构

事件规则在KubeSphere平台存储在rules.events.kubesphere.io所定义的资源中,其Spec结构如下:

  1. spec
  2. └──rules
  3. |
  4. | ┌──annotations
  5. | |——condition
  6. 0——|——enable (取值true/false来启用或禁用规则,默认false)
  7. | |——labels
  8. | |——name
  9. | └──type (取值alert/notification表示规则的类型)
  10. |
  11. |
  12. 1,2...

对于type=notification的非告警规则在当前的KubeSphere版本中未有应用,暂可忽略。

2.2 内置规则更新

这里仅针对用于告警目的的内置事件规则进行操作

KubeSphere内置了一些必要的事件告警规则,对平台各类事件进行告警。各内置事件告警规则的定义请参考附录中的内置事件告警规则

基本规则位于kubesphere-logging-system项目下的ks-events-cluster-rules-default资源中,通过以下命令可修改其中的规则:

  1. kubectl -n kubesphere-logging-system edit rules.events.kubesphere.io ks-events-cluster-rules-default

该命令会进入到资源的编辑界面,编辑用法与linux中编辑文件的vim命令类似。

请参考前文的事件规则结构,对需要调整的告警规则进行操作,比如规则禁用、更新、删除等,然后保存后(同vim命令的保存操作)即可自动同步更新至EventsRuler组件(该组件负责加载事件规则、触发事件告警)。

当只针对个别的告警规则进行删除操作时,可以参考使用以下删除单个告警规则的快捷命令:

  1. # 这里将删除ks-events-cluster-rules-default资源中名称为ContainerBackoff、级别为warning的告警规则
  2. # 若要删除其他规则,请调整命令中相应位置处的规则名称和规则级别
  3. kubectl -n kubesphere-logging-system get rules.events.kubesphere.io ks-events-cluster-rules-default -ojson | jq 'delpaths([path(..|select(.type?=="alert" and .name?=="ContainerBackoff" and .labels.severity?=="warning"))])' | kubectl apply -f -

3 附录

3.1 内置指标告警规则

规则名称 级别 说明
kube-state-metrics KubeStateMetricsListErrors critical kube-state-metrics执行k8s资源的list操作异常,可能无法导出对应资源的指标数据
KubeStateMetricsWatchErrors critical kube-state-metrics执行k8s资源的watch操作异常,可能无法导出对应资源的指标数据
node-exporter NodeFilesystemSpaceFillingUp warning 节点存储空间即将用尽
NodeFilesystemSpaceFillingUp critical 节点存储空间即将用尽
NodeFilesystemAlmostOutOfSpace warning 节点存储空间几乎用尽
NodeFilesystemAlmostOutOfSpace critical 节点存储空间几乎用尽
NodeFilesystemFilesFillingUp warning 节点inodes即将用尽
NodeFilesystemFilesFillingUp critical 节点inodes即将用尽
NodeFilesystemAlmostOutOfFiles warning 节点inodes几乎用尽
NodeFilesystemAlmostOutOfFiles critical 节点inodes几乎用尽
NodeNetworkReceiveErrs warning 节点接收网络数据异常多
NodeNetworkTransmitErrs warning 节点发送网络数据异常多
NodeHighNumberConntrackEntriesUsed warning 节点conntrack使用量接近限制
NodeClockSkewDetected warning 节点时钟倾斜
kubernetes-apps KubePodCrashLooping warning 容器组频繁重启
KubePodNotReady warning 容器组长时间未就绪
KubeDeploymentGenerationMismatch warning Deployment版本号不匹配
KubeDeploymentReplicasMismatch warning Deployment副本数不匹配
KubeStatefulSetReplicasMismatch warning StatefulSet副本数不匹配
KubeStatefulSetGenerationMismatch warning StatefulSet版本号不匹配
KubeStatefulSetUpdateNotRolledOut warning StatefulSet更新未完成滚动发布
KubeDaemonSetRolloutStuck warning DaemonSet滚动更新阻塞
KubeContainerWaiting warning 容器长时间处于等待状态
KubeDaemonSetNotScheduled warning DaemonSet的pod未调度
KubeDaemonSetMisScheduled warning DaemonSet的pod调度位置不对
KubeCronJobRunning warning CronJob完成任务耗时久
KubeJobCompletion warning Job耗时久
KubeJobFailed warning Job执行失败
KubeHpaReplicasMismatch warning HPA副本数不匹配
KubeHpaMaxedOut warning HPA长时间处于最大副本状态
kubernetes-resources KubeCPUOvercommit warning k8s集群CPU资源请求超额,将无法容忍节点故障
KubeMemoryOvercommit warning k8s集群内存资源请求超额,将无法容忍节点故障
KubeCPUQuotaOvercommit warning namespace的cpu资源请求超额
KubeMemoryQuotaOvercommit warning namespace的内存资源请求超额
KubeQuotaExceeded warning namespace的资源用量高
CPUThrottlingHigh warning cpu处于节制状态时间占比高
kubernetes-storage KubePersistentVolumeFillingUp critical 持久化存储卷空间即将用尽
KubePersistentVolumeFillingUp warning 持久化存储卷空间即将用尽
KubePersistentVolumeErrors critical 持久化存储卷状态异常
kube-apiserver-slos KubeAPIErrorBudgetBurn critical kube-apiserver组件异常多
KubeAPIErrorBudgetBurn critical kube-apiserver组件异常多
KubeAPIErrorBudgetBurn warning kube-apiserver组件异常多
KubeAPIErrorBudgetBurn warning kube-apiserver组件异常多
kubernetes-system-apiserver KubeAPILatencyHigh warning KubeAPI资源请求延迟时间长
KubeAPIErrorsHigh warning KubeAPI资源请求异常率高
KubeClientCertificateExpiration warning k8s客户端证书将过期
KubeClientCertificateExpiration critical k8s客户端证书将过期
AggregatedAPIErrors warning AggregatedAPI异常,异常值高表示相关服务的可用性频繁切换
AggregatedAPIDown warning AggregatedAPI不可用
KubeAPIDown critical KubeAPI不可用
kubernetes-system-kubelet KubeNodeNotReady warning k8s节点长时间未就绪
KubeNodeUnreachable warning k8s节点不可达
KubeletTooManyPods warning 节点的pod使用率高
KubeNodeReadinessFlapping warning 节点就绪状态频繁变化
KubeletPlegDurationHigh warning kubelet的PLEG操作耗时长
KubeletPodStartUpLatencyHigh warning kubelet启动pod时间长
KubeletDown critical kubelet不可用
kubernetes-system-scheduler KubeSchedulerDown critical kube-scheduler不可用
kubernetes-system-controller-manager KubeControllerManagerDown critical kube-controller-manager不可用
prometheus PrometheusBadConfig critical prometheus加载配置文件失败
PrometheusNotificationQueueRunningFull warning prometheus的告警通知队列将满
PrometheusErrorSendingAlertsToSomeAlertmanagers warning prometheus发送告警到部分alertmanager实例出错
PrometheusErrorSendingAlertsToAnyAlertmanager critical prometheus发送告警到所有alertmanager实例出错
PrometheusNotConnectedToAlertmanagers warning prometheus未连接任何alertmanager
PrometheusTSDBReloadsFailing warning prometheus加载磁盘块数据失败
PrometheusTSDBCompactionsFailing warning prometheus执行compact操作失败
PrometheusNotIngestingSamples warning prometheus未摄入数据
PrometheusDuplicateTimestamps warning prometheus摄入数据的时间戳重复,重复时间戳的数据将被丢弃
PrometheusOutOfOrderTimestamps warning prometheus摄入数据的时间戳出现乱序,相应的数据将被丢弃
PrometheusRemoteStorageFailures critical prometheus写远程数据失败
PrometheusRemoteWriteBehind critical prometheus写远程数据滞后时间长
PrometheusRemoteWriteDesiredShards warning prometheus写远程需要更多shards。prometheus写远程时会启用多个shards并行写,当计算的最优shards数大于配置shards数时,会触发该告警
PrometheusRuleFailures critical prometheus规则评估异常
PrometheusMissingRuleEvaluations warning prometheus错过规则评估,一般是由于规则评估过慢
alertmanager.rules AlertmanagerConfigInconsistent critical alertmanager配置不同步
AlertmanagerFailedReload warning alertmanager加载配置失败
AlertmanagerMembersInconsistent critical alertmanager节点状态不一致,找不到集群内其他节点
general.rules TargetDown warning Target服务的副本不可用率高
Watchdog none
node-network NodeNetworkInterfaceFlapping warning 节点网络接口状态频繁变化
prometheus-operator PrometheusOperatorReconcileErrors warning prometheus-operator reconcile操作异常
PrometheusOperatorNodeLookupErrors warning prometheus-operator reconcile prometheus异常

3.2 内置事件告警规则

规则名称 级别 说明
ContainerFailed warning 容器失败
ContainerPreempting warning 容器抢占中
ContainerBackoff warning 容器回退
ContainerUnhealthy warning 容器状态不良
ContainerProbeWarning warning 容器探测警告
PodKillingExceededGracePeriod warning pod终止超时
PodKillFailed warning pod终止失败
PodContainerCreateFailed warning pod容器创建失败
PodFailed warning pod失败
PodNetworkNotReady warning Pod网络异常
ImagePullPolicyError warning 镜像拉取策略错误
ImageInspectFailed warning 镜像检查失败
KubeletSetupFailed warning kubelet安装失败
VolumeAttachFailed warning 存储卷装载失败
VolumeMountFailed warning 存储卷挂载失败
VolumeResizeFailed warning 存储卷扩缩容失败
FileSystemResizeFailed warning 文件系统扩缩容失败
VolumeMapFailed warning 存储卷映射失败
VolumeAlreadyMounted warning 存储卷已被挂载
NodeRebooted warning 节点重启
ContainerGCFailed warning 容器GC失败
ImageGCFailed warning 镜像GC失败
NodeAllocatableEnforcementFailed warning 节点可分配资源更新失败
SandboxCreateFailed warning Sandbox创建失败
SandboxStatusFailed warning 获取Sandbox状态错误
DiskCapacityInvalid warning 磁盘容量配置不合法
DiskSpaceFreeFailed warning 磁盘空间释放失败
PodStatusSyncFailed warning Pod状态同步失败
ConfigurationValidationFailed warning 配置验证失败
LifecycleHookPostStartFailed warning 容器启动后的生命周期钩子运行失败
LifecycleHookPreStopFailed warning 容器停止前的生命周期钩子运行失败
HPASelectorError warning HPA选择器错误
HPAMetricError warning HPA对象指标错误
HPAConvertFailed warning HPA转换失败
HPAGetScaleFailed warning HPA规模获取失败
HPAComputeReplicasFailed warning HPA副本计算失败
HPARescaleFailed warning HPA规模调整失败
NodeSystemOOM warning 节点内存溢出
VolumeBindingFailed warning 存储卷绑定失败
VolumeMismatch warning 存储卷不匹配
VolumeRecycleFailed warning 存储卷回收失败
VolumeRecyclerPodError warning 存储卷回收器错误
VolumeDeleteFailed warning 存储卷删除失败
VolumeProvisionFailed warning 存储申请失败
VolumeProvisionCleanupFailed warning 清理存储失败
VolumeExternalExpandingError warning 存储外部扩展错误
PodScheduleFailed warning pod调度失败
PodCreateFailed warning pod创建失败
PodDeleteFailed warning pod删除失败
ReplicaSetCreateError warning 副本集创建错误
DeploymentRollbackFailed warning 部署回滚失败
DeploySelectorAll warning deploy选择了所有pod
DaemonSelectorAll warning daemonset选择了所有pod
DaemonPodFailed warning daemonset的pod失败
LoadBalancerSyncFailed warning 负载均衡器同步失败
LoadBalancerUnAvailable warning 负载均衡器不可用
LoadBalancerUpdateFailed warning 更新负载均衡器失败
LoadBalancerDeleteFailed warning 负载均衡器删除失败
JobGetFailed warning 任务获取失败
JobCreateFailed warning 任务创建失败
JobDeleteFailed warning 任务删除失败
JobUnexpected warning 任务非预期
JobScheduleFailed warning 任务调度失败
  1. # 调整 Alertmanager replica
  2. kubectl -n kubesphere-monitoring-system edit alertmanagers.monitoring.coreos.com main
  3. # 调整 Alertmanager 配置 , 需要把内容拷贝出来 base64 解码,改完后再base64编码写进去
  4. kubectl -n kubesphere-monitoring-system edit secrets alertmanager-main
  5. # 修改 rule 也要改crd
  6. kubectl -n kubesphere-monitoring-system edit prometheusrules.monitoring.coreos.com prometheus-k8s-rules

Inactive:非活动状态,表示正在监控,但是还未有任何警报触发。
Pending:表示警报的触发条件已经满足,但尚未达到 for 所指定的持续时间,处于等待状态;一旦持续时间满足,则将转到 Firing 状态。
Firing:将警报发送到 AlertManager,它将按照配置将警报发送给所有接收者。一旦警报解除,则将状态转到 Inactive,如此循环。

KubeSphere平台从3.1版本开始重新调整了告警系统的设计,可以兼容Prometheus风格的告警规则。这里所提及的告警仅针对各类资源的指标进行告警。

KubeSphere定义了自定义告警策略的API,并提供了界面化的交互式UI,方便用户对自定义告警策略的增删改查操作。区别于自定义告警策略,3.1版本之前Prometheus所内置的告警规则保留为内置告警策略,为兼容平台外部Prometheus,Console仅提供了内置告警策略的查看功能。

为支持多租户场景,自定义告警策略分为了集群和项目两个层级。这两个层级都针对常用的指标告警场景提供了便捷的模板化配置,也开放了自定义PromQL来满足复杂的业务。

告警策略存储方式

无论是自定义告警策略,还是内置告警策略(这里仅指平台内置Prometheus的策略),它们都首先存储在prometheus-operator所定义的prometheusrules.monitoring.coreos.com资源中。该资源的Spec结构请参考下图。这些资源的更新将由prometheus-operator同步至告警系统中。

  1. spec
  2. └──groups
  3. |
  4. | ┌──name (group name)
  5. 0──|
  6. | └──rules
  7. | |
  8. | | ┌──expr
  9. | 0——|──labels (recording rule)
  10. | | └──record
  11. | |
  12. | 1,2...
  13. |
  14. |
  15. | ┌──name (group name)
  16. 1——|
  17. | └──rules
  18. | |
  19. | | ┌──alert
  20. | | |──annotations
  21. | 0——|——expr (alerting rule)
  22. | | |——for
  23. | | └──labels
  24. | |
  25. | 1,2...
  26. |
  27. 2,3...

这里请只参考告警规则,即alerting rules。
一个规则组中的规则通常只包括recording rules或只包括alerting rules。

平台默认配置下,可以通过命令kubectl -n kubesphere-monitoring-system get prometheusrules -l prometheus=k8s,role=alert-rules获取所有内置告警策略存储的资源,通过命令kubectl get prometheusrules -l thanosruler=thanos-ruler,role=thanos-alerting-rules -A获取自定义告警策略存储的资源。

请勿手动修改自定义告警策略的CRD资源,而应通过Console或API调用来更新策略。

以下是单个告警策略的存储结构说明:

  1. alert: <string>
  2. expr: <string>
  3. for: <duration>
  4. labels:
  5. [<label_name>: <label_value>...]
  6. annotations:
  7. [<annotation_name>: <annotation_value>...]
  • alert: 策略名称/规则名称/告警名称。
  • expr: 规则表达式,一个合法的PromQL表达式。
  • for: 告警持续时间。达到该持续时间的告警消息才被下发。
  • labels: 标签集。通常会有一个名称为severity,值为warning/error/critical的标签来标识告警的严重程度。这些labels将被加入到告警消息的labels中。
  • annotations: 注解集。用来丰富通知消息的内容。通常会有一个名称为summary的注解说明告警消息的摘要信息,和一个名称为message的注解说明告警消息的详细信息。

告警原理说明

这里以TargetDown这个内置告警策略为例,进行告警原理的说明。

该策略的目的是,针对Prometheus的抓取目标服务异常情况进行告警,当某个目标服务的副本不可用率大于10%,且持续超过10分钟时,发送告警消息。

  1. alert: TargetDown
  2. annotations:
  3. message: >-
  4. {{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{
  5. $labels.service }} targets in {{ $labels.namespace }} namespace
  6. are down.
  7. expr: >-
  8. 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY
  9. (job, namespace, service)) > 10
  10. for: 10m
  11. labels:
  12. severity: warning

告警系统在发现该策略后,将通过expr表达式来周期性地查询指标系统,结果集将是副本不可用率大于10%的那些目标服务。如果在for所指定的时间范围内,每次查询的结果集之中都包含目标服务A,那么,以TargetDown命名且包含服务A属性的告警消息,就将被发送到下游通知系统。这之后,如果查询结果集中继续包含A服务,相应的告警消息将继续发送,反之则在下次查询结果集中出现服务A时进行重新计时,直到再次满足for所指定的时间范围。

告警消息主要包括alertname、labels、annotations三个属性。alertname来自于告警策略名称,labels包含了表达式查询结果中的labels和告警策略中的labels,annotations来自于告警策略的annotations。

告警策略中的annotations支持配置模板,具体请参考这里,模板执行后的结果会放在告警消息的annotations中。

持续时间未设置或被设置为0时,告警系统通过expr查询到结果后,将不等待就发送告警消息到下游。

自定义告警策略

配置说明

具体的配置步骤请参考集群告警策略配置文档和项目告警策略配置文档。

这里针对自定义告警策略API所定义的告警策略数据传输结构与Console上的界面要素的关系进行说明。前者的结构如下:

  1. name: <rule_name>
  2. query: <query_string>
  3. duration: <duration>
  4. labels:
  5. [<label_name>: <label_value>...]
  6. annotations:
  7. [<annotation_name>: <annotation_value>...]

这里的name, query, duration,分别与前文告警策略存储结构中的alert, expr, for一一对应。

策略名称:自定义策略的名称要求是一个合法的k8s资源名称,这与Prometheus有所区别。

持续时间:对应到duration属性。

告警级别:将作为标签添加到labels中,标签名severity,标签值支持warning、error、critical,依次是一般告警、重要告警、危险告警,告警的严重程度或紧急程度依次递增。

规则模板/自定义规则:使用规则模板配置时,将根据输入来自动组装PromQL表达式,填充到query中。而自定义规则则是直接配置query表达式。

通知内容: 用来丰富告警消息的内容。其中的标题作为名称为summary的注解添加到annotations,消息则对应到名称为message的注解。

query表达式和duration的评估结果,决定了告警消息是否产生和是否下发。据此,告警策略的告警状态分为了以下三种:

  • 未触发:表示二者条件都不满足,此时未产生告警消息。
  • 待触发:表示满足query但不满足duration,可以认为此时已产生告警消息,但暂未下发。
  • 触发中:表示二者条件都满足,此时已开始(或正准备)发送告警消息到下游通知系统。

模板规则配置参考

集群级别

Console上集群层级的告警策略提供了配置模板,可以针对节点的CPU、内存、本地磁盘、网络等各种资源类指标,进行快速的告警规则配置。下表列出了针对这些指标的建议阈值配置,提供给用户配置时参考。

指标名称 操作符 建议阈值 单位
容器组异常率 >
>=
3 %
容器组利用率 >
>=
80 %
CPU利用率 >
>=
80 %
CPU 1分钟平均负载 >
>=
- Core
CPU 5分钟平均负载 >
>=
- Core
可用内存 <
<=
- GB
内存利用率 >
>=
80 %
本地磁盘可用空间 <
<=
- GB
本地磁盘空间利用率 >
>=
80 %
inode利用率 >
>=
80 %
本地磁盘读取IOPS >
>=
- 次数/s
本地磁盘写入IOPS >
>=
- 次数/s
本地磁盘读取吞吐量 >
>=
- KB/s
本地磁盘写入吞吐量 >
>=
- KB/s
网络发送数据速率 >
>=
- Mbps
网络接收数据速率 >
>=
- Mbps
  • 单位已由Console指定,配置时无需设定。
  • 未给出建议阈值的指标,用户请根据平台规模和业务需要自行配置。

项目级别

Console为项目层级的告警规则配置,提供了针对部署、有状态副本集、守护进程集等工作负载,CPU用量、内存用量、网络数据收发速率、副本不可用率等指标在内的模板化告警规则配置。

指标名称 操作符 建议阈值 单位
CPU用量 >
>=
- Core
内存用量 >
>=
- Mi
内存用量(包含缓存) >
>=
- Mi
网络发送数据速率 >
>=
- Kbps
网络接收数据速率 >
>=
- Kbps
副本不可用率 >
>=
- %
  • 单位已由Console指定,配置时无需设定。
  • 这里未给出建议阈值,请根据实际业务需求进行配置。

内置告警策略

KubeSphere内置了一些必要的告警策略,对平台物理资源、应用资源、关键性组件的各类指标进行告警。这些内置告警策略将由Prometheus组件来评估和告警,它们的含义请参考附录一:内置告警策略表。

通过集群管理的告警策略页可以查询和查看内置告警策略。通常不建议对这些内置告警策略进行调整,若有需求,请参考后续的配置说明。

通过命令kubectl -n kubesphere-monitoring-system get prometheusrules -l prometheus=k8s,role=alert-rules可以获取存储内置告警策略的资源。

配置说明

内置告警策略的绝大部分位于kubesphere-monitoring-system项目下的prometheus-k8s-rules资源中,该资源的结构请参考前述的告警策略存储方式。通过以下命令可修改其中的策略规则:

  1. kubectl -n kubesphere-monitoring-system edit prometheusrules.monitoring.coreos.com prometheus-k8s-rules

该命令会进入到资源的编辑界面,编辑用法与linux中编辑文件的vim命令类似。

请参考前文的告警策略结构,对需要调整的告警策略进行操作,比如更新、删除等,然后保存后(同vim命令的保存操作)即可自动同步更新至Prometheus组件。

当只针对个别的内置告警策略进行删除操作时,请参考使用以下删除单个告警策略的快捷命令:

  1. # 这里将删除prometheus-k8s-rules资源中名称为KubePodCrashLooping、级别为warning的告警规则
  2. # 若要删除其他规则,请调整命令中相应位置处的规则名称和规则级别
  3. kubectl -n kubesphere-monitoring-system get prometheusrules.monitoring.coreos.com prometheus-k8s-rules -ojson | jq 'delpaths([path(..|select(.alert?=="KubePodCrashLooping" and .labels.severity?=="warning"))])' | kubectl apply -f -

etcd相关的内置告警策略位于kubesphere-monitoring-system项目下的prometheus-k8s-etcd-rules资源中。

附录一:内置告警策略表

规则名称 持续时间 级别 说明
kube-state-metrics KubeStateMetricsListErrors 15m critical kube-state-metrics执行k8s资源的list操作异常,可能无法导出对应资源的指标数据
KubeStateMetricsWatchErrors 15m critical kube-state-metrics执行k8s资源的watch操作异常,可能无法导出对应资源的指标数据
node-exporter NodeFilesystemSpaceFillingUp 1h warning 节点存储空间即将用尽(预计未来24小时将用尽时)
NodeFilesystemSpaceFillingUp 1h critical 节点存储空间即将用尽(预计未来4小时将用尽时)
NodeFilesystemAlmostOutOfSpace 1h warning 节点存储空间几乎用尽(存储少于5%)
NodeFilesystemAlmostOutOfSpace 1h critical 节点存储空间几乎用尽(存储少于3%)
NodeFilesystemFilesFillingUp 1h warning 节点inodes即将用尽(预计未来24小时将用尽时)
NodeFilesystemFilesFillingUp 1h critical 节点inodes即将用尽(预计未来4小时将用尽时)
NodeFilesystemAlmostOutOfFiles 1h warning 节点inodes几乎用尽(inodes少于5%)
NodeFilesystemAlmostOutOfFiles 1h critical 节点inodes几乎用尽(inodes少于3%)
NodeNetworkReceiveErrs 1h warning 节点接收网络数据异常多
NodeNetworkTransmitErrs 1h warning 节点发送网络数据异常多
NodeHighNumberConntrackEntriesUsed warning 节点conntrack使用量接近限制
NodeClockSkewDetected 10m warning 节点时钟倾斜
kubernetes-apps KubePodCrashLooping 15m warning 容器组频繁重启
KubePodNotReady 15m warning 容器组长时间未就绪
KubeDeploymentGenerationMismatch 15m warning Deployment版本号不匹配
KubeDeploymentReplicasMismatch 15m warning Deployment副本数不匹配
KubeStatefulSetReplicasMismatch 15m warning StatefulSet副本数不匹配
KubeStatefulSetGenerationMismatch 15m warning StatefulSet版本号不匹配
KubeStatefulSetUpdateNotRolledOut 15m warning StatefulSet更新未完成滚动发布
KubeDaemonSetRolloutStuck 15m warning DaemonSet滚动更新阻塞
KubeContainerWaiting 1h warning 容器长时间处于等待状态
KubeDaemonSetNotScheduled 10m warning DaemonSet的pod未调度
KubeDaemonSetMisScheduled 15m warning DaemonSet的pod调度位置不对
KubeCronJobRunning 1h warning CronJob完成任务耗时久
KubeJobCompletion 1h warning Job耗时久
KubeJobFailed 15m warning Job执行失败
KubeHpaReplicasMismatch 15m warning HPA副本数不匹配
KubeHpaMaxedOut 15m warning HPA长时间处于最大副本状态
kubernetes-resources KubeCPUOvercommit 5m warning k8s集群CPU资源请求超额,将无法容忍节点故障
KubeMemoryOvercommit 5m warning k8s集群内存资源请求超额,将无法容忍节点故障
KubeCPUQuotaOvercommit 5m warning namespace的cpu资源请求超额
KubeMemoryQuotaOvercommit 5m warning namespace的内存资源请求超额
KubeQuotaExceeded 15m warning namespace的资源用量高
CPUThrottlingHigh 15m warning cpu处于节制状态时间占比高
kubernetes-storage KubePersistentVolumeFillingUp 1m critical 持久化存储卷空间即将用尽(存储剩余少于3%时)
KubePersistentVolumeFillingUp 1h warning 持久化存储卷空间即将用尽(存储剩余少于15%并且预计未来4天将用尽时)
KubePersistentVolumeErrors 5m critical 持久化存储卷状态异常
kube-apiserver-slos KubeAPIErrorBudgetBurn 2m critical kube-apiserver组件异常多(高时延+返回码5xx的请求占比在最近1小时内和5分钟内都大于14.4%时)
KubeAPIErrorBudgetBurn 15m critical kube-apiserver组件异常多(高时延+返回码5xx的请求占比在最近6小时内和30分钟内都大于6%时)
KubeAPIErrorBudgetBurn 1h warning kube-apiserver组件异常多(高时延+返回码5xx的请求占比在最近1天内和2小时内都大于3%时)
KubeAPIErrorBudgetBurn 3h warning kube-apiserver组件异常多(高时延+返回码5xx的请求占比在最近3天内和6小时内都大于1%时)
kubernetes-system-apiserver KubeAPILatencyHigh 5m warning KubeAPI资源请求延迟时间长
KubeAPIErrorsHigh 10m warning KubeAPI资源请求异常率高
KubeClientCertificateExpiration warning k8s客户端证书将过期(距离证书过期少于7天时)
KubeClientCertificateExpiration critical k8s客户端证书将过期(距离证书过期少于24小时)
AggregatedAPIErrors warning AggregatedAPI异常,异常值高表示相关服务的可用性频繁切换
AggregatedAPIDown 5m warning AggregatedAPI不可用
KubeAPIDown 15m critical KubeAPI不可用
kubernetes-system-kubelet KubeNodeNotReady 15m warning k8s节点长时间未就绪
KubeNodeUnreachable 2m warning k8s节点不可达
KubeletTooManyPods 15m warning 节点的pod使用率高
KubeNodeReadinessFlapping 15m warning 节点就绪状态频繁变化
KubeletPlegDurationHigh 5m warning kubelet的PLEG操作耗时长
KubeletPodStartUpLatencyHigh 15m warning kubelet启动pod时间长
KubeletDown 15m critical kubelet不可用
kubernetes-system-scheduler KubeSchedulerDown 15m critical kube-scheduler不可用
kubernetes-system-controller-manager KubeControllerManagerDown 15m critical kube-controller-manager不可用
prometheus PrometheusBadConfig 10m critical prometheus加载配置文件失败
PrometheusNotificationQueueRunningFull 15m warning prometheus的告警通知队列将满
PrometheusErrorSendingAlertsToSomeAlertmanagers 15m warning prometheus发送告警到部分alertmanager实例出错
PrometheusErrorSendingAlertsToAnyAlertmanager 15m critical prometheus发送告警到所有alertmanager实例出错
PrometheusNotConnectedToAlertmanagers 10m warning prometheus未连接任何alertmanager
PrometheusTSDBReloadsFailing 4h warning prometheus加载磁盘块数据失败
PrometheusTSDBCompactionsFailing 4h warning prometheus执行compact操作失败
PrometheusNotIngestingSamples 10m warning prometheus未摄入数据
PrometheusDuplicateTimestamps 10m warning prometheus摄入数据的时间戳重复,重复时间戳的数据将被丢弃
PrometheusOutOfOrderTimestamps 10m warning prometheus摄入数据的时间戳出现乱序,相应的数据将被丢弃
PrometheusRemoteStorageFailures 15m critical prometheus写远程数据失败
PrometheusRemoteWriteBehind 15m critical prometheus写远程数据滞后时间长
PrometheusRemoteWriteDesiredShards 15m warning prometheus写远程需要更多shards。prometheus写远程时会启用多个shards并行写,当计算的最优shards数大于配置shards数时,会触发该告警
PrometheusRuleFailures 15m critical prometheus规则评估异常
PrometheusMissingRuleEvaluations 15m warning prometheus错过规则评估,一般是由于规则评估过慢
alertmanager.rules AlertmanagerConfigInconsistent 5m critical alertmanager配置不同步
AlertmanagerFailedReload 10m warning alertmanager加载配置失败
AlertmanagerMembersInconsistent 5m critical alertmanager节点状态不一致,找不到集群内其他节点
general.rules TargetDown 10m warning Target服务的副本不可用率高
Watchdog none
node-network NodeNetworkInterfaceFlapping 2m warning 节点网络接口状态频繁变化
prometheus-operator PrometheusOperatorReconcileErrors 10m warning prometheus-operator reconcile操作异常
PrometheusOperatorNodeLookupErrors 10m warning prometheus-operator reconcile prometheus异常
etcd etcdMembersDown 3m critical etcd节点不可用
etcdInsufficientMembers 3m critical etcd可用节点不足
etcdNoLeader 1m critical etcd没有leader节点
etcdHighNumberOfLeaderChanges 5m warning etcd的leader节点频繁变更
etcdHighNumberOfFailedGRPCRequests 10m warning etcd的grpc请求失败率高(失败请求占比超过1%)
etcdHighNumberOfFailedGRPCRequests 5m critical etcd的grpc请求失败率高(失败请求占比超过5%)
etcdGRPCRequestsSlow 10m critical etcd处理GRPC请求慢
etcdMemberCommunicationSlow 10m warning etcd节点间通信慢
etcdHighNumberOfFailedProposals 15m warning etcd的proposal失败率高
etcdHighFsyncDurations 10m warning etcd的fsync操作高延迟
etcdHighCommitDurations 10m warning etcd的commit操作高延迟
etcdHighNumberOfFailedHTTPRequests 10m warning etcd的http请求失败率高(失败请求占比超过1%)
etcdHighNumberOfFailedHTTPRequests 10m critical etcd的http请求失败率高(失败请求占比超过5%)
etcdHTTPRequestsSlow 10m warning etcd处理http请求慢


Notification Manager

告警通知
https://github.com/kubesphere/notification-manager

使用 Notification Manager 构建云原生通知系统.pptx

租户级别告警
https://www.yuque.com/liweiming/pguwkg/zm2nu8

参考
Prometheus Operator 介绍与配置解析
策略类型说明
Prometheus系列课程.pdf
http://www.zhaowenyu.com/prometheus-doc/operator/what-is-prometheus-operator.html

监控nacos

服务发现

image.png

  1. apiVersion: monitoring.coreos.com/v1
  2. kind: ServiceMonitor
  3. metadata:
  4. name: nacos
  5. namespace: infrastructure-prod
  6. spec:
  7. endpoints:
  8. - port: http
  9. interval: 30s
  10. path: /nacos/actuator/prometheus
  11. jobLabel: app.kubernetes.io/name
  12. namespaceSelector:
  13. matchNames:
  14. - infrastructure-prod
  15. selector:
  16. matchLabels:
  17. app.kubernetes.io/name: nacos

image.png
监控面板

  1. {
  2. "annotations": {
  3. "list": [
  4. {
  5. "builtIn": 1,
  6. "datasource": "-- Grafana --",
  7. "enable": true,
  8. "hide": true,
  9. "iconColor": "rgba(0, 211, 255, 1)",
  10. "name": "Annotations & Alerts",
  11. "type": "dashboard"
  12. }
  13. ]
  14. },
  15. "editable": true,
  16. "gnetId": null,
  17. "graphTooltip": 0,
  18. "id": 36,
  19. "iteration": 1653032675292,
  20. "links": [],
  21. "panels": [
  22. {
  23. "collapsed": true,
  24. "datasource": null,
  25. "gridPos": {
  26. "h": 1,
  27. "w": 24,
  28. "x": 0,
  29. "y": 0
  30. },
  31. "id": 80,
  32. "panels": [
  33. {
  34. "cacheTimeout": null,
  35. "colorBackground": false,
  36. "colorValue": false,
  37. "colors": [
  38. "#299c46",
  39. "rgba(237, 129, 40, 0.89)",
  40. "#d44a3a"
  41. ],
  42. "datasource": "Prometheus-hw-cce-prod",
  43. "fieldConfig": {
  44. "defaults": {
  45. "custom": {}
  46. },
  47. "overrides": []
  48. },
  49. "format": "none",
  50. "gauge": {
  51. "maxValue": 100,
  52. "minValue": 0,
  53. "show": false,
  54. "thresholdLabels": false,
  55. "thresholdMarkers": true
  56. },
  57. "gridPos": {
  58. "h": 3,
  59. "w": 3,
  60. "x": 0,
  61. "y": 1
  62. },
  63. "id": 89,
  64. "interval": null,
  65. "links": [],
  66. "mappingType": 1,
  67. "mappingTypes": [
  68. {
  69. "name": "value to text",
  70. "value": 1
  71. },
  72. {
  73. "name": "range to text",
  74. "value": 2
  75. }
  76. ],
  77. "maxDataPoints": 100,
  78. "nullPointMode": "connected",
  79. "nullText": null,
  80. "postfix": "",
  81. "postfixFontSize": "50%",
  82. "prefix": "",
  83. "prefixFontSize": "50%",
  84. "rangeMaps": [
  85. {
  86. "from": "null",
  87. "text": "N/A",
  88. "to": "null"
  89. }
  90. ],
  91. "sparkline": {
  92. "fillColor": "rgba(31, 118, 189, 0.18)",
  93. "full": false,
  94. "lineColor": "rgb(31, 120, 193)",
  95. "show": false
  96. },
  97. "tableColumn": "",
  98. "targets": [
  99. {
  100. "expr": "count(nacos_monitor{name=\"configCount\"})",
  101. "format": "time_series",
  102. "intervalFactor": 1,
  103. "refId": "A"
  104. }
  105. ],
  106. "thresholds": "",
  107. "title": "UP",
  108. "type": "singlestat",
  109. "valueFontSize": "80%",
  110. "valueMaps": [
  111. {
  112. "op": "=",
  113. "text": "N/A",
  114. "value": "null"
  115. }
  116. ],
  117. "valueName": "current"
  118. },
  119. {
  120. "cacheTimeout": null,
  121. "colorBackground": false,
  122. "colorValue": false,
  123. "colors": [
  124. "#299c46",
  125. "rgba(237, 129, 40, 0.89)",
  126. "#d44a3a"
  127. ],
  128. "datasource": "Prometheus-hw-cce-prod",
  129. "fieldConfig": {
  130. "defaults": {
  131. "custom": {}
  132. },
  133. "overrides": []
  134. },
  135. "format": "none",
  136. "gauge": {
  137. "maxValue": 100,
  138. "minValue": 0,
  139. "show": false,
  140. "thresholdLabels": false,
  141. "thresholdMarkers": true
  142. },
  143. "gridPos": {
  144. "h": 3,
  145. "w": 3,
  146. "x": 3,
  147. "y": 1
  148. },
  149. "id": 90,
  150. "interval": null,
  151. "links": [],
  152. "mappingType": 1,
  153. "mappingTypes": [
  154. {
  155. "name": "value to text",
  156. "value": 1
  157. },
  158. {
  159. "name": "range to text",
  160. "value": 2
  161. }
  162. ],
  163. "maxDataPoints": 100,
  164. "nullPointMode": "connected",
  165. "nullText": null,
  166. "postfix": "",
  167. "postfixFontSize": "50%",
  168. "prefix": "",
  169. "prefixFontSize": "50%",
  170. "rangeMaps": [
  171. {
  172. "from": "null",
  173. "text": "N/A",
  174. "to": "null"
  175. }
  176. ],
  177. "sparkline": {
  178. "fillColor": "rgba(31, 118, 189, 0.18)",
  179. "full": false,
  180. "lineColor": "rgb(31, 120, 193)",
  181. "show": false
  182. },
  183. "tableColumn": "",
  184. "targets": [
  185. {
  186. "expr": "max(nacos_monitor{name='serviceCount'})",
  187. "format": "time_series",
  188. "interval": "",
  189. "intervalFactor": 1,
  190. "legendFormat": "",
  191. "refId": "A"
  192. }
  193. ],
  194. "thresholds": "",
  195. "title": "service count",
  196. "type": "singlestat",
  197. "valueFontSize": "80%",
  198. "valueMaps": [
  199. {
  200. "op": "=",
  201. "text": "N/A",
  202. "value": "null"
  203. }
  204. ],
  205. "valueName": "current"
  206. },
  207. {
  208. "cacheTimeout": null,
  209. "colorBackground": false,
  210. "colorValue": false,
  211. "colors": [
  212. "#299c46",
  213. "rgba(237, 129, 40, 0.89)",
  214. "#d44a3a"
  215. ],
  216. "datasource": "Prometheus-hw-cce-prod",
  217. "fieldConfig": {
  218. "defaults": {
  219. "custom": {}
  220. },
  221. "overrides": []
  222. },
  223. "format": "none",
  224. "gauge": {
  225. "maxValue": 100,
  226. "minValue": 0,
  227. "show": false,
  228. "thresholdLabels": false,
  229. "thresholdMarkers": true
  230. },
  231. "gridPos": {
  232. "h": 3,
  233. "w": 3,
  234. "x": 6,
  235. "y": 1
  236. },
  237. "id": 93,
  238. "interval": null,
  239. "links": [],
  240. "mappingType": 1,
  241. "mappingTypes": [
  242. {
  243. "name": "value to text",
  244. "value": 1
  245. },
  246. {
  247. "name": "range to text",
  248. "value": 2
  249. }
  250. ],
  251. "maxDataPoints": 100,
  252. "nullPointMode": "connected",
  253. "nullText": null,
  254. "postfix": "",
  255. "postfixFontSize": "50%",
  256. "prefix": "",
  257. "prefixFontSize": "50%",
  258. "rangeMaps": [
  259. {
  260. "from": "null",
  261. "text": "N/A",
  262. "to": "null"
  263. }
  264. ],
  265. "sparkline": {
  266. "fillColor": "rgba(31, 118, 189, 0.18)",
  267. "full": false,
  268. "lineColor": "rgb(31, 120, 193)",
  269. "show": false
  270. },
  271. "tableColumn": "",
  272. "targets": [
  273. {
  274. "expr": "max(nacos_monitor{name='ipCount'})",
  275. "format": "time_series",
  276. "intervalFactor": 1,
  277. "refId": "A"
  278. }
  279. ],
  280. "thresholds": "",
  281. "title": "ip count",
  282. "type": "singlestat",
  283. "valueFontSize": "80%",
  284. "valueMaps": [
  285. {
  286. "op": "=",
  287. "text": "N/A",
  288. "value": "null"
  289. }
  290. ],
  291. "valueName": "current"
  292. },
  293. {
  294. "cacheTimeout": null,
  295. "colorBackground": false,
  296. "colorValue": false,
  297. "colors": [
  298. "#299c46",
  299. "rgba(237, 129, 40, 0.89)",
  300. "#d44a3a"
  301. ],
  302. "datasource": "Prometheus-hw-cce-prod",
  303. "fieldConfig": {
  304. "defaults": {
  305. "custom": {}
  306. },
  307. "overrides": []
  308. },
  309. "format": "none",
  310. "gauge": {
  311. "maxValue": 100,
  312. "minValue": 0,
  313. "show": false,
  314. "thresholdLabels": false,
  315. "thresholdMarkers": true
  316. },
  317. "gridPos": {
  318. "h": 3,
  319. "w": 3,
  320. "x": 9,
  321. "y": 1
  322. },
  323. "id": 92,
  324. "interval": null,
  325. "links": [],
  326. "mappingType": 1,
  327. "mappingTypes": [
  328. {
  329. "name": "value to text",
  330. "value": 1
  331. },
  332. {
  333. "name": "range to text",
  334. "value": 2
  335. }
  336. ],
  337. "maxDataPoints": 100,
  338. "nullPointMode": "connected",
  339. "nullText": null,
  340. "postfix": "",
  341. "postfixFontSize": "50%",
  342. "prefix": "",
  343. "prefixFontSize": "50%",
  344. "rangeMaps": [
  345. {
  346. "from": "null",
  347. "text": "N/A",
  348. "to": "null"
  349. }
  350. ],
  351. "sparkline": {
  352. "fillColor": "rgba(31, 118, 189, 0.18)",
  353. "full": false,
  354. "lineColor": "rgb(31, 120, 193)",
  355. "show": false
  356. },
  357. "tableColumn": "",
  358. "targets": [
  359. {
  360. "expr": "max(nacos_monitor{name='configCount', instance=~'$instance'})",
  361. "format": "time_series",
  362. "intervalFactor": 1,
  363. "refId": "A"
  364. }
  365. ],
  366. "thresholds": "",
  367. "title": "config count",
  368. "type": "singlestat",
  369. "valueFontSize": "80%",
  370. "valueMaps": [
  371. {
  372. "op": "=",
  373. "text": "N/A",
  374. "value": "null"
  375. }
  376. ],
  377. "valueName": "current"
  378. },
  379. {
  380. "cacheTimeout": null,
  381. "colorBackground": false,
  382. "colorValue": false,
  383. "colors": [
  384. "#299c46",
  385. "rgba(237, 129, 40, 0.89)",
  386. "#d44a3a"
  387. ],
  388. "datasource": "Prometheus-hw-cce-prod",
  389. "fieldConfig": {
  390. "defaults": {
  391. "custom": {}
  392. },
  393. "overrides": []
  394. },
  395. "format": "none",
  396. "gauge": {
  397. "maxValue": 100,
  398. "minValue": 0,
  399. "show": false,
  400. "thresholdLabels": false,
  401. "thresholdMarkers": true
  402. },
  403. "gridPos": {
  404. "h": 3,
  405. "w": 3,
  406. "x": 12,
  407. "y": 1
  408. },
  409. "id": 91,
  410. "interval": null,
  411. "links": [],
  412. "mappingType": 1,
  413. "mappingTypes": [
  414. {
  415. "name": "value to text",
  416. "value": 1
  417. },
  418. {
  419. "name": "range to text",
  420. "value": 2
  421. }
  422. ],
  423. "maxDataPoints": 100,
  424. "nullPointMode": "connected",
  425. "nullText": null,
  426. "postfix": "",
  427. "postfixFontSize": "50%",
  428. "prefix": "",
  429. "prefixFontSize": "50%",
  430. "rangeMaps": [
  431. {
  432. "from": "null",
  433. "text": "N/A",
  434. "to": "null"
  435. }
  436. ],
  437. "sparkline": {
  438. "fillColor": "rgba(31, 118, 189, 0.18)",
  439. "full": false,
  440. "lineColor": "rgb(31, 120, 193)",
  441. "show": false
  442. },
  443. "tableColumn": "",
  444. "targets": [
  445. {
  446. "expr": "sum(nacos_monitor{name='longPolling'})",
  447. "format": "time_series",
  448. "intervalFactor": 1,
  449. "refId": "A"
  450. }
  451. ],
  452. "thresholds": "",
  453. "title": "long polling",
  454. "type": "singlestat",
  455. "valueFontSize": "80%",
  456. "valueMaps": [
  457. {
  458. "op": "=",
  459. "text": "N/A",
  460. "value": "null"
  461. }
  462. ],
  463. "valueName": "current"
  464. },
  465. {
  466. "cacheTimeout": null,
  467. "colorBackground": false,
  468. "colorValue": false,
  469. "colors": [
  470. "#299c46",
  471. "rgba(237, 129, 40, 0.89)",
  472. "#d44a3a"
  473. ],
  474. "datasource": "Prometheus-hw-cce-prod",
  475. "fieldConfig": {
  476. "defaults": {
  477. "custom": {}
  478. },
  479. "overrides": []
  480. },
  481. "format": "none",
  482. "gauge": {
  483. "maxValue": 100,
  484. "minValue": 0,
  485. "show": false,
  486. "thresholdLabels": false,
  487. "thresholdMarkers": true
  488. },
  489. "gridPos": {
  490. "h": 3,
  491. "w": 3,
  492. "x": 15,
  493. "y": 1
  494. },
  495. "id": 88,
  496. "interval": null,
  497. "links": [],
  498. "mappingType": 1,
  499. "mappingTypes": [
  500. {
  501. "name": "value to text",
  502. "value": 1
  503. },
  504. {
  505. "name": "range to text",
  506. "value": 2
  507. }
  508. ],
  509. "maxDataPoints": 100,
  510. "nullPointMode": "connected",
  511. "nullText": null,
  512. "postfix": "",
  513. "postfixFontSize": "50%",
  514. "prefix": "",
  515. "prefixFontSize": "50%",
  516. "rangeMaps": [
  517. {
  518. "from": "null",
  519. "text": "N/A",
  520. "to": "null"
  521. }
  522. ],
  523. "sparkline": {
  524. "fillColor": "rgba(31, 118, 189, 0.18)",
  525. "full": false,
  526. "lineColor": "rgb(31, 120, 193)",
  527. "show": false
  528. },
  529. "tableColumn": "",
  530. "targets": [
  531. {
  532. "expr": "sum(nacos_monitor{name='getConfig', instance=~'$instance'}) by (name)",
  533. "format": "time_series",
  534. "intervalFactor": 1,
  535. "refId": "A"
  536. }
  537. ],
  538. "thresholds": "",
  539. "title": "config push total",
  540. "type": "singlestat",
  541. "valueFontSize": "80%",
  542. "valueMaps": [
  543. {
  544. "op": "=",
  545. "text": "N/A",
  546. "value": "null"
  547. }
  548. ],
  549. "valueName": "current"
  550. },
  551. {
  552. "datasource": null,
  553. "fieldConfig": {
  554. "defaults": {
  555. "custom": {}
  556. },
  557. "overrides": []
  558. },
  559. "gridPos": {
  560. "h": 3,
  561. "w": 6,
  562. "x": 18,
  563. "y": 1
  564. },
  565. "id": 82,
  566. "links": [],
  567. "options": {
  568. "content": "<a href=\"https://nacos.io\">\n<img src=\"https://nacos.io/img/nacos.png\" style=\"height: 50px;\" >\n</a>",
  569. "mode": "html"
  570. },
  571. "pluginVersion": "7.4.3",
  572. "title": "",
  573. "type": "text"
  574. },
  575. {
  576. "cacheTimeout": null,
  577. "colorBackground": false,
  578. "colorPrefix": false,
  579. "colorValue": false,
  580. "colors": [
  581. "#299c46",
  582. "rgba(237, 129, 40, 0.89)",
  583. "#d44a3a"
  584. ],
  585. "datasource": "Prometheus-hw-cce-prod",
  586. "decimals": null,
  587. "fieldConfig": {
  588. "defaults": {
  589. "custom": {}
  590. },
  591. "overrides": []
  592. },
  593. "format": "none",
  594. "gauge": {
  595. "maxValue": 100,
  596. "minValue": 0,
  597. "show": true,
  598. "thresholdLabels": false,
  599. "thresholdMarkers": true
  600. },
  601. "gridPos": {
  602. "h": 4,
  603. "w": 9,
  604. "x": 0,
  605. "y": 4
  606. },
  607. "id": 33,
  608. "interval": "",
  609. "links": [],
  610. "mappingType": 1,
  611. "mappingTypes": [
  612. {
  613. "name": "value to text",
  614. "value": 1
  615. },
  616. {
  617. "name": "range to text",
  618. "value": 2
  619. }
  620. ],
  621. "maxDataPoints": 100,
  622. "nullPointMode": "connected",
  623. "nullText": null,
  624. "postfix": "%",
  625. "postfixFontSize": "50%",
  626. "prefix": "",
  627. "prefixFontSize": "50%",
  628. "rangeMaps": [
  629. {
  630. "from": "null",
  631. "text": "N/A",
  632. "to": "null"
  633. }
  634. ],
  635. "repeat": null,
  636. "repeatDirection": "h",
  637. "sparkline": {
  638. "fillColor": "rgba(31, 118, 189, 0.18)",
  639. "full": false,
  640. "lineColor": "rgb(31, 120, 193)",
  641. "show": false
  642. },
  643. "tableColumn": "",
  644. "targets": [
  645. {
  646. "expr": "max(system_cpu_usage{instance=~'$instance'}) * 100",
  647. "format": "time_series",
  648. "interval": "",
  649. "intervalFactor": 1,
  650. "legendFormat": "",
  651. "refId": "A"
  652. }
  653. ],
  654. "thresholds": "50,80",
  655. "title": "cpu",
  656. "type": "singlestat",
  657. "valueFontSize": "70%",
  658. "valueMaps": [
  659. {
  660. "op": "=",
  661. "text": "N/A",
  662. "value": "null"
  663. }
  664. ],
  665. "valueName": "current"
  666. },
  667. {
  668. "cacheTimeout": null,
  669. "colorBackground": false,
  670. "colorPrefix": false,
  671. "colorValue": false,
  672. "colors": [
  673. "#299c46",
  674. "rgba(237, 129, 40, 0.89)",
  675. "#d44a3a"
  676. ],
  677. "datasource": "Prometheus-hw-cce-prod",
  678. "decimals": null,
  679. "fieldConfig": {
  680. "defaults": {
  681. "custom": {}
  682. },
  683. "overrides": []
  684. },
  685. "format": "none",
  686. "gauge": {
  687. "maxValue": 70,
  688. "minValue": 0,
  689. "show": true,
  690. "thresholdLabels": false,
  691. "thresholdMarkers": true
  692. },
  693. "gridPos": {
  694. "h": 4,
  695. "w": 9,
  696. "x": 9,
  697. "y": 4
  698. },
  699. "id": 32,
  700. "interval": null,
  701. "links": [],
  702. "mappingType": 1,
  703. "mappingTypes": [
  704. {
  705. "name": "value to text",
  706. "value": 1
  707. },
  708. {
  709. "name": "range to text",
  710. "value": 2
  711. }
  712. ],
  713. "maxDataPoints": 100,
  714. "nullPointMode": "connected",
  715. "nullText": null,
  716. "postfix": "%",
  717. "postfixFontSize": "50%",
  718. "prefix": "",
  719. "prefixFontSize": "50%",
  720. "rangeMaps": [
  721. {
  722. "from": "null",
  723. "text": "N/A",
  724. "to": "null"
  725. }
  726. ],
  727. "sparkline": {
  728. "fillColor": "rgba(31, 118, 189, 0.18)",
  729. "full": false,
  730. "lineColor": "rgb(31, 120, 193)",
  731. "show": false
  732. },
  733. "tableColumn": "",
  734. "targets": [
  735. {
  736. "expr": "sum(jvm_memory_used_bytes{area=\"heap\", instance=~'$instance'})/sum(jvm_memory_max_bytes{area=\"heap\", instance=~'$instance'}) * 100",
  737. "format": "time_series",
  738. "intervalFactor": 1,
  739. "refId": "A"
  740. }
  741. ],
  742. "thresholds": "50,70",
  743. "title": "memory",
  744. "type": "singlestat",
  745. "valueFontSize": "70%",
  746. "valueMaps": [
  747. {
  748. "op": "=",
  749. "text": "N/A",
  750. "value": "null"
  751. }
  752. ],
  753. "valueName": "current"
  754. },
  755. {
  756. "dashboardFilter": "",
  757. "dashboardTags": [],
  758. "datasource": null,
  759. "fieldConfig": {
  760. "defaults": {
  761. "custom": {}
  762. },
  763. "overrides": []
  764. },
  765. "folderId": null,
  766. "gridPos": {
  767. "h": 16,
  768. "w": 6,
  769. "x": 18,
  770. "y": 4
  771. },
  772. "id": 48,
  773. "limit": 10,
  774. "links": [],
  775. "nameFilter": "",
  776. "onlyAlertsOnDashboard": false,
  777. "repeat": null,
  778. "show": "current",
  779. "sortOrder": 1,
  780. "stateFilter": [],
  781. "title": "alert list",
  782. "type": "alertlist"
  783. },
  784. {
  785. "cacheTimeout": null,
  786. "colorBackground": false,
  787. "colorPrefix": false,
  788. "colorValue": false,
  789. "colors": [
  790. "#299c46",
  791. "rgba(237, 129, 40, 0.89)",
  792. "#d44a3a"
  793. ],
  794. "datasource": "Prometheus-hw-cce-prod",
  795. "decimals": null,
  796. "fieldConfig": {
  797. "defaults": {
  798. "custom": {}
  799. },
  800. "overrides": []
  801. },
  802. "format": "none",
  803. "gauge": {
  804. "maxValue": 1500,
  805. "minValue": 0,
  806. "show": true,
  807. "thresholdLabels": false,
  808. "thresholdMarkers": true
  809. },
  810. "gridPos": {
  811. "h": 4,
  812. "w": 9,
  813. "x": 0,
  814. "y": 8
  815. },
  816. "id": 29,
  817. "interval": null,
  818. "links": [],
  819. "mappingType": 1,
  820. "mappingTypes": [
  821. {
  822. "name": "value to text",
  823. "value": 1
  824. },
  825. {
  826. "name": "range to text",
  827. "value": 2
  828. }
  829. ],
  830. "maxDataPoints": 100,
  831. "nullPointMode": "connected",
  832. "nullText": null,
  833. "postfix": "",
  834. "postfixFontSize": "50%",
  835. "prefix": "",
  836. "prefixFontSize": "50%",
  837. "rangeMaps": [
  838. {
  839. "from": "null",
  840. "text": "N/A",
  841. "to": "null"
  842. }
  843. ],
  844. "sparkline": {
  845. "fillColor": "rgba(31, 118, 189, 0.18)",
  846. "full": false,
  847. "lineColor": "rgb(31, 120, 193)",
  848. "show": false
  849. },
  850. "tableColumn": "",
  851. "targets": [
  852. {
  853. "expr": "max(jvm_threads_daemon_threads{instance=~'$instance'})",
  854. "format": "time_series",
  855. "intervalFactor": 1,
  856. "refId": "A"
  857. }
  858. ],
  859. "thresholds": "800,1500",
  860. "title": "threads",
  861. "type": "singlestat",
  862. "valueFontSize": "70%",
  863. "valueMaps": [
  864. {
  865. "op": "=",
  866. "text": "N/A",
  867. "value": "null"
  868. }
  869. ],
  870. "valueName": "current"
  871. },
  872. {
  873. "cacheTimeout": null,
  874. "colorBackground": false,
  875. "colorPrefix": false,
  876. "colorValue": false,
  877. "colors": [
  878. "#299c46",
  879. "rgba(237, 129, 40, 0.89)",
  880. "#d44a3a"
  881. ],
  882. "datasource": "Prometheus-hw-cce-prod",
  883. "decimals": null,
  884. "fieldConfig": {
  885. "defaults": {
  886. "custom": {}
  887. },
  888. "overrides": []
  889. },
  890. "format": "none",
  891. "gauge": {
  892. "maxValue": 20,
  893. "minValue": 0,
  894. "show": true,
  895. "thresholdLabels": false,
  896. "thresholdMarkers": true
  897. },
  898. "gridPos": {
  899. "h": 4,
  900. "w": 9,
  901. "x": 9,
  902. "y": 8
  903. },
  904. "id": 30,
  905. "interval": null,
  906. "links": [],
  907. "mappingType": 1,
  908. "mappingTypes": [
  909. {
  910. "name": "value to text",
  911. "value": 1
  912. },
  913. {
  914. "name": "range to text",
  915. "value": 2
  916. }
  917. ],
  918. "maxDataPoints": 100,
  919. "nullPointMode": "connected",
  920. "nullText": null,
  921. "postfix": "",
  922. "postfixFontSize": "50%",
  923. "prefix": "",
  924. "prefixFontSize": "50%",
  925. "rangeMaps": [
  926. {
  927. "from": "null",
  928. "text": "N/A",
  929. "to": "null"
  930. }
  931. ],
  932. "sparkline": {
  933. "fillColor": "rgba(31, 118, 189, 0.18)",
  934. "full": false,
  935. "lineColor": "rgb(31, 120, 193)",
  936. "show": false
  937. },
  938. "tableColumn": "",
  939. "targets": [
  940. {
  941. "expr": "max(system_load_average_1m{instance=~'$instance'})",
  942. "format": "time_series",
  943. "intervalFactor": 1,
  944. "refId": "A"
  945. }
  946. ],
  947. "thresholds": "5,10",
  948. "title": "load",
  949. "type": "singlestat",
  950. "valueFontSize": "70%",
  951. "valueMaps": [
  952. {
  953. "op": "=",
  954. "text": "N/A",
  955. "value": "null"
  956. }
  957. ],
  958. "valueName": "current"
  959. },
  960. {
  961. "cacheTimeout": null,
  962. "colorBackground": false,
  963. "colorPrefix": false,
  964. "colorValue": false,
  965. "colors": [
  966. "#299c46",
  967. "rgba(237, 129, 40, 0.89)",
  968. "#d44a3a"
  969. ],
  970. "datasource": "Prometheus-hw-cce-prod",
  971. "decimals": null,
  972. "fieldConfig": {
  973. "defaults": {
  974. "custom": {}
  975. },
  976. "overrides": []
  977. },
  978. "format": "none",
  979. "gauge": {
  980. "maxValue": 5000,
  981. "minValue": 0,
  982. "show": true,
  983. "thresholdLabels": false,
  984. "thresholdMarkers": true
  985. },
  986. "gridPos": {
  987. "h": 4,
  988. "w": 9,
  989. "x": 0,
  990. "y": 12
  991. },
  992. "id": 61,
  993. "interval": null,
  994. "links": [],
  995. "mappingType": 1,
  996. "mappingTypes": [
  997. {
  998. "name": "value to text",
  999. "value": 1
  1000. },
  1001. {
  1002. "name": "range to text",
  1003. "value": 2
  1004. }
  1005. ],
  1006. "maxDataPoints": 100,
  1007. "nullPointMode": "connected",
  1008. "nullText": null,
  1009. "postfix": "ms",
  1010. "postfixFontSize": "50%",
  1011. "prefix": "",
  1012. "prefixFontSize": "50%",
  1013. "rangeMaps": [
  1014. {
  1015. "from": "null",
  1016. "text": "N/A",
  1017. "to": "null"
  1018. }
  1019. ],
  1020. "sparkline": {
  1021. "fillColor": "rgba(31, 118, 189, 0.18)",
  1022. "full": false,
  1023. "lineColor": "rgb(31, 120, 193)",
  1024. "show": false
  1025. },
  1026. "tableColumn": "",
  1027. "targets": [
  1028. {
  1029. "expr": "sum(rate(nacos_timer_seconds_sum{instance=~'$instance'}[1m]))/sum(rate(nacos_timer_seconds_count{instance=~'$instance'}[1m])) * 1000",
  1030. "format": "time_series",
  1031. "intervalFactor": 1,
  1032. "legendFormat": "",
  1033. "refId": "A"
  1034. }
  1035. ],
  1036. "thresholds": "3000,5000",
  1037. "title": "notify rt",
  1038. "type": "singlestat",
  1039. "valueFontSize": "80%",
  1040. "valueMaps": [
  1041. {
  1042. "op": "=",
  1043. "text": "N/A",
  1044. "value": "null"
  1045. }
  1046. ],
  1047. "valueName": "current"
  1048. },
  1049. {
  1050. "cacheTimeout": null,
  1051. "colorBackground": false,
  1052. "colorPrefix": false,
  1053. "colorValue": false,
  1054. "colors": [
  1055. "#299c46",
  1056. "rgba(237, 129, 40, 0.89)",
  1057. "#d44a3a"
  1058. ],
  1059. "datasource": "Prometheus-hw-cce-prod",
  1060. "decimals": null,
  1061. "fieldConfig": {
  1062. "defaults": {
  1063. "custom": {}
  1064. },
  1065. "overrides": []
  1066. },
  1067. "format": "none",
  1068. "gauge": {
  1069. "maxValue": 5000,
  1070. "minValue": 0,
  1071. "show": true,
  1072. "thresholdLabels": false,
  1073. "thresholdMarkers": true
  1074. },
  1075. "gridPos": {
  1076. "h": 4,
  1077. "w": 9,
  1078. "x": 9,
  1079. "y": 12
  1080. },
  1081. "id": 26,
  1082. "interval": null,
  1083. "links": [],
  1084. "mappingType": 1,
  1085. "mappingTypes": [
  1086. {
  1087. "name": "value to text",
  1088. "value": 1
  1089. },
  1090. {
  1091. "name": "range to text",
  1092. "value": 2
  1093. }
  1094. ],
  1095. "maxDataPoints": 100,
  1096. "nullPointMode": "connected",
  1097. "nullText": null,
  1098. "postfix": "ms",
  1099. "postfixFontSize": "50%",
  1100. "prefix": "",
  1101. "prefixFontSize": "50%",
  1102. "rangeMaps": [
  1103. {
  1104. "from": "null",
  1105. "text": "N/A",
  1106. "to": "null"
  1107. }
  1108. ],
  1109. "sparkline": {
  1110. "fillColor": "rgba(31, 118, 189, 0.18)",
  1111. "full": false,
  1112. "lineColor": "rgb(31, 120, 193)",
  1113. "show": false
  1114. },
  1115. "tableColumn": "",
  1116. "targets": [
  1117. {
  1118. "expr": "sum(rate(http_server_requests_seconds_sum{instance=~'$instance'}[1m]))/sum(rate(http_server_requests_seconds_count{instance=~'$instance'}[1m])) * 1000",
  1119. "format": "time_series",
  1120. "intervalFactor": 1,
  1121. "legendFormat": "",
  1122. "refId": "A"
  1123. }
  1124. ],
  1125. "thresholds": "3000,5000",
  1126. "title": "rt",
  1127. "type": "singlestat",
  1128. "valueFontSize": "80%",
  1129. "valueMaps": [
  1130. {
  1131. "op": "=",
  1132. "text": "N/A",
  1133. "value": "null"
  1134. }
  1135. ],
  1136. "valueName": "current"
  1137. },
  1138. {
  1139. "cacheTimeout": null,
  1140. "colorBackground": false,
  1141. "colorPrefix": false,
  1142. "colorValue": false,
  1143. "colors": [
  1144. "#299c46",
  1145. "rgba(237, 129, 40, 0.89)",
  1146. "#d44a3a"
  1147. ],
  1148. "datasource": "Prometheus-hw-cce-prod",
  1149. "decimals": null,
  1150. "fieldConfig": {
  1151. "defaults": {
  1152. "custom": {}
  1153. },
  1154. "overrides": []
  1155. },
  1156. "format": "none",
  1157. "gauge": {
  1158. "maxValue": 2000,
  1159. "minValue": 0,
  1160. "show": true,
  1161. "thresholdLabels": false,
  1162. "thresholdMarkers": true
  1163. },
  1164. "gridPos": {
  1165. "h": 4,
  1166. "w": 9,
  1167. "x": 0,
  1168. "y": 16
  1169. },
  1170. "id": 25,
  1171. "interval": null,
  1172. "links": [],
  1173. "mappingType": 1,
  1174. "mappingTypes": [
  1175. {
  1176. "name": "value to text",
  1177. "value": 1
  1178. },
  1179. {
  1180. "name": "range to text",
  1181. "value": 2
  1182. }
  1183. ],
  1184. "maxDataPoints": 100,
  1185. "nullPointMode": "connected",
  1186. "nullText": null,
  1187. "postfix": "",
  1188. "postfixFontSize": "50%",
  1189. "prefix": "",
  1190. "prefixFontSize": "50%",
  1191. "rangeMaps": [
  1192. {
  1193. "from": "null",
  1194. "text": "N/A",
  1195. "to": "null"
  1196. }
  1197. ],
  1198. "sparkline": {
  1199. "fillColor": "rgba(31, 118, 189, 0.18)",
  1200. "full": false,
  1201. "lineColor": "rgb(31, 120, 193)",
  1202. "show": false
  1203. },
  1204. "tableColumn": "",
  1205. "targets": [
  1206. {
  1207. "expr": "sum(rate(http_server_requests_seconds_count{instance=~'$instance'}[1m]))",
  1208. "format": "time_series",
  1209. "intervalFactor": 1,
  1210. "legendFormat": "",
  1211. "refId": "A"
  1212. }
  1213. ],
  1214. "thresholds": "1000,2000",
  1215. "title": "qps",
  1216. "type": "singlestat",
  1217. "valueFontSize": "70%",
  1218. "valueMaps": [
  1219. {
  1220. "op": "=",
  1221. "text": "N/A",
  1222. "value": "null"
  1223. }
  1224. ],
  1225. "valueName": "current"
  1226. },
  1227. {
  1228. "cacheTimeout": null,
  1229. "colorBackground": false,
  1230. "colorPrefix": false,
  1231. "colorValue": false,
  1232. "colors": [
  1233. "#299c46",
  1234. "rgba(237, 129, 40, 0.89)",
  1235. "#d44a3a"
  1236. ],
  1237. "datasource": "Prometheus-hw-cce-prod",
  1238. "decimals": null,
  1239. "fieldConfig": {
  1240. "defaults": {
  1241. "custom": {}
  1242. },
  1243. "overrides": []
  1244. },
  1245. "format": "none",
  1246. "gauge": {
  1247. "maxValue": 5000,
  1248. "minValue": 0,
  1249. "show": true,
  1250. "thresholdLabels": false,
  1251. "thresholdMarkers": true
  1252. },
  1253. "gridPos": {
  1254. "h": 4,
  1255. "w": 9,
  1256. "x": 9,
  1257. "y": 16
  1258. },
  1259. "id": 70,
  1260. "interval": null,
  1261. "links": [],
  1262. "mappingType": 1,
  1263. "mappingTypes": [
  1264. {
  1265. "name": "value to text",
  1266. "value": 1
  1267. },
  1268. {
  1269. "name": "range to text",
  1270. "value": 2
  1271. }
  1272. ],
  1273. "maxDataPoints": 100,
  1274. "nullPointMode": "connected",
  1275. "nullText": null,
  1276. "postfix": "ms",
  1277. "postfixFontSize": "50%",
  1278. "prefix": "",
  1279. "prefixFontSize": "50%",
  1280. "rangeMaps": [
  1281. {
  1282. "from": "null",
  1283. "text": "N/A",
  1284. "to": "null"
  1285. }
  1286. ],
  1287. "sparkline": {
  1288. "fillColor": "rgba(31, 118, 189, 0.18)",
  1289. "full": false,
  1290. "lineColor": "rgb(31, 120, 193)",
  1291. "show": false
  1292. },
  1293. "tableColumn": "",
  1294. "targets": [
  1295. {
  1296. "expr": "max(nacos_monitor{name='avgPushCost', instance=~'$instance'})",
  1297. "format": "time_series",
  1298. "intervalFactor": 1,
  1299. "refId": "A"
  1300. }
  1301. ],
  1302. "thresholds": "1000,5000",
  1303. "title": "avgPushCost",
  1304. "type": "singlestat",
  1305. "valueFontSize": "70%",
  1306. "valueMaps": [
  1307. {
  1308. "op": "=",
  1309. "text": "N/A",
  1310. "value": "null"
  1311. }
  1312. ],
  1313. "valueName": "current"
  1314. }
  1315. ],
  1316. "title": "nacos monitor",
  1317. "type": "row"
  1318. },
  1319. {
  1320. "collapsed": true,
  1321. "datasource": null,
  1322. "gridPos": {
  1323. "h": 1,
  1324. "w": 24,
  1325. "x": 0,
  1326. "y": 1
  1327. },
  1328. "id": 78,
  1329. "panels": [
  1330. {
  1331. "aliasColors": {},
  1332. "bars": false,
  1333. "dashLength": 10,
  1334. "dashes": false,
  1335. "datasource": "Prometheus-hw-cce-prod",
  1336. "fieldConfig": {
  1337. "defaults": {
  1338. "custom": {}
  1339. },
  1340. "overrides": []
  1341. },
  1342. "fill": 1,
  1343. "fillGradient": 0,
  1344. "gridPos": {
  1345. "h": 5,
  1346. "w": 8,
  1347. "x": 0,
  1348. "y": 2
  1349. },
  1350. "hiddenSeries": false,
  1351. "id": 20,
  1352. "legend": {
  1353. "avg": false,
  1354. "current": false,
  1355. "max": false,
  1356. "min": false,
  1357. "show": true,
  1358. "total": false,
  1359. "values": false
  1360. },
  1361. "lines": true,
  1362. "linewidth": 1,
  1363. "links": [],
  1364. "nullPointMode": "null",
  1365. "options": {
  1366. "alertThreshold": true
  1367. },
  1368. "percentage": false,
  1369. "pluginVersion": "7.4.3",
  1370. "pointradius": 5,
  1371. "points": false,
  1372. "renderer": "flot",
  1373. "seriesOverrides": [],
  1374. "spaceLength": 10,
  1375. "stack": false,
  1376. "steppedLine": false,
  1377. "targets": [
  1378. {
  1379. "expr": "sum(rate(http_server_requests_seconds_sum{uri=~'/v1/cs/configs|/nacos/v1/ns/instance|/nacos/v1/ns/health', instance=~'$instance'}[1m])/rate(http_server_requests_seconds_count{uri=~'/v1/cs/configs|/nacos/v1/ns/instance|/nacos/v1/ns/health', instance=~'$instance'}[1m])) by (method,uri) * 1000",
  1380. "format": "time_series",
  1381. "intervalFactor": 1,
  1382. "refId": "A"
  1383. },
  1384. {
  1385. "expr": "sum(rate(http_server_requests_seconds_sum{instance=~'$instance'}[1m]))/sum(rate(http_server_requests_seconds_count{instance=~'$instance'}[1m])) * 1000",
  1386. "format": "time_series",
  1387. "hide": false,
  1388. "intervalFactor": 1,
  1389. "legendFormat": "all",
  1390. "refId": "B"
  1391. }
  1392. ],
  1393. "thresholds": [],
  1394. "timeFrom": null,
  1395. "timeRegions": [],
  1396. "timeShift": null,
  1397. "title": "rt",
  1398. "tooltip": {
  1399. "shared": true,
  1400. "sort": 0,
  1401. "value_type": "individual"
  1402. },
  1403. "type": "graph",
  1404. "xaxis": {
  1405. "buckets": null,
  1406. "mode": "time",
  1407. "name": null,
  1408. "show": true,
  1409. "values": []
  1410. },
  1411. "yaxes": [
  1412. {
  1413. "format": "short",
  1414. "label": null,
  1415. "logBase": 1,
  1416. "max": null,
  1417. "min": null,
  1418. "show": true
  1419. },
  1420. {
  1421. "format": "short",
  1422. "label": null,
  1423. "logBase": 1,
  1424. "max": null,
  1425. "min": null,
  1426. "show": true
  1427. }
  1428. ],
  1429. "yaxis": {
  1430. "align": false,
  1431. "alignLevel": null
  1432. }
  1433. },
  1434. {
  1435. "aliasColors": {},
  1436. "bars": false,
  1437. "dashLength": 10,
  1438. "dashes": false,
  1439. "datasource": "Prometheus-hw-cce-prod",
  1440. "fieldConfig": {
  1441. "defaults": {
  1442. "custom": {}
  1443. },
  1444. "overrides": []
  1445. },
  1446. "fill": 1,
  1447. "fillGradient": 0,
  1448. "gridPos": {
  1449. "h": 5,
  1450. "w": 8,
  1451. "x": 8,
  1452. "y": 2
  1453. },
  1454. "hiddenSeries": false,
  1455. "id": 41,
  1456. "legend": {
  1457. "avg": false,
  1458. "current": false,
  1459. "max": false,
  1460. "min": false,
  1461. "show": true,
  1462. "total": false,
  1463. "values": false
  1464. },
  1465. "lines": true,
  1466. "linewidth": 1,
  1467. "links": [],
  1468. "nullPointMode": "null",
  1469. "options": {
  1470. "alertThreshold": true
  1471. },
  1472. "percentage": false,
  1473. "pluginVersion": "7.4.3",
  1474. "pointradius": 5,
  1475. "points": false,
  1476. "renderer": "flot",
  1477. "repeat": "group",
  1478. "repeatDirection": "h",
  1479. "seriesOverrides": [],
  1480. "spaceLength": 10,
  1481. "stack": false,
  1482. "steppedLine": false,
  1483. "targets": [
  1484. {
  1485. "expr": "sum(nacos_monitor{name='longPolling', instance=~'$instance'})",
  1486. "format": "time_series",
  1487. "intervalFactor": 1,
  1488. "legendFormat": "",
  1489. "refId": "A"
  1490. }
  1491. ],
  1492. "thresholds": [],
  1493. "timeFrom": null,
  1494. "timeRegions": [],
  1495. "timeShift": null,
  1496. "title": "long polling",
  1497. "tooltip": {
  1498. "shared": true,
  1499. "sort": 0,
  1500. "value_type": "individual"
  1501. },
  1502. "type": "graph",
  1503. "xaxis": {
  1504. "buckets": null,
  1505. "mode": "time",
  1506. "name": null,
  1507. "show": true,
  1508. "values": []
  1509. },
  1510. "yaxes": [
  1511. {
  1512. "format": "short",
  1513. "label": "",
  1514. "logBase": 1,
  1515. "max": null,
  1516. "min": null,
  1517. "show": true
  1518. },
  1519. {
  1520. "format": "short",
  1521. "label": null,
  1522. "logBase": 1,
  1523. "max": null,
  1524. "min": null,
  1525. "show": true
  1526. }
  1527. ],
  1528. "yaxis": {
  1529. "align": false,
  1530. "alignLevel": null
  1531. }
  1532. },
  1533. {
  1534. "aliasColors": {},
  1535. "bars": false,
  1536. "dashLength": 10,
  1537. "dashes": false,
  1538. "datasource": "Prometheus-hw-cce-prod",
  1539. "fieldConfig": {
  1540. "defaults": {
  1541. "custom": {}
  1542. },
  1543. "overrides": []
  1544. },
  1545. "fill": 1,
  1546. "fillGradient": 0,
  1547. "gridPos": {
  1548. "h": 5,
  1549. "w": 8,
  1550. "x": 16,
  1551. "y": 2
  1552. },
  1553. "hiddenSeries": false,
  1554. "id": 37,
  1555. "legend": {
  1556. "avg": false,
  1557. "current": false,
  1558. "max": false,
  1559. "min": false,
  1560. "show": true,
  1561. "total": false,
  1562. "values": false
  1563. },
  1564. "lines": true,
  1565. "linewidth": 1,
  1566. "links": [],
  1567. "nullPointMode": "null",
  1568. "options": {
  1569. "alertThreshold": true
  1570. },
  1571. "percentage": false,
  1572. "pluginVersion": "7.4.3",
  1573. "pointradius": 5,
  1574. "points": false,
  1575. "renderer": "flot",
  1576. "seriesOverrides": [],
  1577. "spaceLength": 10,
  1578. "stack": false,
  1579. "steppedLine": false,
  1580. "targets": [
  1581. {
  1582. "expr": "max(system_load_average_1m{instance=~'$instance'})",
  1583. "format": "time_series",
  1584. "intervalFactor": 1,
  1585. "refId": "A"
  1586. }
  1587. ],
  1588. "thresholds": [],
  1589. "timeFrom": null,
  1590. "timeRegions": [],
  1591. "timeShift": null,
  1592. "title": "load 1m",
  1593. "tooltip": {
  1594. "shared": true,
  1595. "sort": 0,
  1596. "value_type": "individual"
  1597. },
  1598. "type": "graph",
  1599. "xaxis": {
  1600. "buckets": null,
  1601. "mode": "time",
  1602. "name": null,
  1603. "show": true,
  1604. "values": []
  1605. },
  1606. "yaxes": [
  1607. {
  1608. "format": "short",
  1609. "label": null,
  1610. "logBase": 1,
  1611. "max": null,
  1612. "min": null,
  1613. "show": true
  1614. },
  1615. {
  1616. "format": "short",
  1617. "label": null,
  1618. "logBase": 1,
  1619. "max": null,
  1620. "min": null,
  1621. "show": true
  1622. }
  1623. ],
  1624. "yaxis": {
  1625. "align": false,
  1626. "alignLevel": null
  1627. }
  1628. },
  1629. {
  1630. "aliasColors": {},
  1631. "bars": false,
  1632. "dashLength": 10,
  1633. "dashes": false,
  1634. "datasource": "Prometheus-hw-cce-prod",
  1635. "fieldConfig": {
  1636. "defaults": {
  1637. "custom": {}
  1638. },
  1639. "overrides": []
  1640. },
  1641. "fill": 1,
  1642. "fillGradient": 0,
  1643. "gridPos": {
  1644. "h": 5,
  1645. "w": 8,
  1646. "x": 0,
  1647. "y": 7
  1648. },
  1649. "hiddenSeries": false,
  1650. "id": 18,
  1651. "legend": {
  1652. "avg": false,
  1653. "current": false,
  1654. "max": false,
  1655. "min": false,
  1656. "show": true,
  1657. "total": false,
  1658. "values": false
  1659. },
  1660. "lines": true,
  1661. "linewidth": 1,
  1662. "links": [],
  1663. "nullPointMode": "null",
  1664. "options": {
  1665. "alertThreshold": true
  1666. },
  1667. "percentage": false,
  1668. "pluginVersion": "7.4.3",
  1669. "pointradius": 5,
  1670. "points": false,
  1671. "renderer": "flot",
  1672. "seriesOverrides": [],
  1673. "spaceLength": 10,
  1674. "stack": false,
  1675. "steppedLine": false,
  1676. "targets": [
  1677. {
  1678. "expr": "sum(rate(http_server_requests_seconds_count{uri=~'/v1/cs/configs|/nacos/v1/ns/instance|/nacos/v1/ns/health', instance=~'$instance'}[1m])) by (method,uri)",
  1679. "format": "time_series",
  1680. "intervalFactor": 1,
  1681. "refId": "A"
  1682. },
  1683. {
  1684. "expr": "sum(rate(http_server_requests_seconds_count{instance=~'$instance'}[1m]))",
  1685. "format": "time_series",
  1686. "intervalFactor": 1,
  1687. "refId": "B"
  1688. }
  1689. ],
  1690. "thresholds": [],
  1691. "timeFrom": null,
  1692. "timeRegions": [],
  1693. "timeShift": null,
  1694. "title": "qps",
  1695. "tooltip": {
  1696. "shared": true,
  1697. "sort": 0,
  1698. "value_type": "individual"
  1699. },
  1700. "type": "graph",
  1701. "xaxis": {
  1702. "buckets": null,
  1703. "mode": "time",
  1704. "name": null,
  1705. "show": true,
  1706. "values": []
  1707. },
  1708. "yaxes": [
  1709. {
  1710. "format": "short",
  1711. "label": null,
  1712. "logBase": 1,
  1713. "max": null,
  1714. "min": null,
  1715. "show": true
  1716. },
  1717. {
  1718. "format": "short",
  1719. "label": null,
  1720. "logBase": 1,
  1721. "max": null,
  1722. "min": null,
  1723. "show": true
  1724. }
  1725. ],
  1726. "yaxis": {
  1727. "align": false,
  1728. "alignLevel": null
  1729. }
  1730. },
  1731. {
  1732. "aliasColors": {},
  1733. "bars": false,
  1734. "dashLength": 10,
  1735. "dashes": false,
  1736. "datasource": "Prometheus-hw-cce-prod",
  1737. "fieldConfig": {
  1738. "defaults": {
  1739. "custom": {}
  1740. },
  1741. "overrides": []
  1742. },
  1743. "fill": 1,
  1744. "fillGradient": 0,
  1745. "gridPos": {
  1746. "h": 5,
  1747. "w": 8,
  1748. "x": 8,
  1749. "y": 7
  1750. },
  1751. "hiddenSeries": false,
  1752. "id": 52,
  1753. "legend": {
  1754. "avg": false,
  1755. "current": false,
  1756. "max": false,
  1757. "min": false,
  1758. "show": true,
  1759. "total": false,
  1760. "values": false
  1761. },
  1762. "lines": true,
  1763. "linewidth": 1,
  1764. "links": [],
  1765. "nullPointMode": "null",
  1766. "options": {
  1767. "alertThreshold": true
  1768. },
  1769. "percentage": false,
  1770. "pluginVersion": "7.4.3",
  1771. "pointradius": 5,
  1772. "points": false,
  1773. "renderer": "flot",
  1774. "seriesOverrides": [],
  1775. "spaceLength": 10,
  1776. "stack": false,
  1777. "steppedLine": false,
  1778. "targets": [
  1779. {
  1780. "expr": "sum(nacos_monitor{name='leaderStatus', instance=~'$instance'})",
  1781. "format": "time_series",
  1782. "intervalFactor": 1,
  1783. "refId": "B"
  1784. }
  1785. ],
  1786. "thresholds": [],
  1787. "timeFrom": null,
  1788. "timeRegions": [],
  1789. "timeShift": null,
  1790. "title": "leaderStatus",
  1791. "tooltip": {
  1792. "shared": true,
  1793. "sort": 0,
  1794. "value_type": "individual"
  1795. },
  1796. "type": "graph",
  1797. "xaxis": {
  1798. "buckets": null,
  1799. "mode": "time",
  1800. "name": null,
  1801. "show": true,
  1802. "values": []
  1803. },
  1804. "yaxes": [
  1805. {
  1806. "format": "short",
  1807. "label": null,
  1808. "logBase": 1,
  1809. "max": null,
  1810. "min": null,
  1811. "show": true
  1812. },
  1813. {
  1814. "format": "short",
  1815. "label": null,
  1816. "logBase": 1,
  1817. "max": null,
  1818. "min": null,
  1819. "show": true
  1820. }
  1821. ],
  1822. "yaxis": {
  1823. "align": false,
  1824. "alignLevel": null
  1825. }
  1826. },
  1827. {
  1828. "aliasColors": {},
  1829. "bars": false,
  1830. "dashLength": 10,
  1831. "dashes": false,
  1832. "datasource": "Prometheus-hw-cce-prod",
  1833. "fieldConfig": {
  1834. "defaults": {
  1835. "custom": {}
  1836. },
  1837. "overrides": []
  1838. },
  1839. "fill": 1,
  1840. "fillGradient": 0,
  1841. "gridPos": {
  1842. "h": 5,
  1843. "w": 8,
  1844. "x": 16,
  1845. "y": 7
  1846. },
  1847. "hiddenSeries": false,
  1848. "id": 50,
  1849. "legend": {
  1850. "avg": false,
  1851. "current": false,
  1852. "max": false,
  1853. "min": false,
  1854. "show": true,
  1855. "total": false,
  1856. "values": false
  1857. },
  1858. "lines": true,
  1859. "linewidth": 1,
  1860. "links": [],
  1861. "nullPointMode": "null",
  1862. "options": {
  1863. "alertThreshold": true
  1864. },
  1865. "percentage": false,
  1866. "pluginVersion": "7.4.3",
  1867. "pointradius": 5,
  1868. "points": false,
  1869. "renderer": "flot",
  1870. "seriesOverrides": [],
  1871. "spaceLength": 10,
  1872. "stack": false,
  1873. "steppedLine": false,
  1874. "targets": [
  1875. {
  1876. "expr": "sum(nacos_monitor{name='avgPushCost', instance=~'$instance'})",
  1877. "format": "time_series",
  1878. "intervalFactor": 1,
  1879. "refId": "A"
  1880. }
  1881. ],
  1882. "thresholds": [],
  1883. "timeFrom": null,
  1884. "timeRegions": [],
  1885. "timeShift": null,
  1886. "title": "avgPushCost",
  1887. "tooltip": {
  1888. "shared": true,
  1889. "sort": 0,
  1890. "value_type": "individual"
  1891. },
  1892. "type": "graph",
  1893. "xaxis": {
  1894. "buckets": null,
  1895. "mode": "time",
  1896. "name": null,
  1897. "show": true,
  1898. "values": []
  1899. },
  1900. "yaxes": [
  1901. {
  1902. "format": "short",
  1903. "label": null,
  1904. "logBase": 1,
  1905. "max": null,
  1906. "min": null,
  1907. "show": true
  1908. },
  1909. {
  1910. "format": "short",
  1911. "label": null,
  1912. "logBase": 1,
  1913. "max": null,
  1914. "min": null,
  1915. "show": true
  1916. }
  1917. ],
  1918. "yaxis": {
  1919. "align": false,
  1920. "alignLevel": null
  1921. }
  1922. },
  1923. {
  1924. "aliasColors": {},
  1925. "bars": false,
  1926. "dashLength": 10,
  1927. "dashes": false,
  1928. "datasource": "Prometheus-hw-cce-prod",
  1929. "fieldConfig": {
  1930. "defaults": {
  1931. "custom": {}
  1932. },
  1933. "overrides": []
  1934. },
  1935. "fill": 1,
  1936. "fillGradient": 0,
  1937. "gridPos": {
  1938. "h": 5,
  1939. "w": 8,
  1940. "x": 0,
  1941. "y": 12
  1942. },
  1943. "hiddenSeries": false,
  1944. "id": 53,
  1945. "legend": {
  1946. "avg": false,
  1947. "current": false,
  1948. "max": false,
  1949. "min": false,
  1950. "show": true,
  1951. "total": false,
  1952. "values": false
  1953. },
  1954. "lines": true,
  1955. "linewidth": 1,
  1956. "links": [],
  1957. "nullPointMode": "null",
  1958. "options": {
  1959. "alertThreshold": true
  1960. },
  1961. "percentage": false,
  1962. "pluginVersion": "7.4.3",
  1963. "pointradius": 5,
  1964. "points": false,
  1965. "renderer": "flot",
  1966. "seriesOverrides": [],
  1967. "spaceLength": 10,
  1968. "stack": false,
  1969. "steppedLine": false,
  1970. "targets": [
  1971. {
  1972. "expr": "max(nacos_monitor{name='maxPushCost', instance=~'$instance'})",
  1973. "format": "time_series",
  1974. "intervalFactor": 1,
  1975. "refId": "A"
  1976. }
  1977. ],
  1978. "thresholds": [],
  1979. "timeFrom": null,
  1980. "timeRegions": [],
  1981. "timeShift": null,
  1982. "title": "maxPushCost",
  1983. "tooltip": {
  1984. "shared": true,
  1985. "sort": 0,
  1986. "value_type": "individual"
  1987. },
  1988. "type": "graph",
  1989. "xaxis": {
  1990. "buckets": null,
  1991. "mode": "time",
  1992. "name": null,
  1993. "show": true,
  1994. "values": []
  1995. },
  1996. "yaxes": [
  1997. {
  1998. "format": "short",
  1999. "label": null,
  2000. "logBase": 1,
  2001. "max": null,
  2002. "min": null,
  2003. "show": true
  2004. },
  2005. {
  2006. "format": "short",
  2007. "label": null,
  2008. "logBase": 1,
  2009. "max": null,
  2010. "min": null,
  2011. "show": true
  2012. }
  2013. ],
  2014. "yaxis": {
  2015. "align": false,
  2016. "alignLevel": null
  2017. }
  2018. },
  2019. {
  2020. "aliasColors": {},
  2021. "bars": false,
  2022. "dashLength": 10,
  2023. "dashes": false,
  2024. "datasource": "Prometheus-hw-cce-prod",
  2025. "fieldConfig": {
  2026. "defaults": {
  2027. "custom": {}
  2028. },
  2029. "overrides": []
  2030. },
  2031. "fill": 1,
  2032. "fillGradient": 0,
  2033. "gridPos": {
  2034. "h": 5,
  2035. "w": 8,
  2036. "x": 8,
  2037. "y": 12
  2038. },
  2039. "hiddenSeries": false,
  2040. "id": 83,
  2041. "legend": {
  2042. "avg": false,
  2043. "current": false,
  2044. "max": false,
  2045. "min": false,
  2046. "show": true,
  2047. "total": false,
  2048. "values": false
  2049. },
  2050. "lines": true,
  2051. "linewidth": 1,
  2052. "links": [],
  2053. "nullPointMode": "null",
  2054. "options": {
  2055. "alertThreshold": true
  2056. },
  2057. "percentage": false,
  2058. "pluginVersion": "7.4.3",
  2059. "pointradius": 5,
  2060. "points": false,
  2061. "renderer": "flot",
  2062. "seriesOverrides": [],
  2063. "spaceLength": 10,
  2064. "stack": false,
  2065. "steppedLine": false,
  2066. "targets": [
  2067. {
  2068. "expr": "sum(nacos_monitor{name='publish', instance=~'$instance'}) by (name)",
  2069. "format": "time_series",
  2070. "intervalFactor": 1,
  2071. "legendFormat": "publish config",
  2072. "refId": "A"
  2073. },
  2074. {
  2075. "expr": "sum(nacos_monitor{name='getConfig', instance=~'$instance'}) by (name)",
  2076. "format": "time_series",
  2077. "intervalFactor": 1,
  2078. "legendFormat": "get config",
  2079. "refId": "B"
  2080. }
  2081. ],
  2082. "thresholds": [],
  2083. "timeFrom": null,
  2084. "timeRegions": [],
  2085. "timeShift": null,
  2086. "title": "config statistics",
  2087. "tooltip": {
  2088. "shared": true,
  2089. "sort": 0,
  2090. "value_type": "individual"
  2091. },
  2092. "type": "graph",
  2093. "xaxis": {
  2094. "buckets": null,
  2095. "mode": "time",
  2096. "name": null,
  2097. "show": true,
  2098. "values": []
  2099. },
  2100. "yaxes": [
  2101. {
  2102. "format": "short",
  2103. "label": null,
  2104. "logBase": 1,
  2105. "max": null,
  2106. "min": null,
  2107. "show": true
  2108. },
  2109. {
  2110. "format": "short",
  2111. "label": null,
  2112. "logBase": 1,
  2113. "max": null,
  2114. "min": null,
  2115. "show": true
  2116. }
  2117. ],
  2118. "yaxis": {
  2119. "align": false,
  2120. "alignLevel": null
  2121. }
  2122. },
  2123. {
  2124. "aliasColors": {},
  2125. "bars": false,
  2126. "dashLength": 10,
  2127. "dashes": false,
  2128. "datasource": "Prometheus-hw-cce-prod",
  2129. "fieldConfig": {
  2130. "defaults": {
  2131. "custom": {}
  2132. },
  2133. "overrides": []
  2134. },
  2135. "fill": 1,
  2136. "fillGradient": 0,
  2137. "gridPos": {
  2138. "h": 5,
  2139. "w": 8,
  2140. "x": 16,
  2141. "y": 12
  2142. },
  2143. "hiddenSeries": false,
  2144. "id": 16,
  2145. "legend": {
  2146. "avg": false,
  2147. "current": false,
  2148. "max": false,
  2149. "min": false,
  2150. "show": true,
  2151. "total": false,
  2152. "values": false
  2153. },
  2154. "lines": true,
  2155. "linewidth": 1,
  2156. "links": [],
  2157. "nullPointMode": "null",
  2158. "options": {
  2159. "alertThreshold": true
  2160. },
  2161. "percentage": false,
  2162. "pluginVersion": "7.4.3",
  2163. "pointradius": 5,
  2164. "points": false,
  2165. "renderer": "flot",
  2166. "seriesOverrides": [],
  2167. "spaceLength": 10,
  2168. "stack": false,
  2169. "steppedLine": false,
  2170. "targets": [
  2171. {
  2172. "expr": "sum(rate(nacos_monitor{name=~'.*HealthCheck', instance=~'$instance'}[1m])) by (name) * 60",
  2173. "format": "time_series",
  2174. "intervalFactor": 1,
  2175. "legendFormat": "",
  2176. "refId": "A"
  2177. }
  2178. ],
  2179. "thresholds": [],
  2180. "timeFrom": null,
  2181. "timeRegions": [],
  2182. "timeShift": null,
  2183. "title": "health check",
  2184. "tooltip": {
  2185. "shared": true,
  2186. "sort": 0,
  2187. "value_type": "individual"
  2188. },
  2189. "type": "graph",
  2190. "xaxis": {
  2191. "buckets": null,
  2192. "mode": "time",
  2193. "name": null,
  2194. "show": true,
  2195. "values": []
  2196. },
  2197. "yaxes": [
  2198. {
  2199. "format": "short",
  2200. "label": null,
  2201. "logBase": 1,
  2202. "max": null,
  2203. "min": null,
  2204. "show": true
  2205. },
  2206. {
  2207. "format": "short",
  2208. "label": null,
  2209. "logBase": 1,
  2210. "max": null,
  2211. "min": null,
  2212. "show": true
  2213. }
  2214. ],
  2215. "yaxis": {
  2216. "align": false,
  2217. "alignLevel": null
  2218. }
  2219. }
  2220. ],
  2221. "title": "nacos detail",
  2222. "type": "row"
  2223. },
  2224. {
  2225. "collapsed": true,
  2226. "datasource": null,
  2227. "gridPos": {
  2228. "h": 1,
  2229. "w": 24,
  2230. "x": 0,
  2231. "y": 2
  2232. },
  2233. "id": 74,
  2234. "panels": [
  2235. {
  2236. "alert": {
  2237. "conditions": [
  2238. {
  2239. "evaluator": {
  2240. "params": [
  2241. 50
  2242. ],
  2243. "type": "gt"
  2244. },
  2245. "operator": {
  2246. "type": "and"
  2247. },
  2248. "query": {
  2249. "params": [
  2250. "A",
  2251. "1m",
  2252. "now"
  2253. ]
  2254. },
  2255. "reducer": {
  2256. "params": [],
  2257. "type": "avg"
  2258. },
  2259. "type": "query"
  2260. }
  2261. ],
  2262. "executionErrorState": "keep_state",
  2263. "for": "1m",
  2264. "frequency": "1m",
  2265. "handler": 1,
  2266. "name": "cpu alert",
  2267. "noDataState": "ok",
  2268. "notifications": [
  2269. {
  2270. "id": 1
  2271. }
  2272. ]
  2273. },
  2274. "aliasColors": {},
  2275. "bars": false,
  2276. "dashLength": 10,
  2277. "dashes": false,
  2278. "datasource": "Prometheus-hw-cce-prod",
  2279. "fieldConfig": {
  2280. "defaults": {
  2281. "custom": {}
  2282. },
  2283. "overrides": []
  2284. },
  2285. "fill": 1,
  2286. "fillGradient": 0,
  2287. "gridPos": {
  2288. "h": 5,
  2289. "w": 8,
  2290. "x": 0,
  2291. "y": 3
  2292. },
  2293. "hiddenSeries": false,
  2294. "id": 45,
  2295. "legend": {
  2296. "avg": false,
  2297. "current": false,
  2298. "max": false,
  2299. "min": false,
  2300. "show": true,
  2301. "total": false,
  2302. "values": false
  2303. },
  2304. "lines": true,
  2305. "linewidth": 1,
  2306. "links": [],
  2307. "nullPointMode": "null",
  2308. "options": {
  2309. "alertThreshold": true
  2310. },
  2311. "percentage": false,
  2312. "pluginVersion": "7.4.3",
  2313. "pointradius": 5,
  2314. "points": false,
  2315. "renderer": "flot",
  2316. "seriesOverrides": [],
  2317. "spaceLength": 10,
  2318. "stack": false,
  2319. "steppedLine": false,
  2320. "targets": [
  2321. {
  2322. "expr": "max(system_cpu_usage) * 100",
  2323. "format": "time_series",
  2324. "intervalFactor": 1,
  2325. "refId": "A"
  2326. }
  2327. ],
  2328. "thresholds": [
  2329. {
  2330. "colorMode": "critical",
  2331. "fill": true,
  2332. "line": true,
  2333. "op": "gt",
  2334. "value": 50,
  2335. "visible": true
  2336. }
  2337. ],
  2338. "timeFrom": null,
  2339. "timeRegions": [],
  2340. "timeShift": null,
  2341. "title": "cpu alert",
  2342. "tooltip": {
  2343. "shared": true,
  2344. "sort": 0,
  2345. "value_type": "individual"
  2346. },
  2347. "type": "graph",
  2348. "xaxis": {
  2349. "buckets": null,
  2350. "mode": "time",
  2351. "name": null,
  2352. "show": true,
  2353. "values": []
  2354. },
  2355. "yaxes": [
  2356. {
  2357. "format": "short",
  2358. "label": null,
  2359. "logBase": 1,
  2360. "max": null,
  2361. "min": null,
  2362. "show": true
  2363. },
  2364. {
  2365. "format": "short",
  2366. "label": null,
  2367. "logBase": 1,
  2368. "max": null,
  2369. "min": null,
  2370. "show": true
  2371. }
  2372. ],
  2373. "yaxis": {
  2374. "align": false,
  2375. "alignLevel": null
  2376. }
  2377. },
  2378. {
  2379. "alert": {
  2380. "conditions": [
  2381. {
  2382. "evaluator": {
  2383. "params": [
  2384. 15
  2385. ],
  2386. "type": "gt"
  2387. },
  2388. "operator": {
  2389. "type": "and"
  2390. },
  2391. "query": {
  2392. "params": [
  2393. "A",
  2394. "1m",
  2395. "now"
  2396. ]
  2397. },
  2398. "reducer": {
  2399. "params": [],
  2400. "type": "avg"
  2401. },
  2402. "type": "query"
  2403. }
  2404. ],
  2405. "executionErrorState": "keep_state",
  2406. "frequency": "60s",
  2407. "handler": 1,
  2408. "name": "load 1m alert",
  2409. "noDataState": "ok",
  2410. "notifications": []
  2411. },
  2412. "aliasColors": {},
  2413. "bars": false,
  2414. "dashLength": 10,
  2415. "dashes": false,
  2416. "datasource": "Prometheus-hw-cce-prod",
  2417. "fieldConfig": {
  2418. "defaults": {
  2419. "custom": {}
  2420. },
  2421. "overrides": []
  2422. },
  2423. "fill": 1,
  2424. "fillGradient": 0,
  2425. "gridPos": {
  2426. "h": 5,
  2427. "w": 8,
  2428. "x": 8,
  2429. "y": 3
  2430. },
  2431. "hiddenSeries": false,
  2432. "id": 86,
  2433. "legend": {
  2434. "avg": false,
  2435. "current": false,
  2436. "max": false,
  2437. "min": false,
  2438. "show": true,
  2439. "total": false,
  2440. "values": false
  2441. },
  2442. "lines": true,
  2443. "linewidth": 1,
  2444. "links": [],
  2445. "nullPointMode": "null",
  2446. "options": {
  2447. "alertThreshold": true
  2448. },
  2449. "percentage": false,
  2450. "pluginVersion": "7.4.3",
  2451. "pointradius": 5,
  2452. "points": false,
  2453. "renderer": "flot",
  2454. "seriesOverrides": [],
  2455. "spaceLength": 10,
  2456. "stack": false,
  2457. "steppedLine": false,
  2458. "targets": [
  2459. {
  2460. "expr": "max(system_load_average_1m)",
  2461. "format": "time_series",
  2462. "intervalFactor": 1,
  2463. "refId": "A"
  2464. }
  2465. ],
  2466. "thresholds": [
  2467. {
  2468. "colorMode": "critical",
  2469. "fill": true,
  2470. "line": true,
  2471. "op": "gt",
  2472. "value": 15,
  2473. "visible": true
  2474. }
  2475. ],
  2476. "timeFrom": null,
  2477. "timeRegions": [],
  2478. "timeShift": null,
  2479. "title": "load alert",
  2480. "tooltip": {
  2481. "shared": true,
  2482. "sort": 0,
  2483. "value_type": "individual"
  2484. },
  2485. "type": "graph",
  2486. "xaxis": {
  2487. "buckets": null,
  2488. "mode": "time",
  2489. "name": null,
  2490. "show": true,
  2491. "values": []
  2492. },
  2493. "yaxes": [
  2494. {
  2495. "format": "short",
  2496. "label": null,
  2497. "logBase": 1,
  2498. "max": null,
  2499. "min": null,
  2500. "show": true
  2501. },
  2502. {
  2503. "format": "short",
  2504. "label": null,
  2505. "logBase": 1,
  2506. "max": null,
  2507. "min": null,
  2508. "show": true
  2509. }
  2510. ],
  2511. "yaxis": {
  2512. "align": false,
  2513. "alignLevel": null
  2514. }
  2515. },
  2516. {
  2517. "alert": {
  2518. "conditions": [
  2519. {
  2520. "evaluator": {
  2521. "params": [
  2522. 60
  2523. ],
  2524. "type": "gt"
  2525. },
  2526. "operator": {
  2527. "type": "and"
  2528. },
  2529. "query": {
  2530. "params": [
  2531. "A",
  2532. "5m",
  2533. "now"
  2534. ]
  2535. },
  2536. "reducer": {
  2537. "params": [],
  2538. "type": "avg"
  2539. },
  2540. "type": "query"
  2541. }
  2542. ],
  2543. "executionErrorState": "keep_state",
  2544. "frequency": "60s",
  2545. "handler": 1,
  2546. "name": "memory alert",
  2547. "noDataState": "ok",
  2548. "notifications": []
  2549. },
  2550. "aliasColors": {},
  2551. "bars": false,
  2552. "dashLength": 10,
  2553. "dashes": false,
  2554. "datasource": "Prometheus-hw-cce-prod",
  2555. "fieldConfig": {
  2556. "defaults": {
  2557. "custom": {}
  2558. },
  2559. "overrides": []
  2560. },
  2561. "fill": 1,
  2562. "fillGradient": 0,
  2563. "gridPos": {
  2564. "h": 5,
  2565. "w": 8,
  2566. "x": 16,
  2567. "y": 3
  2568. },
  2569. "hiddenSeries": false,
  2570. "id": 46,
  2571. "legend": {
  2572. "avg": false,
  2573. "current": false,
  2574. "max": false,
  2575. "min": false,
  2576. "show": true,
  2577. "total": false,
  2578. "values": false
  2579. },
  2580. "lines": true,
  2581. "linewidth": 1,
  2582. "links": [],
  2583. "nullPointMode": "null",
  2584. "options": {
  2585. "alertThreshold": true
  2586. },
  2587. "percentage": false,
  2588. "pluginVersion": "7.4.3",
  2589. "pointradius": 5,
  2590. "points": false,
  2591. "renderer": "flot",
  2592. "seriesOverrides": [],
  2593. "spaceLength": 10,
  2594. "stack": false,
  2595. "steppedLine": false,
  2596. "targets": [
  2597. {
  2598. "expr": "sum(jvm_memory_used_bytes{area=\"heap\"})/sum(jvm_memory_max_bytes{area=\"heap\"}) * 100",
  2599. "format": "time_series",
  2600. "intervalFactor": 1,
  2601. "refId": "A"
  2602. }
  2603. ],
  2604. "thresholds": [
  2605. {
  2606. "colorMode": "critical",
  2607. "fill": true,
  2608. "line": true,
  2609. "op": "gt",
  2610. "value": 60,
  2611. "visible": true
  2612. }
  2613. ],
  2614. "timeFrom": null,
  2615. "timeRegions": [],
  2616. "timeShift": null,
  2617. "title": "memory alert",
  2618. "tooltip": {
  2619. "shared": true,
  2620. "sort": 0,
  2621. "value_type": "individual"
  2622. },
  2623. "type": "graph",
  2624. "xaxis": {
  2625. "buckets": null,
  2626. "mode": "time",
  2627. "name": null,
  2628. "show": true,
  2629. "values": []
  2630. },
  2631. "yaxes": [
  2632. {
  2633. "format": "short",
  2634. "label": null,
  2635. "logBase": 1,
  2636. "max": null,
  2637. "min": null,
  2638. "show": true
  2639. },
  2640. {
  2641. "format": "short",
  2642. "label": null,
  2643. "logBase": 1,
  2644. "max": null,
  2645. "min": null,
  2646. "show": true
  2647. }
  2648. ],
  2649. "yaxis": {
  2650. "align": false,
  2651. "alignLevel": null
  2652. }
  2653. },
  2654. {
  2655. "alert": {
  2656. "conditions": [
  2657. {
  2658. "evaluator": {
  2659. "params": [
  2660. 500
  2661. ],
  2662. "type": "gt"
  2663. },
  2664. "operator": {
  2665. "type": "and"
  2666. },
  2667. "query": {
  2668. "params": [
  2669. "A",
  2670. "1m",
  2671. "now"
  2672. ]
  2673. },
  2674. "reducer": {
  2675. "params": [],
  2676. "type": "avg"
  2677. },
  2678. "type": "query"
  2679. }
  2680. ],
  2681. "executionErrorState": "keep_state",
  2682. "frequency": "60s",
  2683. "handler": 1,
  2684. "name": "threads alert",
  2685. "noDataState": "ok",
  2686. "notifications": []
  2687. },
  2688. "aliasColors": {},
  2689. "bars": false,
  2690. "dashLength": 10,
  2691. "dashes": false,
  2692. "datasource": "Prometheus-hw-cce-prod",
  2693. "fieldConfig": {
  2694. "defaults": {
  2695. "custom": {}
  2696. },
  2697. "overrides": []
  2698. },
  2699. "fill": 1,
  2700. "fillGradient": 0,
  2701. "gridPos": {
  2702. "h": 5,
  2703. "w": 8,
  2704. "x": 0,
  2705. "y": 8
  2706. },
  2707. "hiddenSeries": false,
  2708. "id": 39,
  2709. "legend": {
  2710. "avg": false,
  2711. "current": false,
  2712. "max": false,
  2713. "min": false,
  2714. "show": true,
  2715. "total": false,
  2716. "values": false
  2717. },
  2718. "lines": true,
  2719. "linewidth": 1,
  2720. "links": [],
  2721. "nullPointMode": "null",
  2722. "options": {
  2723. "alertThreshold": true
  2724. },
  2725. "percentage": false,
  2726. "pluginVersion": "7.4.3",
  2727. "pointradius": 5,
  2728. "points": false,
  2729. "renderer": "flot",
  2730. "seriesOverrides": [],
  2731. "spaceLength": 10,
  2732. "stack": false,
  2733. "steppedLine": false,
  2734. "targets": [
  2735. {
  2736. "expr": "max(jvm_threads_daemon_threads)",
  2737. "format": "time_series",
  2738. "intervalFactor": 1,
  2739. "refId": "A"
  2740. }
  2741. ],
  2742. "thresholds": [
  2743. {
  2744. "colorMode": "critical",
  2745. "fill": true,
  2746. "line": true,
  2747. "op": "gt",
  2748. "value": 500,
  2749. "visible": true
  2750. }
  2751. ],
  2752. "timeFrom": null,
  2753. "timeRegions": [],
  2754. "timeShift": null,
  2755. "title": "threads alert",
  2756. "tooltip": {
  2757. "shared": true,
  2758. "sort": 0,
  2759. "value_type": "individual"
  2760. },
  2761. "type": "graph",
  2762. "xaxis": {
  2763. "buckets": null,
  2764. "mode": "time",
  2765. "name": null,
  2766. "show": true,
  2767. "values": []
  2768. },
  2769. "yaxes": [
  2770. {
  2771. "format": "short",
  2772. "label": null,
  2773. "logBase": 1,
  2774. "max": null,
  2775. "min": null,
  2776. "show": true
  2777. },
  2778. {
  2779. "format": "short",
  2780. "label": null,
  2781. "logBase": 1,
  2782. "max": null,
  2783. "min": null,
  2784. "show": true
  2785. }
  2786. ],
  2787. "yaxis": {
  2788. "align": false,
  2789. "alignLevel": null
  2790. }
  2791. },
  2792. {
  2793. "alert": {
  2794. "conditions": [
  2795. {
  2796. "evaluator": {
  2797. "params": [
  2798. 5
  2799. ],
  2800. "type": "gt"
  2801. },
  2802. "operator": {
  2803. "type": "and"
  2804. },
  2805. "query": {
  2806. "params": [
  2807. "A",
  2808. "1m",
  2809. "now"
  2810. ]
  2811. },
  2812. "reducer": {
  2813. "params": [],
  2814. "type": "avg"
  2815. },
  2816. "type": "query"
  2817. }
  2818. ],
  2819. "executionErrorState": "keep_state",
  2820. "for": "1m",
  2821. "frequency": "1m",
  2822. "handler": 1,
  2823. "message": "too many full gc",
  2824. "name": "gc alert",
  2825. "noDataState": "ok",
  2826. "notifications": [
  2827. {
  2828. "id": 1
  2829. }
  2830. ]
  2831. },
  2832. "aliasColors": {},
  2833. "bars": false,
  2834. "dashLength": 10,
  2835. "dashes": false,
  2836. "datasource": "Prometheus-hw-cce-prod",
  2837. "fieldConfig": {
  2838. "defaults": {
  2839. "custom": {}
  2840. },
  2841. "overrides": []
  2842. },
  2843. "fill": 1,
  2844. "fillGradient": 0,
  2845. "gridPos": {
  2846. "h": 5,
  2847. "w": 8,
  2848. "x": 8,
  2849. "y": 8
  2850. },
  2851. "hiddenSeries": false,
  2852. "id": 38,
  2853. "legend": {
  2854. "avg": false,
  2855. "current": false,
  2856. "max": false,
  2857. "min": false,
  2858. "show": true,
  2859. "total": false,
  2860. "values": false
  2861. },
  2862. "lines": true,
  2863. "linewidth": 1,
  2864. "links": [],
  2865. "nullPointMode": "null",
  2866. "options": {
  2867. "alertThreshold": true
  2868. },
  2869. "percentage": false,
  2870. "pluginVersion": "7.4.3",
  2871. "pointradius": 5,
  2872. "points": false,
  2873. "renderer": "flot",
  2874. "seriesOverrides": [],
  2875. "spaceLength": 10,
  2876. "stack": false,
  2877. "steppedLine": false,
  2878. "targets": [
  2879. {
  2880. "expr": "max(rate(jvm_gc_pause_seconds_count{action=\"end of major GC\"}[5m])) * 300",
  2881. "format": "time_series",
  2882. "intervalFactor": 1,
  2883. "refId": "A"
  2884. }
  2885. ],
  2886. "thresholds": [
  2887. {
  2888. "colorMode": "critical",
  2889. "fill": true,
  2890. "line": true,
  2891. "op": "gt",
  2892. "value": 5,
  2893. "visible": true
  2894. }
  2895. ],
  2896. "timeFrom": null,
  2897. "timeRegions": [],
  2898. "timeShift": null,
  2899. "title": "gc alert",
  2900. "tooltip": {
  2901. "shared": true,
  2902. "sort": 0,
  2903. "value_type": "individual"
  2904. },
  2905. "type": "graph",
  2906. "xaxis": {
  2907. "buckets": null,
  2908. "mode": "time",
  2909. "name": null,
  2910. "show": true,
  2911. "values": []
  2912. },
  2913. "yaxes": [
  2914. {
  2915. "format": "short",
  2916. "label": null,
  2917. "logBase": 1,
  2918. "max": null,
  2919. "min": null,
  2920. "show": true
  2921. },
  2922. {
  2923. "format": "short",
  2924. "label": null,
  2925. "logBase": 1,
  2926. "max": null,
  2927. "min": null,
  2928. "show": true
  2929. }
  2930. ],
  2931. "yaxis": {
  2932. "align": false,
  2933. "alignLevel": null
  2934. }
  2935. },
  2936. {
  2937. "alert": {
  2938. "conditions": [
  2939. {
  2940. "evaluator": {
  2941. "params": [
  2942. 10
  2943. ],
  2944. "type": "gt"
  2945. },
  2946. "operator": {
  2947. "type": "and"
  2948. },
  2949. "query": {
  2950. "params": [
  2951. "A",
  2952. "1m",
  2953. "now"
  2954. ]
  2955. },
  2956. "reducer": {
  2957. "params": [],
  2958. "type": "avg"
  2959. },
  2960. "type": "query"
  2961. }
  2962. ],
  2963. "executionErrorState": "keep_state",
  2964. "frequency": "60s",
  2965. "handler": 1,
  2966. "name": "notify task alert",
  2967. "noDataState": "ok",
  2968. "notifications": []
  2969. },
  2970. "aliasColors": {},
  2971. "bars": false,
  2972. "dashLength": 10,
  2973. "dashes": false,
  2974. "datasource": "Prometheus-hw-cce-prod",
  2975. "fieldConfig": {
  2976. "defaults": {
  2977. "custom": {}
  2978. },
  2979. "overrides": []
  2980. },
  2981. "fill": 1,
  2982. "fillGradient": 0,
  2983. "gridPos": {
  2984. "h": 5,
  2985. "w": 8,
  2986. "x": 16,
  2987. "y": 8
  2988. },
  2989. "hiddenSeries": false,
  2990. "id": 49,
  2991. "legend": {
  2992. "avg": false,
  2993. "current": false,
  2994. "max": false,
  2995. "min": false,
  2996. "show": true,
  2997. "total": false,
  2998. "values": false
  2999. },
  3000. "lines": true,
  3001. "linewidth": 1,
  3002. "links": [],
  3003. "nullPointMode": "null",
  3004. "options": {
  3005. "alertThreshold": true
  3006. },
  3007. "percentage": false,
  3008. "pluginVersion": "7.4.3",
  3009. "pointradius": 5,
  3010. "points": false,
  3011. "renderer": "flot",
  3012. "seriesOverrides": [],
  3013. "spaceLength": 10,
  3014. "stack": false,
  3015. "steppedLine": false,
  3016. "targets": [
  3017. {
  3018. "expr": "sum(nacos_monitor{name='notifyTask'})",
  3019. "format": "time_series",
  3020. "intervalFactor": 1,
  3021. "refId": "A"
  3022. }
  3023. ],
  3024. "thresholds": [
  3025. {
  3026. "colorMode": "critical",
  3027. "fill": true,
  3028. "line": true,
  3029. "op": "gt",
  3030. "value": 10,
  3031. "visible": true
  3032. }
  3033. ],
  3034. "timeFrom": null,
  3035. "timeRegions": [],
  3036. "timeShift": null,
  3037. "title": "notify task alert",
  3038. "tooltip": {
  3039. "shared": true,
  3040. "sort": 0,
  3041. "value_type": "individual"
  3042. },
  3043. "type": "graph",
  3044. "xaxis": {
  3045. "buckets": null,
  3046. "mode": "time",
  3047. "name": null,
  3048. "show": true,
  3049. "values": []
  3050. },
  3051. "yaxes": [
  3052. {
  3053. "format": "short",
  3054. "label": null,
  3055. "logBase": 1,
  3056. "max": null,
  3057. "min": null,
  3058. "show": true
  3059. },
  3060. {
  3061. "format": "short",
  3062. "label": null,
  3063. "logBase": 1,
  3064. "max": null,
  3065. "min": null,
  3066. "show": true
  3067. }
  3068. ],
  3069. "yaxis": {
  3070. "align": false,
  3071. "alignLevel": null
  3072. }
  3073. },
  3074. {
  3075. "alert": {
  3076. "conditions": [
  3077. {
  3078. "evaluator": {
  3079. "params": [
  3080. 5000
  3081. ],
  3082. "type": "gt"
  3083. },
  3084. "operator": {
  3085. "type": "and"
  3086. },
  3087. "query": {
  3088. "params": [
  3089. "B",
  3090. "1m",
  3091. "now"
  3092. ]
  3093. },
  3094. "reducer": {
  3095. "params": [],
  3096. "type": "avg"
  3097. },
  3098. "type": "query"
  3099. }
  3100. ],
  3101. "executionErrorState": "keep_state",
  3102. "frequency": "60s",
  3103. "handler": 1,
  3104. "name": "rt alert",
  3105. "noDataState": "ok",
  3106. "notifications": []
  3107. },
  3108. "aliasColors": {},
  3109. "bars": false,
  3110. "dashLength": 10,
  3111. "dashes": false,
  3112. "datasource": "Prometheus-hw-cce-prod",
  3113. "fieldConfig": {
  3114. "defaults": {
  3115. "custom": {}
  3116. },
  3117. "overrides": []
  3118. },
  3119. "fill": 1,
  3120. "fillGradient": 0,
  3121. "gridPos": {
  3122. "h": 5,
  3123. "w": 8,
  3124. "x": 0,
  3125. "y": 13
  3126. },
  3127. "hiddenSeries": false,
  3128. "id": 85,
  3129. "legend": {
  3130. "avg": false,
  3131. "current": false,
  3132. "max": false,
  3133. "min": false,
  3134. "show": true,
  3135. "total": false,
  3136. "values": false
  3137. },
  3138. "lines": true,
  3139. "linewidth": 1,
  3140. "links": [],
  3141. "nullPointMode": "null",
  3142. "options": {
  3143. "alertThreshold": true
  3144. },
  3145. "percentage": false,
  3146. "pluginVersion": "7.4.3",
  3147. "pointradius": 5,
  3148. "points": false,
  3149. "renderer": "flot",
  3150. "seriesOverrides": [],
  3151. "spaceLength": 10,
  3152. "stack": false,
  3153. "steppedLine": false,
  3154. "targets": [
  3155. {
  3156. "expr": "sum(rate(http_server_requests_seconds_sum[1m]))/sum(rate(http_server_requests_seconds_count[1m])) * 1000",
  3157. "format": "time_series",
  3158. "hide": false,
  3159. "intervalFactor": 1,
  3160. "refId": "B"
  3161. }
  3162. ],
  3163. "thresholds": [
  3164. {
  3165. "colorMode": "critical",
  3166. "fill": true,
  3167. "line": true,
  3168. "op": "gt",
  3169. "value": 5000,
  3170. "visible": true
  3171. }
  3172. ],
  3173. "timeFrom": null,
  3174. "timeRegions": [],
  3175. "timeShift": null,
  3176. "title": "rt alert",
  3177. "tooltip": {
  3178. "shared": true,
  3179. "sort": 0,
  3180. "value_type": "individual"
  3181. },
  3182. "type": "graph",
  3183. "xaxis": {
  3184. "buckets": null,
  3185. "mode": "time",
  3186. "name": null,
  3187. "show": true,
  3188. "values": []
  3189. },
  3190. "yaxes": [
  3191. {
  3192. "format": "short",
  3193. "label": null,
  3194. "logBase": 1,
  3195. "max": null,
  3196. "min": null,
  3197. "show": true
  3198. },
  3199. {
  3200. "format": "short",
  3201. "label": null,
  3202. "logBase": 1,
  3203. "max": null,
  3204. "min": null,
  3205. "show": true
  3206. }
  3207. ],
  3208. "yaxis": {
  3209. "align": false,
  3210. "alignLevel": null
  3211. }
  3212. },
  3213. {
  3214. "alert": {
  3215. "conditions": [
  3216. {
  3217. "evaluator": {
  3218. "params": [
  3219. 5000
  3220. ],
  3221. "type": "gt"
  3222. },
  3223. "operator": {
  3224. "type": "and"
  3225. },
  3226. "query": {
  3227. "params": [
  3228. "A",
  3229. "1m",
  3230. "now"
  3231. ]
  3232. },
  3233. "reducer": {
  3234. "params": [],
  3235. "type": "avg"
  3236. },
  3237. "type": "query"
  3238. }
  3239. ],
  3240. "executionErrorState": "keep_state",
  3241. "frequency": "60s",
  3242. "handler": 1,
  3243. "name": "long polling alert",
  3244. "noDataState": "ok",
  3245. "notifications": []
  3246. },
  3247. "aliasColors": {},
  3248. "bars": false,
  3249. "dashLength": 10,
  3250. "dashes": false,
  3251. "datasource": "Prometheus-hw-cce-prod",
  3252. "fieldConfig": {
  3253. "defaults": {
  3254. "custom": {}
  3255. },
  3256. "overrides": []
  3257. },
  3258. "fill": 1,
  3259. "fillGradient": 0,
  3260. "gridPos": {
  3261. "h": 5,
  3262. "w": 8,
  3263. "x": 8,
  3264. "y": 13
  3265. },
  3266. "hiddenSeries": false,
  3267. "id": 84,
  3268. "legend": {
  3269. "avg": false,
  3270. "current": false,
  3271. "max": false,
  3272. "min": false,
  3273. "show": true,
  3274. "total": false,
  3275. "values": false
  3276. },
  3277. "lines": true,
  3278. "linewidth": 1,
  3279. "links": [],
  3280. "nullPointMode": "null",
  3281. "options": {
  3282. "alertThreshold": true
  3283. },
  3284. "percentage": false,
  3285. "pluginVersion": "7.4.3",
  3286. "pointradius": 5,
  3287. "points": false,
  3288. "renderer": "flot",
  3289. "repeatDirection": "h",
  3290. "seriesOverrides": [],
  3291. "spaceLength": 10,
  3292. "stack": false,
  3293. "steppedLine": false,
  3294. "targets": [
  3295. {
  3296. "expr": "max(nacos_monitor{name='longPolling'})",
  3297. "format": "time_series",
  3298. "intervalFactor": 1,
  3299. "legendFormat": "",
  3300. "refId": "A"
  3301. }
  3302. ],
  3303. "thresholds": [
  3304. {
  3305. "colorMode": "critical",
  3306. "fill": true,
  3307. "line": true,
  3308. "op": "gt",
  3309. "value": 5000,
  3310. "visible": true
  3311. }
  3312. ],
  3313. "timeFrom": null,
  3314. "timeRegions": [],
  3315. "timeShift": null,
  3316. "title": "long polling alert",
  3317. "tooltip": {
  3318. "shared": true,
  3319. "sort": 0,
  3320. "value_type": "individual"
  3321. },
  3322. "type": "graph",
  3323. "xaxis": {
  3324. "buckets": null,
  3325. "mode": "time",
  3326. "name": null,
  3327. "show": true,
  3328. "values": []
  3329. },
  3330. "yaxes": [
  3331. {
  3332. "format": "short",
  3333. "label": "",
  3334. "logBase": 1,
  3335. "max": null,
  3336. "min": null,
  3337. "show": true
  3338. },
  3339. {
  3340. "format": "short",
  3341. "label": null,
  3342. "logBase": 1,
  3343. "max": null,
  3344. "min": null,
  3345. "show": true
  3346. }
  3347. ],
  3348. "yaxis": {
  3349. "align": false,
  3350. "alignLevel": null
  3351. }
  3352. },
  3353. {
  3354. "alert": {
  3355. "conditions": [
  3356. {
  3357. "evaluator": {
  3358. "params": [
  3359. 1
  3360. ],
  3361. "type": "gt"
  3362. },
  3363. "operator": {
  3364. "type": "and"
  3365. },
  3366. "query": {
  3367. "params": [
  3368. "A",
  3369. "1m",
  3370. "now"
  3371. ]
  3372. },
  3373. "reducer": {
  3374. "params": [],
  3375. "type": "avg"
  3376. },
  3377. "type": "query"
  3378. }
  3379. ],
  3380. "executionErrorState": "keep_state",
  3381. "frequency": "60s",
  3382. "handler": 1,
  3383. "name": "config unhealth exception alert",
  3384. "noDataState": "ok",
  3385. "notifications": []
  3386. },
  3387. "aliasColors": {},
  3388. "bars": false,
  3389. "dashLength": 10,
  3390. "dashes": false,
  3391. "datasource": "Prometheus-hw-cce-prod",
  3392. "fieldConfig": {
  3393. "defaults": {
  3394. "custom": {}
  3395. },
  3396. "overrides": []
  3397. },
  3398. "fill": 1,
  3399. "fillGradient": 0,
  3400. "gridPos": {
  3401. "h": 5,
  3402. "w": 8,
  3403. "x": 16,
  3404. "y": 13
  3405. },
  3406. "hiddenSeries": false,
  3407. "id": 56,
  3408. "legend": {
  3409. "avg": false,
  3410. "current": false,
  3411. "max": false,
  3412. "min": false,
  3413. "show": true,
  3414. "total": false,
  3415. "values": false
  3416. },
  3417. "lines": true,
  3418. "linewidth": 1,
  3419. "links": [],
  3420. "nullPointMode": "null",
  3421. "options": {
  3422. "alertThreshold": true
  3423. },
  3424. "percentage": false,
  3425. "pluginVersion": "7.4.3",
  3426. "pointradius": 5,
  3427. "points": false,
  3428. "renderer": "flot",
  3429. "seriesOverrides": [],
  3430. "spaceLength": 10,
  3431. "stack": false,
  3432. "steppedLine": false,
  3433. "targets": [
  3434. {
  3435. "expr": "sum(rate(nacos_exception_total{name='unhealth'}[1m])) * 60",
  3436. "format": "time_series",
  3437. "intervalFactor": 1,
  3438. "refId": "A"
  3439. }
  3440. ],
  3441. "thresholds": [
  3442. {
  3443. "colorMode": "critical",
  3444. "fill": true,
  3445. "line": true,
  3446. "op": "gt",
  3447. "value": 1,
  3448. "visible": true
  3449. }
  3450. ],
  3451. "timeFrom": null,
  3452. "timeRegions": [],
  3453. "timeShift": null,
  3454. "title": "config unhealth exception alert",
  3455. "tooltip": {
  3456. "shared": true,
  3457. "sort": 0,
  3458. "value_type": "individual"
  3459. },
  3460. "type": "graph",
  3461. "xaxis": {
  3462. "buckets": null,
  3463. "mode": "time",
  3464. "name": null,
  3465. "show": true,
  3466. "values": []
  3467. },
  3468. "yaxes": [
  3469. {
  3470. "format": "short",
  3471. "label": null,
  3472. "logBase": 1,
  3473. "max": null,
  3474. "min": null,
  3475. "show": true
  3476. },
  3477. {
  3478. "format": "short",
  3479. "label": null,
  3480. "logBase": 1,
  3481. "max": null,
  3482. "min": null,
  3483. "show": true
  3484. }
  3485. ],
  3486. "yaxis": {
  3487. "align": false,
  3488. "alignLevel": null
  3489. }
  3490. },
  3491. {
  3492. "alert": {
  3493. "conditions": [
  3494. {
  3495. "evaluator": {
  3496. "params": [
  3497. 1
  3498. ],
  3499. "type": "gt"
  3500. },
  3501. "operator": {
  3502. "type": "and"
  3503. },
  3504. "query": {
  3505. "params": [
  3506. "A",
  3507. "1m",
  3508. "now"
  3509. ]
  3510. },
  3511. "reducer": {
  3512. "params": [],
  3513. "type": "avg"
  3514. },
  3515. "type": "query"
  3516. }
  3517. ],
  3518. "executionErrorState": "keep_state",
  3519. "frequency": "60s",
  3520. "handler": 1,
  3521. "name": "db exception alert",
  3522. "noDataState": "ok",
  3523. "notifications": []
  3524. },
  3525. "aliasColors": {},
  3526. "bars": false,
  3527. "dashLength": 10,
  3528. "dashes": false,
  3529. "datasource": "Prometheus-hw-cce-prod",
  3530. "fieldConfig": {
  3531. "defaults": {
  3532. "custom": {}
  3533. },
  3534. "overrides": []
  3535. },
  3536. "fill": 1,
  3537. "fillGradient": 0,
  3538. "gridPos": {
  3539. "h": 5,
  3540. "w": 8,
  3541. "x": 0,
  3542. "y": 18
  3543. },
  3544. "hiddenSeries": false,
  3545. "id": 54,
  3546. "legend": {
  3547. "avg": false,
  3548. "current": false,
  3549. "max": false,
  3550. "min": false,
  3551. "show": true,
  3552. "total": false,
  3553. "values": false
  3554. },
  3555. "lines": true,
  3556. "linewidth": 1,
  3557. "links": [],
  3558. "nullPointMode": "null",
  3559. "options": {
  3560. "alertThreshold": true
  3561. },
  3562. "percentage": false,
  3563. "pluginVersion": "7.4.3",
  3564. "pointradius": 5,
  3565. "points": false,
  3566. "renderer": "flot",
  3567. "seriesOverrides": [],
  3568. "spaceLength": 10,
  3569. "stack": false,
  3570. "steppedLine": false,
  3571. "targets": [
  3572. {
  3573. "expr": "sum(rate(nacos_exception_total{name='db'}[1m])) * 60",
  3574. "format": "time_series",
  3575. "intervalFactor": 1,
  3576. "refId": "A"
  3577. }
  3578. ],
  3579. "thresholds": [
  3580. {
  3581. "colorMode": "critical",
  3582. "fill": true,
  3583. "line": true,
  3584. "op": "gt",
  3585. "value": 1,
  3586. "visible": true
  3587. }
  3588. ],
  3589. "timeFrom": null,
  3590. "timeRegions": [],
  3591. "timeShift": null,
  3592. "title": "db exception alert",
  3593. "tooltip": {
  3594. "shared": true,
  3595. "sort": 0,
  3596. "value_type": "individual"
  3597. },
  3598. "type": "graph",
  3599. "xaxis": {
  3600. "buckets": null,
  3601. "mode": "time",
  3602. "name": null,
  3603. "show": true,
  3604. "values": []
  3605. },
  3606. "yaxes": [
  3607. {
  3608. "format": "short",
  3609. "label": null,
  3610. "logBase": 1,
  3611. "max": null,
  3612. "min": null,
  3613. "show": true
  3614. },
  3615. {
  3616. "format": "short",
  3617. "label": null,
  3618. "logBase": 1,
  3619. "max": null,
  3620. "min": null,
  3621. "show": true
  3622. }
  3623. ],
  3624. "yaxis": {
  3625. "align": false,
  3626. "alignLevel": null
  3627. }
  3628. },
  3629. {
  3630. "alert": {
  3631. "conditions": [
  3632. {
  3633. "evaluator": {
  3634. "params": [
  3635. 1
  3636. ],
  3637. "type": "gt"
  3638. },
  3639. "operator": {
  3640. "type": "and"
  3641. },
  3642. "query": {
  3643. "params": [
  3644. "A",
  3645. "1m",
  3646. "now"
  3647. ]
  3648. },
  3649. "reducer": {
  3650. "params": [],
  3651. "type": "avg"
  3652. },
  3653. "type": "query"
  3654. }
  3655. ],
  3656. "executionErrorState": "keep_state",
  3657. "frequency": "60s",
  3658. "handler": 1,
  3659. "name": "failedPush alert",
  3660. "noDataState": "ok",
  3661. "notifications": []
  3662. },
  3663. "aliasColors": {},
  3664. "bars": false,
  3665. "dashLength": 10,
  3666. "dashes": false,
  3667. "datasource": "Prometheus-hw-cce-prod",
  3668. "fieldConfig": {
  3669. "defaults": {
  3670. "custom": {}
  3671. },
  3672. "overrides": []
  3673. },
  3674. "fill": 1,
  3675. "fillGradient": 0,
  3676. "gridPos": {
  3677. "h": 5,
  3678. "w": 8,
  3679. "x": 8,
  3680. "y": 18
  3681. },
  3682. "hiddenSeries": false,
  3683. "id": 51,
  3684. "legend": {
  3685. "avg": false,
  3686. "current": false,
  3687. "max": false,
  3688. "min": false,
  3689. "show": true,
  3690. "total": false,
  3691. "values": false
  3692. },
  3693. "lines": true,
  3694. "linewidth": 1,
  3695. "links": [],
  3696. "nullPointMode": "null",
  3697. "options": {
  3698. "alertThreshold": true
  3699. },
  3700. "percentage": false,
  3701. "pluginVersion": "7.4.3",
  3702. "pointradius": 5,
  3703. "points": false,
  3704. "renderer": "flot",
  3705. "seriesOverrides": [],
  3706. "spaceLength": 10,
  3707. "stack": false,
  3708. "steppedLine": false,
  3709. "targets": [
  3710. {
  3711. "expr": "sum(nacos_monitor{name='failedPush'})",
  3712. "format": "time_series",
  3713. "intervalFactor": 1,
  3714. "refId": "A"
  3715. }
  3716. ],
  3717. "thresholds": [
  3718. {
  3719. "colorMode": "critical",
  3720. "fill": true,
  3721. "line": true,
  3722. "op": "gt",
  3723. "value": 1,
  3724. "visible": true
  3725. }
  3726. ],
  3727. "timeFrom": null,
  3728. "timeRegions": [],
  3729. "timeShift": null,
  3730. "title": "failed push alert",
  3731. "tooltip": {
  3732. "shared": true,
  3733. "sort": 0,
  3734. "value_type": "individual"
  3735. },
  3736. "type": "graph",
  3737. "xaxis": {
  3738. "buckets": null,
  3739. "mode": "time",
  3740. "name": null,
  3741. "show": true,
  3742. "values": []
  3743. },
  3744. "yaxes": [
  3745. {
  3746. "format": "short",
  3747. "label": null,
  3748. "logBase": 1,
  3749. "max": null,
  3750. "min": null,
  3751. "show": true
  3752. },
  3753. {
  3754. "format": "short",
  3755. "label": null,
  3756. "logBase": 1,
  3757. "max": null,
  3758. "min": null,
  3759. "show": true
  3760. }
  3761. ],
  3762. "yaxis": {
  3763. "align": false,
  3764. "alignLevel": null
  3765. }
  3766. },
  3767. {
  3768. "alert": {
  3769. "conditions": [
  3770. {
  3771. "evaluator": {
  3772. "params": [
  3773. 1
  3774. ],
  3775. "type": "gt"
  3776. },
  3777. "operator": {
  3778. "type": "and"
  3779. },
  3780. "query": {
  3781. "params": [
  3782. "A",
  3783. "1m",
  3784. "now"
  3785. ]
  3786. },
  3787. "reducer": {
  3788. "params": [],
  3789. "type": "avg"
  3790. },
  3791. "type": "query"
  3792. }
  3793. ],
  3794. "executionErrorState": "keep_state",
  3795. "frequency": "60s",
  3796. "handler": 1,
  3797. "name": "illegalArgument exception alert",
  3798. "noDataState": "ok",
  3799. "notifications": []
  3800. },
  3801. "aliasColors": {},
  3802. "bars": false,
  3803. "dashLength": 10,
  3804. "dashes": false,
  3805. "datasource": "Prometheus-hw-cce-prod",
  3806. "fieldConfig": {
  3807. "defaults": {
  3808. "custom": {}
  3809. },
  3810. "overrides": []
  3811. },
  3812. "fill": 1,
  3813. "fillGradient": 0,
  3814. "gridPos": {
  3815. "h": 5,
  3816. "w": 8,
  3817. "x": 16,
  3818. "y": 18
  3819. },
  3820. "hiddenSeries": false,
  3821. "id": 59,
  3822. "legend": {
  3823. "avg": false,
  3824. "current": false,
  3825. "max": false,
  3826. "min": false,
  3827. "show": true,
  3828. "total": false,
  3829. "values": false
  3830. },
  3831. "lines": true,
  3832. "linewidth": 1,
  3833. "links": [],
  3834. "nullPointMode": "null",
  3835. "options": {
  3836. "alertThreshold": true
  3837. },
  3838. "percentage": false,
  3839. "pluginVersion": "7.4.3",
  3840. "pointradius": 5,
  3841. "points": false,
  3842. "renderer": "flot",
  3843. "seriesOverrides": [],
  3844. "spaceLength": 10,
  3845. "stack": false,
  3846. "steppedLine": false,
  3847. "targets": [
  3848. {
  3849. "expr": "sum(rate(nacos_exception_total{name='illegalArgument'}[1m])) * 60",
  3850. "format": "time_series",
  3851. "intervalFactor": 1,
  3852. "refId": "A"
  3853. }
  3854. ],
  3855. "thresholds": [
  3856. {
  3857. "colorMode": "critical",
  3858. "fill": true,
  3859. "line": true,
  3860. "op": "gt",
  3861. "value": 1,
  3862. "visible": true
  3863. }
  3864. ],
  3865. "timeFrom": null,
  3866. "timeRegions": [],
  3867. "timeShift": null,
  3868. "title": "illegalArgument exception alert",
  3869. "tooltip": {
  3870. "shared": true,
  3871. "sort": 0,
  3872. "value_type": "individual"
  3873. },
  3874. "type": "graph",
  3875. "xaxis": {
  3876. "buckets": null,
  3877. "mode": "time",
  3878. "name": null,
  3879. "show": true,
  3880. "values": []
  3881. },
  3882. "yaxes": [
  3883. {
  3884. "format": "short",
  3885. "label": null,
  3886. "logBase": 1,
  3887. "max": null,
  3888. "min": null,
  3889. "show": true
  3890. },
  3891. {
  3892. "format": "short",
  3893. "label": null,
  3894. "logBase": 1,
  3895. "max": null,
  3896. "min": null,
  3897. "show": true
  3898. }
  3899. ],
  3900. "yaxis": {
  3901. "align": false,
  3902. "alignLevel": null
  3903. }
  3904. },
  3905. {
  3906. "alert": {
  3907. "conditions": [
  3908. {
  3909. "evaluator": {
  3910. "params": [
  3911. 1
  3912. ],
  3913. "type": "gt"
  3914. },
  3915. "operator": {
  3916. "type": "and"
  3917. },
  3918. "query": {
  3919. "params": [
  3920. "A",
  3921. "5m",
  3922. "now"
  3923. ]
  3924. },
  3925. "reducer": {
  3926. "params": [],
  3927. "type": "avg"
  3928. },
  3929. "type": "query"
  3930. }
  3931. ],
  3932. "executionErrorState": "keep_state",
  3933. "frequency": "60s",
  3934. "handler": 1,
  3935. "name": "naming disk exception alert",
  3936. "noDataState": "ok",
  3937. "notifications": []
  3938. },
  3939. "aliasColors": {},
  3940. "bars": false,
  3941. "dashLength": 10,
  3942. "dashes": false,
  3943. "datasource": "Prometheus-hw-cce-prod",
  3944. "fieldConfig": {
  3945. "defaults": {
  3946. "custom": {}
  3947. },
  3948. "overrides": []
  3949. },
  3950. "fill": 1,
  3951. "fillGradient": 0,
  3952. "gridPos": {
  3953. "h": 5,
  3954. "w": 8,
  3955. "x": 0,
  3956. "y": 23
  3957. },
  3958. "hiddenSeries": false,
  3959. "id": 57,
  3960. "legend": {
  3961. "avg": false,
  3962. "current": false,
  3963. "max": false,
  3964. "min": false,
  3965. "show": true,
  3966. "total": false,
  3967. "values": false
  3968. },
  3969. "lines": true,
  3970. "linewidth": 1,
  3971. "links": [],
  3972. "nullPointMode": "null",
  3973. "options": {
  3974. "alertThreshold": true
  3975. },
  3976. "percentage": false,
  3977. "pluginVersion": "7.4.3",
  3978. "pointradius": 5,
  3979. "points": false,
  3980. "renderer": "flot",
  3981. "seriesOverrides": [],
  3982. "spaceLength": 10,
  3983. "stack": false,
  3984. "steppedLine": false,
  3985. "targets": [
  3986. {
  3987. "expr": "sum(rate(nacos_exception_total{name='disk'}[1m])) * 60",
  3988. "format": "time_series",
  3989. "intervalFactor": 1,
  3990. "refId": "A"
  3991. }
  3992. ],
  3993. "thresholds": [
  3994. {
  3995. "colorMode": "critical",
  3996. "fill": true,
  3997. "line": true,
  3998. "op": "gt",
  3999. "value": 1,
  4000. "visible": true
  4001. }
  4002. ],
  4003. "timeFrom": null,
  4004. "timeRegions": [],
  4005. "timeShift": null,
  4006. "title": "naming disk exception alert",
  4007. "tooltip": {
  4008. "shared": true,
  4009. "sort": 0,
  4010. "value_type": "individual"
  4011. },
  4012. "type": "graph",
  4013. "xaxis": {
  4014. "buckets": null,
  4015. "mode": "time",
  4016. "name": null,
  4017. "show": true,
  4018. "values": []
  4019. },
  4020. "yaxes": [
  4021. {
  4022. "format": "short",
  4023. "label": null,
  4024. "logBase": 1,
  4025. "max": null,
  4026. "min": null,
  4027. "show": true
  4028. },
  4029. {
  4030. "format": "short",
  4031. "label": null,
  4032. "logBase": 1,
  4033. "max": null,
  4034. "min": null,
  4035. "show": true
  4036. }
  4037. ],
  4038. "yaxis": {
  4039. "align": false,
  4040. "alignLevel": null
  4041. }
  4042. },
  4043. {
  4044. "alert": {
  4045. "conditions": [
  4046. {
  4047. "evaluator": {
  4048. "params": [
  4049. 1
  4050. ],
  4051. "type": "gt"
  4052. },
  4053. "operator": {
  4054. "type": "and"
  4055. },
  4056. "query": {
  4057. "params": [
  4058. "A",
  4059. "1m",
  4060. "now"
  4061. ]
  4062. },
  4063. "reducer": {
  4064. "params": [],
  4065. "type": "avg"
  4066. },
  4067. "type": "query"
  4068. }
  4069. ],
  4070. "executionErrorState": "keep_state",
  4071. "frequency": "60s",
  4072. "handler": 1,
  4073. "name": "config notify exception alert",
  4074. "noDataState": "ok",
  4075. "notifications": []
  4076. },
  4077. "aliasColors": {},
  4078. "bars": false,
  4079. "dashLength": 10,
  4080. "dashes": false,
  4081. "datasource": "Prometheus-hw-cce-prod",
  4082. "fieldConfig": {
  4083. "defaults": {
  4084. "custom": {}
  4085. },
  4086. "overrides": []
  4087. },
  4088. "fill": 1,
  4089. "fillGradient": 0,
  4090. "gridPos": {
  4091. "h": 5,
  4092. "w": 8,
  4093. "x": 8,
  4094. "y": 23
  4095. },
  4096. "hiddenSeries": false,
  4097. "id": 55,
  4098. "legend": {
  4099. "avg": false,
  4100. "current": false,
  4101. "max": false,
  4102. "min": false,
  4103. "show": true,
  4104. "total": false,
  4105. "values": false
  4106. },
  4107. "lines": true,
  4108. "linewidth": 1,
  4109. "links": [],
  4110. "nullPointMode": "null",
  4111. "options": {
  4112. "alertThreshold": true
  4113. },
  4114. "percentage": false,
  4115. "pluginVersion": "7.4.3",
  4116. "pointradius": 5,
  4117. "points": false,
  4118. "renderer": "flot",
  4119. "seriesOverrides": [],
  4120. "spaceLength": 10,
  4121. "stack": false,
  4122. "steppedLine": false,
  4123. "targets": [
  4124. {
  4125. "expr": "sum(rate(nacos_exception_total{name='configNotify'}[1m])) * 60",
  4126. "format": "time_series",
  4127. "intervalFactor": 1,
  4128. "refId": "A"
  4129. }
  4130. ],
  4131. "thresholds": [
  4132. {
  4133. "colorMode": "critical",
  4134. "fill": true,
  4135. "line": true,
  4136. "op": "gt",
  4137. "value": 1,
  4138. "visible": true
  4139. }
  4140. ],
  4141. "timeFrom": null,
  4142. "timeRegions": [],
  4143. "timeShift": null,
  4144. "title": "config notify exception alert",
  4145. "tooltip": {
  4146. "shared": true,
  4147. "sort": 0,
  4148. "value_type": "individual"
  4149. },
  4150. "type": "graph",
  4151. "xaxis": {
  4152. "buckets": null,
  4153. "mode": "time",
  4154. "name": null,
  4155. "show": true,
  4156. "values": []
  4157. },
  4158. "yaxes": [
  4159. {
  4160. "format": "short",
  4161. "label": null,
  4162. "logBase": 1,
  4163. "max": null,
  4164. "min": null,
  4165. "show": true
  4166. },
  4167. {
  4168. "format": "short",
  4169. "label": null,
  4170. "logBase": 1,
  4171. "max": null,
  4172. "min": null,
  4173. "show": true
  4174. }
  4175. ],
  4176. "yaxis": {
  4177. "align": false,
  4178. "alignLevel": null
  4179. }
  4180. },
  4181. {
  4182. "alert": {
  4183. "conditions": [
  4184. {
  4185. "evaluator": {
  4186. "params": [
  4187. 1
  4188. ],
  4189. "type": "gt"
  4190. },
  4191. "operator": {
  4192. "type": "and"
  4193. },
  4194. "query": {
  4195. "params": [
  4196. "A",
  4197. "1m",
  4198. "now"
  4199. ]
  4200. },
  4201. "reducer": {
  4202. "params": [],
  4203. "type": "avg"
  4204. },
  4205. "type": "query"
  4206. }
  4207. ],
  4208. "executionErrorState": "keep_state",
  4209. "frequency": "60s",
  4210. "handler": 1,
  4211. "name": "naming leader send beat failed exception alert",
  4212. "noDataState": "ok",
  4213. "notifications": []
  4214. },
  4215. "aliasColors": {},
  4216. "bars": false,
  4217. "dashLength": 10,
  4218. "dashes": false,
  4219. "datasource": "Prometheus-hw-cce-prod",
  4220. "fieldConfig": {
  4221. "defaults": {
  4222. "custom": {}
  4223. },
  4224. "overrides": []
  4225. },
  4226. "fill": 1,
  4227. "fillGradient": 0,
  4228. "gridPos": {
  4229. "h": 5,
  4230. "w": 8,
  4231. "x": 16,
  4232. "y": 23
  4233. },
  4234. "hiddenSeries": false,
  4235. "id": 58,
  4236. "legend": {
  4237. "avg": false,
  4238. "current": false,
  4239. "max": false,
  4240. "min": false,
  4241. "show": true,
  4242. "total": false,
  4243. "values": false
  4244. },
  4245. "lines": true,
  4246. "linewidth": 1,
  4247. "links": [],
  4248. "nullPointMode": "null",
  4249. "options": {
  4250. "alertThreshold": true
  4251. },
  4252. "percentage": false,
  4253. "pluginVersion": "7.4.3",
  4254. "pointradius": 5,
  4255. "points": false,
  4256. "renderer": "flot",
  4257. "seriesOverrides": [],
  4258. "spaceLength": 10,
  4259. "stack": false,
  4260. "steppedLine": false,
  4261. "targets": [
  4262. {
  4263. "expr": "sum(rate(nacos_exception_total{name='leaderSendBeatFailed'}[1m])) * 60",
  4264. "format": "time_series",
  4265. "intervalFactor": 1,
  4266. "refId": "A"
  4267. }
  4268. ],
  4269. "thresholds": [
  4270. {
  4271. "colorMode": "critical",
  4272. "fill": true,
  4273. "line": true,
  4274. "op": "gt",
  4275. "value": 1,
  4276. "visible": true
  4277. }
  4278. ],
  4279. "timeFrom": null,
  4280. "timeRegions": [],
  4281. "timeShift": null,
  4282. "title": "naming leader send beat failed exception alert",
  4283. "tooltip": {
  4284. "shared": true,
  4285. "sort": 0,
  4286. "value_type": "individual"
  4287. },
  4288. "type": "graph",
  4289. "xaxis": {
  4290. "buckets": null,
  4291. "mode": "time",
  4292. "name": null,
  4293. "show": true,
  4294. "values": []
  4295. },
  4296. "yaxes": [
  4297. {
  4298. "format": "short",
  4299. "label": null,
  4300. "logBase": 1,
  4301. "max": null,
  4302. "min": null,
  4303. "show": true
  4304. },
  4305. {
  4306. "format": "short",
  4307. "label": null,
  4308. "logBase": 1,
  4309. "max": null,
  4310. "min": null,
  4311. "show": true
  4312. }
  4313. ],
  4314. "yaxis": {
  4315. "align": false,
  4316. "alignLevel": null
  4317. }
  4318. },
  4319. {
  4320. "alert": {
  4321. "conditions": [
  4322. {
  4323. "evaluator": {
  4324. "params": [
  4325. 1
  4326. ],
  4327. "type": "gt"
  4328. },
  4329. "operator": {
  4330. "type": "and"
  4331. },
  4332. "query": {
  4333. "params": [
  4334. "A",
  4335. "1m",
  4336. "now"
  4337. ]
  4338. },
  4339. "reducer": {
  4340. "params": [],
  4341. "type": "avg"
  4342. },
  4343. "type": "query"
  4344. }
  4345. ],
  4346. "executionErrorState": "keep_state",
  4347. "frequency": "60s",
  4348. "handler": 1,
  4349. "name": "nacos_exception alert",
  4350. "noDataState": "ok",
  4351. "notifications": []
  4352. },
  4353. "aliasColors": {},
  4354. "bars": false,
  4355. "dashLength": 10,
  4356. "dashes": false,
  4357. "datasource": "Prometheus-hw-cce-prod",
  4358. "fieldConfig": {
  4359. "defaults": {
  4360. "custom": {}
  4361. },
  4362. "overrides": []
  4363. },
  4364. "fill": 1,
  4365. "fillGradient": 0,
  4366. "gridPos": {
  4367. "h": 5,
  4368. "w": 8,
  4369. "x": 0,
  4370. "y": 28
  4371. },
  4372. "hiddenSeries": false,
  4373. "id": 60,
  4374. "legend": {
  4375. "avg": false,
  4376. "current": false,
  4377. "max": false,
  4378. "min": false,
  4379. "show": true,
  4380. "total": false,
  4381. "values": false
  4382. },
  4383. "lines": true,
  4384. "linewidth": 1,
  4385. "links": [],
  4386. "nullPointMode": "null",
  4387. "options": {
  4388. "alertThreshold": true
  4389. },
  4390. "percentage": false,
  4391. "pluginVersion": "7.4.3",
  4392. "pointradius": 5,
  4393. "points": false,
  4394. "renderer": "flot",
  4395. "seriesOverrides": [],
  4396. "spaceLength": 10,
  4397. "stack": false,
  4398. "steppedLine": false,
  4399. "targets": [
  4400. {
  4401. "expr": "sum(rate(nacos_exception_total{name='nacos'}[1m])) * 60",
  4402. "format": "time_series",
  4403. "intervalFactor": 1,
  4404. "refId": "A"
  4405. }
  4406. ],
  4407. "thresholds": [
  4408. {
  4409. "colorMode": "critical",
  4410. "fill": true,
  4411. "line": true,
  4412. "op": "gt",
  4413. "value": 1,
  4414. "visible": true
  4415. }
  4416. ],
  4417. "timeFrom": null,
  4418. "timeRegions": [],
  4419. "timeShift": null,
  4420. "title": "nacos exception alert",
  4421. "tooltip": {
  4422. "shared": true,
  4423. "sort": 0,
  4424. "value_type": "individual"
  4425. },
  4426. "type": "graph",
  4427. "xaxis": {
  4428. "buckets": null,
  4429. "mode": "time",
  4430. "name": null,
  4431. "show": true,
  4432. "values": []
  4433. },
  4434. "yaxes": [
  4435. {
  4436. "format": "short",
  4437. "label": null,
  4438. "logBase": 1,
  4439. "max": null,
  4440. "min": null,
  4441. "show": true
  4442. },
  4443. {
  4444. "format": "short",
  4445. "label": null,
  4446. "logBase": 1,
  4447. "max": null,
  4448. "min": null,
  4449. "show": true
  4450. }
  4451. ],
  4452. "yaxis": {
  4453. "align": false,
  4454. "alignLevel": null
  4455. }
  4456. }
  4457. ],
  4458. "title": "nacos alert",
  4459. "type": "row"
  4460. }
  4461. ],
  4462. "refresh": "5s",
  4463. "schemaVersion": 27,
  4464. "style": "dark",
  4465. "tags": [],
  4466. "templating": {
  4467. "list": [
  4468. {
  4469. "allValue": ".*:8848",
  4470. "current": {
  4471. "selected": false,
  4472. "text": "All",
  4473. "value": "$__all"
  4474. },
  4475. "datasource": "Prometheus-hw-cce-prod",
  4476. "definition": "label_values(instance)",
  4477. "description": null,
  4478. "error": null,
  4479. "hide": 0,
  4480. "includeAll": true,
  4481. "label": "instance",
  4482. "multi": false,
  4483. "name": "instance",
  4484. "options": [],
  4485. "query": {
  4486. "query": "label_values(instance)",
  4487. "refId": "Prometheus-hw-cce-prod-instance-Variable-Query"
  4488. },
  4489. "refresh": 2,
  4490. "regex": "/.*:8848/",
  4491. "skipUrlSync": false,
  4492. "sort": 0,
  4493. "tagValuesQuery": "",
  4494. "tags": [],
  4495. "tagsQuery": "",
  4496. "type": "query",
  4497. "useTags": false
  4498. }
  4499. ]
  4500. },
  4501. "time": {
  4502. "from": "now-5m",
  4503. "to": "now"
  4504. },
  4505. "timepicker": {
  4506. "refresh_intervals": [
  4507. "5s",
  4508. "10s",
  4509. "30s",
  4510. "1m",
  4511. "5m",
  4512. "15m",
  4513. "30m",
  4514. "1h",
  4515. "2h",
  4516. "1d"
  4517. ],
  4518. "time_options": [
  4519. "5m",
  4520. "15m",
  4521. "1h",
  4522. "6h",
  4523. "12h",
  4524. "24h",
  4525. "2d",
  4526. "7d",
  4527. "30d"
  4528. ]
  4529. },
  4530. "timezone": "",
  4531. "title": "Nacos生产环境监控",
  4532. "uid": "Bz_QALEiz18",
  4533. "version": 8
  4534. }

image.png