1. 配置并启动 alertmanager
## 创建 alertmanager 数据目录
mkdir -p /data/alertmanager/
# Give the directory to uid/gid 65534 ("nobody"), which the official
# alertmanager image runs as. Avoid the original world-writable 777:
# any local user could tamper with alert state/silences.
chown -R 65534:65534 /data/alertmanager
## 编辑 alertmanager 配置文件
# 配置的详细说明,见官方文档: https://prometheus.io/docs/alerting/configuration/
# 请参考附件1中的配置模板编写配置文件: /etc/alertmanager/config.yml
## docker 启动 alertmanager
# Start alertmanager in docker:
#   -p 9093:9093  expose the alertmanager web UI / API port
#   the config file and data directory are bind-mounted from the host
#   paths prepared above; --restart=always survives daemon restarts.
# The trailing flags after the image name are passed to the alertmanager
# binary itself (config path + storage path inside the container).
docker run -d -p 9093:9093 \
-v /etc/alertmanager/config.yml:/etc/alertmanager/config.yml \
-v /data/alertmanager:/data/alertmanager \
--name alertmanager \
--restart=always \
quay.io/prometheus/alertmanager \
--config.file=/etc/alertmanager/config.yml \
--storage.path=/data/alertmanager
2. prometheus 中加入alertmanager 配置
在 prometheus 的配置文件中加入如下配置:
# Alertmanager integration (top-level stanza in prometheus.yml).
# NOTE: the original snippet was flattened — `alertmanagers:` must be
# nested under `alerting:`, otherwise Prometheus rejects the config.
alerting:
  alertmanagers:
    - static_configs:
        # ip:port that alertmanager listens on (the 9093 port mapped above)
        - targets: ["localhost:9093"]
# 重启 prometheus
docker restart prometheus
3. prometheus 中配置rules
1 在 prometheus 配置文件中加入 alert rules 配置
# Alerting rule files loaded by Prometheus (glob expanded on start/reload)
rule_files:
  - "/etc/prometheus/rules/*.yml"
2 加入两组告警规则:vim /etc/prometheus/rules/testAlert.yml
groups:
  # Service availability alerts.
  - name: ServiceStatus  # rule-group name
    rules:
      - alert: ServiceStatusAlert  # name of the individual rule
        # `up` is 1 when the scrape target is reachable, 0 when it is down.
        expr: up == 0
        for: 10s  # the condition must hold this long before the alert fires
        labels:
          project: zhidaoAPP  # custom label attached to the alert
        annotations:  # alert body
          summary: "Instance {{ $labels.instance }} down"
          # Fixed: original text said "more than 1 minutes" while `for` is 10s.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 seconds."

  # Host resource alerts.
  # NOTE(review): node_cpu / node_memory_MemTotal / node_memory_MemAvailable
  # are the metric names of node_exporter < 0.16; newer exporters expose
  # node_cpu_seconds_total / node_memory_*_bytes — confirm the exporter
  # version in use before deploying these expressions.
  - name: hostStatsAlert
    rules:
      - alert: hostCpuUsageAlert
        expr: sum(avg without (cpu)(irate(node_cpu{mode!='idle'}[5m]))) by (instance) > 0.85
        for: 1m
        labels:
          severity: page
        annotations:
          # Fixed typo in summary: "usgae" -> "usage".
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
      - alert: hostMemUsageAlert
        expr: (node_memory_MemTotal - node_memory_MemAvailable)/node_memory_MemTotal > 0.85
        for: 1m
        labels:
          severity: page
        annotations:
          # Fixed typo in summary: "usgae" -> "usage".
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})"
3 重启 prometheus
docker restart prometheus
查看 prometheus 中的告警规则
http://{YOU_prometheus_IP}:9090/alerts
查看alert 信息
http://{YOU_alertmanager_IP}:9093/#/alerts
4.增加钉钉告警功能
启动dingtalk 插件
# Start the prometheus-webhook-dingtalk bridge on port 8060.
# SECURITY: never publish a real robot access_token in documentation or
# version control — the original line leaked one. Use placeholders and
# inject the real token at deploy time.
docker run -d --name dingtalk \
-p 8060:8060 \
--restart=always \
docker.io/timonwong/prometheus-webhook-dingtalk \
--ding.profile="ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxx" \
--ding.profile="webhook2=https://oapi.dingtalk.com/robot/send?access_token=yyyyyyyyyyy" # multiple profiles may be configured
配置 alertmanager:在其 receivers 中增加名为 webhook 的接收器,url 指向 dingtalk 插件的 8060 端口(完整示例见附件 1 的配置模板)
参考文档: https://prometheus.io/docs/alerting/configuration/
附件:
1. Alertmanager 配置模板
配置示例: https://raw.githubusercontent.com/prometheus/alertmanager/master/doc/examples/simple.yml
vim /etc/alertmanager/config.yml
global:
  # SMTP settings for email notifications.
  # For enterprise (163) mailboxes use port 587 (STARTTLS); the SSL
  # port 465 tends to fail here. (Original comment said "456" — typo.)
  smtp_smarthost: 'smtp.ym.163.com:587'
  smtp_from: 'alert@xxx.com'
  smtp_auth_username: 'alert@xxx.com'
  smtp_auth_identity: 'alert@xxx.com'
  smtp_auth_password: 'XXXXXXX'  # do not commit real credentials

route:
  # Default receiver for anything not matched by a child route below.
  receiver: 'default'
  group_wait: 30s       # wait before the first notification of a new group
  group_interval: 1m    # wait before notifying about new alerts in a group
  repeat_interval: 4h   # wait before re-sending an already-sent notification
  # Fixed typo: was 'claster' — alerts would have been grouped on a
  # nonexistent label.
  group_by: ['cluster', 'alertname']
  routes:
    - receiver: webhook
      group_wait: 10s
      match:  # use match_re instead for regex matching
        # label match that routes these alerts to the webhook receiver
        alertname: ServiceStatusAlert

receivers:
  - name: default
    email_configs:
      - to: '152xxxx8332@163.com'
        send_resolved: true
  # webhook — DingTalk bridge started above on port 8060
  - name: webhook
    webhook_configs:
      - url: 'http://{prometheus-webhook-dingtalk_IP}:8060/dingtalk/ops_dingding/send'