0、将需要安装的node打标签,多node节点需要逐个打标签,这样pod才会被调度到这些节点上

situadmin@k8s-monitor-slave-1:~$ kubectl get nodes
NAME STATUS ROLES AGE VERSION
k8s-monitor-slave-1 Ready master 82m v1.15.3

kubectl label nodes k8s-monitor-slave-1 role=monitor

1、安装blackbox-exporter(是个第三方插件,支持自定义tcp、http、icmp数据监控), 配置文件目录:/home/situadmin/zhuq/monitor/00_blackbox

root@ali-ubuntu:~/zhuq/monitor# tree ./00_blackbox/
./00_blackbox/
├── blackbox-exporter-configmap.yaml
└── blackbox-exporter-deployment.yaml
# kubectl create -f blackbox-exporter-configmap.yaml
# kubectl create -f blackbox-exporter-deployment.yaml

2、安装sql-exporter(不监控mysql,不需要安装), 目录: 01_sql-exoprter

2.1、 阿里云RDS创建完成,开通RDS访问权限,安全组等互通配置。使用以下sql,创建触发器

root@ali-ubuntu:~/zhuq/monitor/01_sql-exoprter# tree sqlTrigger/
sqlTrigger/
├── updateRequestSecondsCountMetrics.sql
└── updateRequestSecondsSumMetrics.sql

2.2、 部署服务,先安装configmap,再安装bundle包

2.2.1 修改sql-exporter-configmap.yaml文件中的datasource配置项:将data_source_name中的用户、密码、数据库实例的内网域名替换为RDS的实际配置
data_source_name: 'mysql://chagemeUser:chagemePass@tcp(chagemeRDSurl)/face_saas_log'
# kubectl create -f sql-exporter-configmap.yaml
# kubectl create -f sql-exporter-deployment.yaml

3、node_exporter监控宿主机(可选),如使用阿里云ECS监控,可以忽略

援引文档: https://github.com/coreos/prometheus-operator/blob/1b016520b9f1899f0973c7c0beb5acaaef2a415a/Documentation/additional-scrape-config.md
目录:
root@ali-ubuntu:~/zhuq/monitor/02_prometheus# tree ./
./
├── additional-scrape-configs
│ ├── additional-scrape-configs.yaml
│ └── prometheus-additional.yaml
├── alertmanger
│ └── alertmanager.yaml
├── node_exporter
│ ├── node_exporter-0.18.1.linux-amd64.tar.gz
│ ├── node_exporter.service
│ └── sysconfig.node_exporter
└── prometheusrules
└── facesaas-prometheus-operator-rule.yaml

3.0 在被监控主机上安装node-exporter暴露metrics(link: https://github.com/prometheus/node_exporter)

目录:
root@ali-ubuntu:~/zhuq/monitor/02_prometheus# tree ./node_exporter/
./node_exporter/
├── node_exporter-0.18.1.linux-amd64.tar.gz
├── node_exporter.service
└── sysconfig.node_exporter
# groupadd node_exporter&&useradd node_exporter -g node_exporter -s /sbin/nologin
# tar zxvf node_exporter-0.18.1.linux-amd64.tar.gz&& cp node_exporter-0.18.1.linux-amd64/node_exporter /bin/&&chmod +x /bin/node_exporter
# mkdir -p /etc/sysconfig&&cp sysconfig.node_exporter /etc/sysconfig/node_exporter
# cp node_exporter.service /etc/systemd/system/node_exporter.service
# systemctl start node_exporter

3.1 根据需要修改prometheus-additional.yaml,按需添加要监控的节点

kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional.yaml --dry-run -oyaml > additional-scrape-configs.yaml
# kubectl -n monitor create -f additional-scrape-configs.yaml

把node节点ip加到这个文件(note: 需要修改secret中的labels,将prometheus的crd中的label贴过来)
# vim zhuq/monitor/02_prometheus/additional-scrape-configs/prometheus-additional.yaml 把里面node节点ip补全
image2019-9-20_22-11-57.png
将这个文件内容转换成base64格式
# cat zhuq/monitor/02_prometheus/additional-scrape-configs/prometheus-additional.yaml |base64 -w 0;echo
将生成的base64值复制到下面这个文件
# kubectl -n monitor edit secrets additional-scrape-configs
反验证base64值是否正确
# echo LSBqb2JfbmFtZTogbm9kZS1leHBvcnRlcgogIGhvbm9yX3RpbWVzdGFtcHM6IHRydWUKICBzY3JhcGVfaW50ZXJ2YWw6IDMwcwogIHNjcmFwZV90aW1lb3V0OiAzMHMKICBtZXRyaWNzX3BhdGg6IC9tZXRyaWNzCiAgc2NoZW1lOiBodHRwCiAgc3RhdGljX2NvbmZpZ3M6CiAgLSB0YXJnZXRzOgogICAgLSAxOTIuMTY4LjEuMjM5Ojkx9wb3J0CiAgaG9ub3JfdGltZXN0YW1wczogdHJ1ZQogIHBhcmFtczoKICAgIG1vZHVsZToKICAgIC0gdGNwX2Nvbm5lY3QKICBzY3JhcGVfaW50ZXJ2YWw6IDMwcwogIHNjcmYXRpY19jb25maWdzOgogIC0gdGFyZ2V0czoKICAgIC0gNTguMjE1LjEyMC42Njo0MDAwMgogICAgbGFiZWxzOgogICAgICBncm91cDogc2VjcmV0LTQwMDAyCiAgLSB0YXJnFiZWxzOgogICAgICBncm91cDogb2NyCiAgLSB0YXJnZXRzOgogICAgLSAxOTIuMTY4LjEuMjQxOjk1MDAKICAgIC0gMTkyLjE2OC4xLjg6OTUwMAogICAgbGFiZWxzOgogICOTIuMTY4LjEuOTo5NTAwCiAgICBsYWJlbHM6CiAgICAgIGdyb3VwOiBjcAogIC0gdGFyZ2V0czoKICAgIC0gMTkyLjE2OC4xLjIzo5NTAzCiAgICAtIDE5Mi4xNjguMS42E5Mi4xNjguMS4yNDM6MTg5ODkKICAgIC0gMTkyLjE2OC4xLjEwOjE4OTg5CiAgICBsYWJlbHM6CiAgICAgIGdyb3VwOiB3ZWIKICAtIHRhcmdldHM6CiAgICAtIHItYnAxamICBncm91cDogcmVkaXMKICAtIHRhcmdldHM6CiAgICAtIHJtLWJwMTc0cDA2Nzl2NDBsbnpsLm15c3FsLnJkcy5hbGl5dW5jcy5jb206MzMwNgogICAgLSBybS1icDFrMzM4JvdXA6IG15c3FsCiAgLSB0YXJnZXRzOgogICAgLSAxOTIuMTY4LjEuMjUwOjkwOTIKICAgIC0gMTkyLjE2OC4xLjI1MTo5MDkyCiAgICAtIDE5Mi4xNjguMS4yNTI6OTA5MgLmt1YmUtc3lzdGVtLnN2Yzo1MwogICAgbGFiZWxzOgogICAgICBncm91cDogY29yZWRucwogIHJlbGFiZWxfY29uZmlnczoKICAtIHNvdXJjZV9sYWJlbHM6IFtfX2FkZHJl9wYXJhbV90YXJnZXQKICAgIHJlcGxhY2VtZW50OiAkMQogICAgYWN0aW9uOiByZXBsYWNlCiAgLSBzb3VyY2VfbGFiZWxzOiBbX19wYXJhbV90YXJnZXRdCiAgICBzZXBhcmYWNlbWVudDogJDEKICAgIGFjdGlvbjogcmVwbGFjZQogIC0gc2VwYXJhdG9yOiA7CiAgICByZWdleDogKC4qKQogICAgdGFyZ2V0X2xhYmVsOiBfX2FkZHJlc3NfXwogICAg | base64 -d

3.2 修改crd prometheuses,建议按照官网文档,追加在serviceMonitorSelector下面。

在 Kubernetes 中一切都可视为资源,Kubernetes 1.7 之后增加了对 CRD 自定义资源二次开发能力来扩展 Kubernetes API,通过 CRD 我们可以向 Kubernetes API 中增加新资源类型,而不需要修改 Kubernetes 源码来创建自定义的 API server,该功能大大提高了 Kubernetes 的扩展能力。
当你创建一个新的CustomResourceDefinition (CRD)时,Kubernetes API服务器将为你指定的每个版本创建一个新的RESTful资源路径,我们可以根据该api路径来创建一些我们自己定义的类型资源。CRD可以是命名空间的,也可以是集群范围的,由CRD的作用域(scope)字段指定;与现有的内置对象一样,删除名称空间将删除该名称空间中的所有自定义对象。customresourcedefinition本身没有名称空间,所有名称空间都可以使用。
按照官方的方法修改crd https://github.com/coreos/prometheus-operator/blob/1b016520b9f1899f0973c7c0beb5acaaef2a415a/Documentation/additional-scrape-config.md
# kubectl -n monitor edit prometheuses -oyaml

  additionalScrapeConfigs:
    name: additional-scrape-configs
    key: prometheus-additional.yaml

image2019-9-20_20-59-42.png
# kubectl -n monitor get prometheuses.monitoring.coreos.com -oyaml

3.3 增加业务告警facesaas-prometheus-operator-rule.yaml

kubectl apply -f prometheusrules/facesaas-prometheus-operator-rule.yaml

  - alert: 获取CPU 大于百分之80 触发
    annotations:
      summary: CPU大于百分之80告警
    expr: 100 - (100 - (avg (rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)* 100))
      > 80
    for: 30m
    labels:
      group: app
      severity: critical

4:增加告警通道 邮件

4.1 根据需要修改 alertmanager.yaml

4.2 替换secret alertmanager-prometheus-prometheus-oper-alertmanager 中的data,记得先做base64编码(base64是编码,不是加密)

history:
398 cat alertmanager.yaml |base64 -w 0;echo
399 kubectl -n monitor edit secrets alertmanager-prometheus-prometheus-oper-alertmanager
# kubectl -n monitor get secrets alertmanager-prometheus-prometheus-oper-alertmanager

重新加载 prometheus
# 常用脚本:
root@ali-ubuntu:~/zhuq/monitor# tree script/
script/
├── reloadAlertmanager.sh
└── reloadPromethues.sh

./zhuq/monitor/script/reloadAlertmanager.sh
# ./zhuq/monitor/script/reloadPromethues.sh