前面三篇文章,我们已经安装了prometheus,node_exporter,blackbox_exporter,alertmanager,接下来利用已经安装的组件和exporter完成对主机资源,URL,SSL证书三个方面的指标监控。
已经完成安装的四个组件的目录结构如下:
tree -L 2 /usr/local/monitor/
/usr/local/monitor/
├── alertmanager-0.22.2.linux-amd64
│ ├── alertmanager
│ ├── alertmanager.yml
│ ├── amtool
│ ├── LICENSE
│ └── NOTICE
├── blackbox_exporter-0.19.0.linux-amd64
│ ├── blackbox_exporter
│ ├── blackbox.yml
│ ├── LICENSE
│ └── NOTICE
├── node_exporter-1.1.2.linux-amd64
│ ├── LICENSE
│ ├── node_exporter
│ ├── nohup.out
│ └── NOTICE
└── prometheus-2.27.1.linux-amd64
  ├── console_libraries
  ├── consoles
  ├── data
  ├── LICENSE
  ├── nohup.out
  ├── NOTICE
  ├── prometheus
  ├── prometheus.yml
  └── promtool
接下来我们在 prometheus-2.27.1.linux-amd64 目录中新建一个 rules 文件夹,用来存放所有的告警规则文件。
一、创建监控规则文件
本部分内容为服务端操作。
ref:https://awesome-prometheus-alerts.grep.to/rules
#1、创建rules目录
mkdir /usr/local/monitor/prometheus-2.27.1.linux-amd64/rules -p
#2、监控主机CPU,内存,硬盘等硬件资源信息
$ cd /usr/local/monitor/prometheus-2.27.1.linux-amd64/rules
$ vim host-alerts.yml
# Alerting rules for host-level resources, built on node_exporter metrics
# (node_*) and the built-in `up` series.
# Every rule waits 1 minute (`for: 1m`) before firing and carries an
# `at_mobiles` label, quoted so the number stays a string.
groups:
  - name: host-alerts
    rules:
      # Fires when less than 3% of total memory is still available.
      # No `instance=~"$instance"` matcher is used: `$instance` is a
      # dashboard variable that rule files do not substitute, and as a regex
      # it can never match, so the alert would stay silent forever.
      # MemAvailable is used so the value matches the "available memory"
      # wording of the message.
      - alert: LowMemory
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 3
        for: 1m
        labels:
          severity: warning
          at_mobiles: "10086"
        annotations:
          text: '{{ $labels.instance }} has had low available memory for more than 1 minute.'

      # Fires when a scrape target has been unreachable (up == 0).
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          at_mobiles: "10086"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 1 minute."

      # Fires when non-idle CPU time, averaged over all cores and summed over
      # the non-idle modes, exceeds 80% per instance.
      - alert: hostCpuUsageAlert
        expr: sum by (instance) (avg without (cpu) (irate(node_cpu_seconds_total{mode!="idle"}[5m]))) > 0.80
        for: 1m
        labels:
          severity: warning
          at_mobiles: "10086"
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 80% (current value: {{ $value }})"

      # Fires when used memory (total minus available) exceeds 80% of total.
      - alert: hostMemUsageAlert
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.80
        for: 1m
        labels:
          severity: warning
          at_mobiles: "10086"
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 80% (current value: {{ $value }})"

      # Fires when a filesystem is more than 80% full. rootfs, tmpfs and ext3
      # filesystems are excluded by the fstype matcher.
      # NOTE(review): excluding ext3 looks unusual - confirm it is intended.
      # The messages state the same 80% threshold the expression uses.
      - alert: hostFileSystemUsageAlert
        expr: 100 - (node_filesystem_free_bytes{fstype!~"rootfs|tmpfs|ext3"} / node_filesystem_size_bytes{fstype!~"rootfs|tmpfs|ext3"} * 100) > 80
        for: 1m
        labels:
          severity: warning
          at_mobiles: "10086"
        annotations:
          summary: "Instance {{ $labels.instance }} filesystem usage high"
          description: "{{ $labels.instance }} filesystem {{ $labels.mountpoint }} usage above 80% (current value: {{ $value }})"
#3、监控内网或者外网接口
$ vim URL-alerts.yml
# Alerting rule for probed HTTP endpoints.
groups:
  - name: url-alerts
    rules:
      # Fires when a probe reports failure (probe_success == 0) and the
      # condition has held for the `for` duration below. The message states
      # the same 3-second duration as `for`.
      - alert: EndpointDown
        expr: probe_success == 0
        for: 3s
        labels:
          severity: "critical"
          at_mobiles: "10086"
        annotations:
          text: '{{ $labels.instance }} has been down for more than 3 seconds.'
#4、监控SSL证书过期时间
$ vim ssl-alerts.yml
# Alerting rule for TLS certificate expiry on probed endpoints.
groups:
  - name: ssl-alerts
    rules:
      # Fires when the earliest certificate expiry reported by the
      # "blackbox" job is less than 3 days (86400 s * 3) away, and that has
      # been true for 10 minutes.
      - alert: SSLCertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 3
        for: 10m
        labels:
          at_mobiles: "10086"
          severity: critical
        annotations:
          # Alternative English wording, kept disabled:
          # text: '{{ $labels.instance }} certificate will expired in 30 days.'
          text: "{{ $labels.instance }} 证书过期不足3天."
二、修改prometheus主配置文件
编辑配置文件 /usr/local/monitor/prometheus-2.27.1.linux-amd64/prometheus.yml ,将上面第一部分创建的 rules 规则文件加载进主配置文件,并将需要监控的主机 IP、域名或接口地址加入配置中即可。完整的配置文件如下:
# Main Prometheus configuration.
global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).

# Alertmanager instances that receive alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Added for this setup; the stock placeholder was
            # "alertmanager:9093".
            - 'localhost:9093'

# Rule files are loaded once and evaluated periodically according to the
# global 'evaluation_interval'. Every *.yml file in the rules directory is
# picked up.
rule_files:
  - /usr/local/monitor/prometheus-2.27.1.linux-amd64/rules/*.yml

scrape_configs:
  # ---------------- Host hardware monitoring ----------------
  # The job name is added as a label `job=<job_name>` to every time series
  # scraped from that job.

  # Prometheus scraping itself.
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'
        labels:
          instance: prometheus

  # Host metrics endpoint on port 9100.
  - job_name: hz-p-inner
    static_configs:
      - targets:
          - '172.17.3.194:9100'
        labels:
          instance: hz-p-inner

  # ---------------- SSL certificate expiry ----------------
  # Probes the listed URLs through /probe with the http_2xx module.
  - job_name: blackbox
    metrics_path: /probe
    params:
      module:
        - http_2xx
    static_configs:
      - targets:
          - https://p.coh.123.com
          - http://p.test.123.com/wechat
    relabel_configs:
      # Copy the listed URL into __param_target.
      - source_labels:
          - __address__
        target_label: __param_target
      # Keep the probed URL as the `instance` label.
      - source_labels:
          - __param_target
        target_label: instance
      # Point the actual scrape at 127.0.0.1:9115
      # (presumably the local blackbox_exporter - confirm).
      - target_label: __address__
        replacement: '127.0.0.1:9115'

  # ---------------- HTTP 200 endpoint monitoring ----------------
  # Probes the listed URLs through /probe with the http_post_2xx module.
  - job_name: blackbox_http_2xx_post
    metrics_path: /probe
    params:
      module:
        - http_post_2xx
    static_configs:
      - targets:
          - https://www.123.com/appfi/receive
          - http://coa.123.com/mini/ver_code
    relabel_configs:
      # Copy the listed URL into __param_target.
      - source_labels:
          - __address__
        target_label: __param_target
      # Keep the probed URL as the `instance` label.
      - source_labels:
          - __param_target
        target_label: instance
      # Point the actual scrape at 127.0.0.1:9115
      # (presumably the local blackbox_exporter - confirm).
      - target_label: __address__
        replacement: '127.0.0.1:9115'