1 下载
官网: https://prometheus.io/download/
# Download the Prometheus v2.22.2 linux-amd64 release tarball from GitHub.
wget https://github.com/prometheus/prometheus/releases/download/v2.22.2/prometheus-2.22.2.linux-amd64.tar.gz
2 解压安装
# Unpack the release tarball and enter the extracted directory.
# The systemd unit in step 3 points at /data01/prometheus-download/prometheus-2.22.2.linux-amd64,
# so run this under /data01/prometheus-download (or adjust the paths in the unit).
tar zxf prometheus-2.22.2.linux-amd64.tar.gz
cd prometheus-2.22.2.linux-amd64
3 写进systemd服务中
vim /usr/lib/systemd/system/prometheus.service
# systemd unit that runs the Prometheus server from the extracted release directory.
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
# Binary, configuration file and TSDB data directory all live under the release directory.
ExecStart=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/prometheus --config.file=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/prometheus.yml --storage.tsdb.path=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/data
# SIGHUP asks Prometheus to re-read its configuration and rule files.
ExecReload=/bin/kill -HUP $MAINPID
# Stop with SIGTERM so the server can shut down gracefully. SIGKILL cannot be
# handled by the process, so it would be terminated mid-write.
ExecStop=/bin/kill -TERM $MAINPID
KillMode=control-group
Restart=on-failure

[Install]
WantedBy=multi-user.target
4 启动
# Service management commands; presumably each is run on its own as needed, not as one sequence.
# Register the unit so it starts at boot.
systemctl enable prometheus
# Start the service now.
systemctl start prometheus
# Stop the service.
systemctl stop prometheus
# Reload the running service (runs the unit's ExecReload command).
systemctl reload prometheus
5 设置prometheus的配置
prometheus.yml
# my global config
global:
  scrape_interval: 15s  # How often targets are scraped (Prometheus default: 1m).
  evaluation_interval: 15s  # How often rules are evaluated (Prometheus default: 1m).
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        # Address (ip:port) that Alertmanager listens on; Prometheus sends alerts here.
        - targets:
            - 127.0.0.1:9093

# Rule files to load; the rules in them are evaluated every evaluation_interval.
rule_files:
  - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: Prometheus itself, the node exporter, and several
# probe jobs that go through the blackbox exporter at 127.0.0.1:9115.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'  # Stored in the `job` label, so it can be used in queries.
    scrape_interval: 15s  # Per-job scrape period; falls back to the global value when omitted.
    static_configs:  # Statically listed targets.
      - targets: ['127.0.0.1:9090']  # Address Prometheus scrapes; becomes the `instance` label.

  # Host metrics from the node exporter.
  - job_name: 'node_export'
    static_configs:
      - targets:
          - 127.0.0.1:9100

  # HTTP availability probe through the blackbox exporter.
  - job_name: 'blackbox_http_2xx'
    scrape_interval: 45s
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    # Alternative: read the targets from a file instead of listing them here.
    # file_sd_configs:
    #   - refresh_interval: 1m
    #     files:
    #       - "/home/prometheus/conf/service_post.yml"
    static_configs:
      - targets:
          - https://www.baidu.com
    relabel_configs:
      # Pass the listed target to the exporter as the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed target as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the blackbox exporter.
      - target_label: __address__
        replacement: 127.0.0.1:9115

  # TCP port reachability probe through the blackbox exporter.
  - job_name: "blackbox_telnet_port"
    scrape_interval: 5s
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets: ['127.0.0.1:3306']
        labels:
          group: '数据库监控'
      - targets: ['10.240.0.7:6379']
        labels:
          group: '中文官网redis监控'
      - targets: ['10.240.0.7:6380']
        labels:
          group: '英文官网redis监控'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115

  # TLS certificate expiry probe; feeds the SSLCertExpiringSoon alert rule.
  - job_name: 'blackbox_ssl_expiry'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
          # Target to probe. The https:// scheme is explicit because the blackbox
          # exporter probes scheme-less targets over plain HTTP, which yields no
          # certificate data. NOTE(review): confirm the domain name is the intended one.
          - https://www.baidu.com.cn
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115  # Blackbox exporter.
6 报警项设置
first_rules.yml #报警规则配置文件,其路径在prometheus.yml文件的rule_files中配置
# Alerting rules loaded by Prometheus through `rule_files`.
groups:
  # Fires when any blackbox probe reports failure.
  - name: blackbox_network_stats
    rules:
      - alert: blackbox_network_stats
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "This requires immediate action!"

  # Fires when a probed certificate expires in fewer than 30 days.
  - name: ssl_expiry.rules
    rules:
      - alert: SSLCertExpiringSoon
        # Seconds until expiry converted to days.
        expr: (probe_ssl_earliest_cert_expiry{job="blackbox_ssl_expiry"} - time())/86400 < 30
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: "ssl证书过期警告"
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'

  # Host-level alerts. Annotations use the `instance` label, except for the
  # disk alert, whose filesystem series carry a `mountpoint` label.
  - name: 主机状态-监控告警
    rules:
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          # Duration matches `for: 1m` above.
          description: "{{$labels.instance}}:服务器延时超过1分钟"

      - alert: CPU使用情况
        # 100 minus the average idle percentage per instance = CPU busy percentage.
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于60%(目前使用:{{$value}}%)"

      - alert: 内存使用
        # Used % = (1 - (free + buffers + cached) / total) * 100.
        expr: (1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100 > 80
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于80%(目前使用:{{$value}}%)"

      - alert: IO性能
        # Average share of time the disks were busy, as a percentage, per instance.
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前使用:{{$value}})"

      - alert: 网络
        # Label regexes are fully anchored, so `virbr.*` is needed to exclude virbr0.
        # NOTE(review): the `/ 100 > 102400` threshold is kept from the original;
        # confirm it matches the intended "100M" limit. The message below says
        # "2 minutes" while `for` is 1m.
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "{{$labels.instance}}流出网络带宽持续2分钟高于100M. TX带宽使用率{{$value}}"

      - alert: TCP会话
        # Count of established TCP connections; this is a number, not a percentage.
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于1000(目前连接数:{{$value}})"

      - alert: 磁盘容量
        # Used percentage of each ext4/xfs filesystem.
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 80
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
7 检查语法是否正确
#!/bin/bash
# Checks the Prometheus configuration file and the rule files for errors.
# Adjust the namespace, pod, container and file paths below to match your own deployment.
#
# kubectl exec runs without -t/-i: no TTY is needed for a non-interactive check,
# and requesting one breaks when the script is not started from a terminal.
# 2>&1 merges stderr into the pipe so diagnostics written there also reach grep.
kubectl exec -n monitoring prometheus-stateful-0 -c prometheus-server -- /bin/sh -c "cd /etc/prometheus/ && /bin/promtool check config prometheus.yml" 2>&1 | grep -A 2 FAILED
# The rules/*.yaml glob is inside the quoted command, so it is expanded by the shell in the pod.
kubectl exec -n monitoring prometheus-stateful-0 -c prometheus-server -- /bin/sh -c "cd /etc/prometheus/ && /bin/promtool check rules rules/*.yaml" 2>&1 | grep -A 2 FAILED
echo
echo "========================================================================================="
echo "如果全部正确,没有返回值。"
echo "如果有报错,会返回错误的行数(指的是pod里面配置文件的对应的行数)或报错的原因"
echo "========================================================================================="
8 热重启
修改配置后直接热重启prometheus即可,无需停止服务,因为部署prometheus时已开启热重启。
然后热重启:curl -XPOST http://localhost:9090/-/reload
prometheus热重启
prometheus启动命令添加参数 --web.enable-lifecycle(开启后才能通过 /-/reload 接口热重启)