1 下载
官网: https://prometheus.io/download/
wget https://github.com/prometheus/prometheus/releases/download/v2.22.2/prometheus-2.22.2.linux-amd64.tar.gz
2 解压安装
# Unpack into the directory that the systemd unit's ExecStart paths point at
# (/data01/prometheus-download/prometheus-2.22.2.linux-amd64/...).
mkdir -p /data01/prometheus-download
tar -zxf prometheus-2.22.2.linux-amd64.tar.gz -C /data01/prometheus-download
cd /data01/prometheus-download/prometheus-2.22.2.linux-amd64
3 写进systemd服务中
vim /usr/lib/systemd/system/prometheus.service

# systemd unit for the Prometheus server unpacked under /data01/prometheus-download.
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
# --web.enable-lifecycle turns on the HTTP management endpoints, which is what
# makes "curl -XPOST http://localhost:9090/-/reload" work.
ExecStart=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/prometheus \
    --config.file=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/prometheus.yml \
    --storage.tsdb.path=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/data \
    --web.enable-lifecycle
# SIGHUP makes Prometheus re-read its configuration and rule files.
ExecReload=/bin/kill -HUP $MAINPID
# Stop with SIGTERM rather than SIGKILL so Prometheus can shut down cleanly;
# systemd still escalates to SIGKILL if the stop timeout expires.
ExecStop=/bin/kill -TERM $MAINPID
KillMode=control-group
Restart=on-failure

[Install]
WantedBy=multi-user.target
4 启动
# Reference commands - run the one you need; this is not meant to be executed top to bottom.
systemctl daemon-reload        # make systemd pick up the new or edited unit file
systemctl enable prometheus    # start automatically at boot
systemctl start prometheus     # start the service now
systemctl stop prometheus      # stop the service
systemctl reload prometheus    # run the unit's ExecReload (reload config without restarting)
5 设置prometheus的配置
# prometheus.yml
# my global config
global:
  scrape_interval: 15s      # How often targets are scraped; the default is 1m.
  evaluation_interval: 15s  # How often rules are evaluated; the default is 1m.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:  # Address (ip:port) that Alertmanager listens on.
            - 127.0.0.1:9093

# Rule files are loaded at startup and on every configuration reload;
# the rules in them are evaluated every evaluation_interval.
rule_files:
  - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'   # Written into the `job` label, so it can be used in queries.
    scrape_interval: 15s     # Per-job scrape interval; falls back to the global value when omitted.
    static_configs:          # Static target list.
      - targets: ['127.0.0.1:9090']  # Address Prometheus scrapes; becomes the `instance` label.

  - job_name: 'node_export'  # node_exporter
    static_configs:
      - targets:
          - 127.0.0.1:9100

  - job_name: 'blackbox_http_2xx'
    scrape_interval: 45s
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    # Alternative: read the targets from a file instead of listing them here.
    # file_sd_configs:
    #   - refresh_interval: 1m
    #     files:
    #       - "/home/prometheus/conf/service_post.yml"
    static_configs:
      - targets:
          - https://www.baidu.com
    # Pass the configured target as the ?target= parameter, keep it as the
    # `instance` label, and send the actual scrape to the blackbox exporter.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115

  - job_name: "blackbox_telnet_port"
    scrape_interval: 5s
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets: [ '127.0.0.1:3306' ]
        labels:
          group: '数据库监控'
      - targets: [ '10.240.0.7:6379' ]
        labels:
          group: '中文官网redis监控'
      - targets: [ '10.240.0.7:6380' ]
        labels:
          group: '英文官网redis监控'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115

  - job_name: 'blackbox_ssl_expiry'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
          # Target to probe. The scheme is given explicitly: the blackbox http
          # prober assumes http:// for scheme-less targets, and the certificate
          # expiry metric is only available from a TLS connection.
          - https://www.baidu.com.cn
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115  # Blackbox exporter.
6 报警项设置
# first_rules.yml - alerting rules. The path of this file is configured under
# rule_files in prometheus.yml.
groups:
  - name: blackbox_network_stats
    rules:
      # Any blackbox probe that has been failing for a minute.
      - alert: blackbox_network_stats
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "This requires immediate action!"

  - name: ssl_expiry.rules
    rules:
      # Certificate expires in fewer than 30 days (expiry timestamp minus now, in days).
      - alert: SSLCertExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry{job="blackbox_ssl_expiry"} - time())/86400 < 30
        for: 10m
        labels:
          severity: warn
        annotations:
          summary: "ssl证书过期警告"
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'

  - name: 主机状态-监控告警
    rules:
      # Scrape target unreachable. The message states 1 minute to match `for`.
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "{{$labels.instance}}:服务器延时超过1分钟"

      # CPU busy percentage per instance = 100 - idle percentage.
      # The result only carries the `instance` label, so the annotations use
      # $labels.instance (there is no `mountpoint` label on this series).
      - alert: CPU使用情况
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于60%(目前使用:{{$value}}%)"

      # Memory used percentage = (1 - (free + buffers + cached) / total) * 100.
      - alert: 内存使用
        expr: (1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100 > 80
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于80%(目前使用:{{$value}}%)"

      # Disk IO utilisation per instance (fraction of time spent doing IO, in percent).
      # Fires when utilisation is above 60%, so $value is the utilisation itself.
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前使用:{{$value}}%)"

      # Outgoing (transmit) traffic per instance, virtual/loopback devices excluded.
      # Label regexes are fully anchored, hence `virbr.*` and `lo`.
      # NOTE(review): the `/ 100` divisor and the 102400 threshold are kept from the
      # original; confirm they express the intended "100M" limit in your units.
      - alert: 网络
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "{{$labels.instance}}流出网络带宽持续1分钟高于100M. TX带宽使用率{{$value}}"

      # Number of established TCP connections (a count, not a percentage).
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于1000(目前连接数:{{$value}})"

      # Used percentage per ext4/xfs filesystem; this series does carry `mountpoint`.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
        for: 1m
        labels:
          severity: warn
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
7 检查语法是否正确
#!/bin/bash
# Checks the Prometheus configuration file and rule files for errors with promtool.
# Adjust the namespace, pod, container and file paths below to your own deployment.
# (For a non-Kubernetes install, run the same `promtool check ...` commands
# directly on the host instead of through kubectl.)

# No -t/-i: the output is piped, so no TTY or stdin is needed.
# 2>&1 merges stderr into the pipe so that promtool's FAILED lines reach grep.
kubectl exec -n monitoring prometheus-stateful-0 -c prometheus-server -- \
    /bin/sh -c "cd /etc/prometheus/ && /bin/promtool check config prometheus.yml" 2>&1 \
    | grep -A 2 FAILED

# The rules/*.yaml glob sits inside the quoted command, so it is expanded by
# the shell inside the container, not on the local machine.
kubectl exec -n monitoring prometheus-stateful-0 -c prometheus-server -- \
    /bin/sh -c "cd /etc/prometheus/ && /bin/promtool check rules rules/*.yaml" 2>&1 \
    | grep -A 2 FAILED

echo
echo "========================================================================================="
echo "如果全部正确,没有返回值。"
echo "如果有报错,会返回错误的行数(指的是pod里面配置文件的对应的行数)或报错的原因"
echo "========================================================================================="
8 热重启
修改配置后,直接重启 prometheus 即可生效。如果部署 prometheus 时开启了热重启(启动参数 --web.enable-lifecycle),也可以不重启进程,
直接热重启(重新加载配置):curl -XPOST http://localhost:9090/-/reload
prometheus热重启
prometheus启动命令添加参数 --web.enable-lifecycle
