1 下载

官网: https://prometheus.io/download/
wget https://github.com/prometheus/prometheus/releases/download/v2.22.2/prometheus-2.22.2.linux-amd64.tar.gz

2 解压安装

tar zxf prometheus-2.22.2.linux-amd64.tar.gz
cd prometheus-2.22.2.linux-amd64

3 写进systemd服务中

  1. vim /usr/lib/systemd/system/prometheus.service
  2. [Unit]
  3. Description=Prometheus
  4. Documentation=https://prometheus.io/
  5. After=network.target
  6. [Service]
  7. Type=simple
  8. ExecStart=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/prometheus --config.file=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/prometheus.yml --storage.tsdb.path=/data01/prometheus-download/prometheus-2.22.2.linux-amd64/data
  9. ExecReload=/bin/kill -HUP $MAINPID
  10. ExecStop=/bin/kill -KILL $MAINPID
  11. KillMode=control-group
  12. Restart=on-failure
  13. [Install]
  14. WantedBy=multi-user.target

4 启动

  1. systemctl enable prometheus
  2. systemctl start prometheus
  3. systemctl stop prometheus
  4. systemctl reload prometheus

5 设置prometheus的配置

  1. prometheus.yml
  2. # my global config
  3. global:
  4. scrape_interval: 15s # 设定抓取数据的周期,默认为1min
  5. evaluation_interval: 15s # 设定更新rules文件的周期,默认为1min
  6. # scrape_timeout is set to the global default (10s).
  7. # Alertmanager configuration
  8. alerting:
  9. alertmanagers:
  10. - static_configs:
  11. - targets: # 设定alertmanager和prometheus交互的接口,即alertmanager监听的ip地址和端口
  12. - 127.0.0.1:9093
  13. # rule配置,首次读取默认加载,之后根据evaluation_interval设定的周期加载
  14. rule_files:
  15. - "first_rules.yml"
  16. # - "second_rules.yml"
  17. # A scrape configuration containing exactly one endpoint to scrape:
  18. # Here it's Prometheus itself.
  19. scrape_configs:
  20. # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  21. - job_name: 'prometheus' # job_name默认写入timeseries的labels中,可以用于查询使用
  22. scrape_interval: 15s # 抓取周期,默认采用global配置
  23. static_configs: # 静态配置
  24. - targets: ['127.0.0.1:9090'] # prometheus所要抓取数据的地址,即instance实例项
  25. - job_name: 'node_export' #node_export配置
  26. static_configs:
  27. - targets:
  28. - 127.0.0.1:9100
  29. - job_name: 'blackbox_http_2xx'
  30. scrape_interval: 45s
  31. metrics_path: /probe
  32. params:
  33. module: [http_2xx] # Look for an HTTP 200 response.
  34. ###配置文件
  35. # file_sd_configs:
  36. # - refresh_interval: 1m
  37. # files:
  38. # - "/home/prometheus/conf/service_post.yml"
  39. static_configs:
  40. - targets:
  41. - https://www.baidu.com
  42. relabel_configs:
  43. - source_labels: [__address__]
  44. target_label: __param_target
  45. - source_labels: [__param_target]
  46. target_label: instance
  47. - target_label: __address__
  48. replacement: 127.0.0.1:9115
  49. - job_name: "blackbox_telnet_port"
  50. scrape_interval: 5s
  51. metrics_path: /probe
  52. params:
  53. module: [tcp_connect]
  54. static_configs:
  55. - targets: [ '127.0.0.1:3306' ]
  56. labels:
  57. group: '数据库监控'
  58. - targets: [ '10.240.0.7:6379' ]
  59. labels:
  60. group: '中文官网redis监控'
  61. - targets: [ '10.240.0.7:6380' ]
  62. labels:
  63. group: '英文官网redis监控'
  64. relabel_configs:
  65. - source_labels: [__address__]
  66. target_label: __param_target
  67. - source_labels: [__param_target]
  68. target_label: instance
  69. - target_label: __address__
  70. replacement: 127.0.0.1:9115
  71. - job_name: 'blackbox_ssl_expiry'
  72. metrics_path: /probe
  73. params:
  74. module: [http_2xx] # Look for an HTTP 200 response.
  75. static_configs:
  76. - targets:
  77. - www.baidu.com.cn # Target to probe
  78. relabel_configs:
  79. - source_labels: [__address__]
  80. target_label: __param_target
  81. - source_labels: [__param_target]
  82. target_label: instance
  83. - target_label: __address__
  84. replacement: 127.0.0.1:9115 # Blackbox exporter.

6 报警项设置

  1. first_rules.yml #报警项配置文件,和路径是在prometheus.yml文件中配置
  2. groups:
  3. - name: blackbox_network_stats
  4. rules:
  5. - alert: blackbox_network_stats
  6. expr: probe_success == 0
  7. for: 1m
  8. labels:
  9. severity: critical
  10. annotations:
  11. summary: "Instance {{ $labels.instance }} is down"
  12. description: "This requires immediate action!"
  13. - name: ssl_expiry.rules
  14. rules:
  15. - alert: SSLCertExpiringSoon
  16. expr: (probe_ssl_earliest_cert_expiry{job="blackbox_ssl_expiry"} - time())/86400 < 30
  17. for: 10m
  18. labels:
  19. severity: warn
  20. annotations:
  21. summary: "ssl证书过期警告"
  22. description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
  23. - name: 主机状态-监控告警
  24. rules:
  25. - alert: 主机状态
  26. expr: up == 0
  27. for: 1m
  28. labels:
  29. severity: critical
  30. annotations:
  31. summary: "{{$labels.instance}}:服务器宕机"
  32. description: "{{$labels.instance}}:服务器延时超过5分钟"
  33. - alert: CPU使用情况
  34. expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
  35. for: 1m
  36. labels:
  37. severity: warn
  38. annotations:
  39. summary: "{{$labels.mountpoint}} CPU使用率过高!"
  40. description: "{{$labels.mountpoint }} CPU使用大于60%(目前使用:{{$value}}%)"
  41. - alert: 内存使用
  42. expr: 100 -(node_memory_MemTotal_bytes -node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes ) / node_memory_MemTotal_bytes * 100> 80
  43. for: 1m
  44. labels:
  45. severity: warn
  46. annotations:
  47. summary: "{{$labels.mountpoint}} 内存使用率过高!"
  48. description: "{{$labels.mountpoint }} 内存使用大于80%(目前使用:{{$value}}%)"
  49. - alert: IO性能
  50. expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
  51. for: 1m
  52. labels:
  53. severity: warn
  54. annotations:
  55. summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
  56. description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
  57. - alert: 网络
  58. expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
  59. for: 1m
  60. labels:
  61. severity: warn
  62. annotations:
  63. summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
  64. description: "{{$labels.mountpoint }}流出网络带宽持续1分钟高于100M. RX带宽使用率{{$value}}"
  65. - alert: TCP会话
  66. expr: node_netstat_Tcp_CurrEstab > 1000
  67. for: 1m
  68. labels:
  69. severity: critical
  70. annotations:
  71. summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
  72. description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000(当前值:{{$value}})"
  73. - alert: 磁盘容量
  74. expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
  75. for: 1m
  76. labels:
  77. severity: warn
  78. annotations:
  79. summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
  80. description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"

7 检查语法是否正确

  1. #!/bin/bash
  2. # 用于检查 prometheus的配置文件和规则文件是否有错误
  3. # 需要根据自己的部署和配置文件路径修改下面的脚本部分内容
  4. kubectl exec -ti -n monitoring prometheus-stateful-0 -c prometheus-server -- /bin/sh -c "cd /etc/prometheus/ && /bin/promtool check config prometheus.yml" | grep -A 2 FAILED
  5. kubectl exec -ti -n monitoring prometheus-stateful-0 -c prometheus-server -- /bin/sh -c "cd /etc/prometheus/ && /bin/promtool check rules rules/*.yaml" | grep -A 2 FAILED
  6. echo
  7. echo "========================================================================================="
  8. echo "如果全部正确,没有返回值。"
  9. echo "如果有报错,会返回错误的行数(指的是pod里面配置文件的对应的行数)或报错的原因"
  10. echo "========================================================================================="

8 热重启
直接重启prometheus即可,因为部署prometheus时开启了热重启。
然后热重启:curl -XPOST http://localhost:9090/-/reload
prometheus热重启
prometheus启动命令添加参数 --web.enable-lifecycle