需要安装 Prometheus 以及 Alertmanager 告警通知组件。
    具体软件参考:https://www.yuque.com/g/qinxi-cvygi/gndo6n/folder/19640486
    正常运行进程如下:

    1. $ ps -ef|grep prome
    2. root 726 1 0 2020 ? 17:25:18 /usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/data/prometheus
    3. root 2563 758 0 10:52 ? 00:00:00 /bin/bash /data/shell/monitor_prometheus.sh
    4. root 2565 758 0 10:52 ? 00:00:00 /usr/local/prometheus/blackbox_exporter/blackbox_exporter --config.file=/usr/local/prometheus/blackbox_exporter/blackbox.yml
    5. root 2566 758 0 10:52 ? 00:00:00 ./usr/local/alertmanager/webhook_dingtalk/dingtalk/prometheus-webhook-dingtalk --ding.profile=webhook1=https://oapi.dingtalk.com/robot/send?access_token=8bc2cdc7d19d2448447b40f4**********6ca6044e7c42361e

    prometheus主配置文件:

    1. # my global config
    2. global:
    3.   scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
    4.   evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
    5.   # scrape_timeout is set to the global default (10s).
    6. # Alertmanager configuration
    7. alerting:
    8.   alertmanagers:
    9.     - static_configs:
    10.         - targets: ["localhost:9093"]
    11.         # - alertmanager:9093
    12. # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    13. rule_files:
    14.   - /usr/local/prometheus/rules/*.yml
    15. # A scrape configuration containing exactly one endpoint to scrape:
    16. # Here it's Prometheus itself.
    17. scrape_configs:
    18.   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
    19.   - job_name: 'prometheus'
    20.     static_configs:
    21.       - targets: ['localhost:9090']
    22.         labels:
    23.           instance: prometheus
    24.   # - job_name: 'hz-p-inner'
    25.   #   static_configs:
    26.   #     - targets: ['198.126.61.194:9100']
    27.   #       labels:
    28.   #         instance: hz-p-inner
    29.   - job_name: 'blackbox'
    30.     metrics_path: /probe
    31.     params:
    32.       module: [http_2xx]
    33.     static_configs:
    34.       - targets:
    35.           - https://p.coach.123.com
    36.           - http://p.bdwechat.123.com/wechat
    37.           - http://p.coach.123.com
    38.           - http://klass.api.com/actuator/health
    39.     relabel_configs:
    40.       - source_labels: [__address__]
    41.         target_label: __param_target
    42.       - source_labels: [__param_target]
    43.         target_label: instance
    44.       - target_label: __address__
    45.         replacement: 127.0.0.1:9115
    46.   - job_name: 'blackbox_http_2xx_post'
    47.     metrics_path: /probe
    48.     params:
    49.       module: [http_post_2xx]
    50.     static_configs:
    51.       - targets:
    52.           - https://www.123.com/api/new_receive_trial_klass
    53.           - http://p.coach.123.com/mini_program/verification_code
    54.           - http://p.coach.123.com/api/login
    55.     relabel_configs:
    56.       - source_labels: [__address__]
    57.         target_label: __param_target
    58.       - source_labels: [__param_target]
    59.         target_label: instance
    60.       - target_label: __address__
    61.         replacement: 127.0.0.1:9115

    supervisor守护进程配置:

    1. ls /etc/supervisor/conf.d/
    2. alertmanager.conf blackbox_exporter.conf prometheus.conf web-hook-dingtalk.conf

    具体守护进程配置文件如下:
    $ cat alertmanager.conf

    1. [program:alertmanager]
    2. directory = /usr/local/alertmanager
    3. command = /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
    4. autostart = true
    5. autorestart = true
    6. startsecs = 3
    7. startretries = 20

    $ cat blackbox_exporter.conf

    1. [program:blackbox_exporter]
    2. directory = /usr/local/prometheus/blackbox_exporter
    3. command = /usr/local/prometheus/blackbox_exporter/blackbox_exporter --config.file=/usr/local/prometheus/blackbox_exporter/blackbox.yml
    4. autostart = true
    5. autorestart = true
    6. startsecs = 3
    7. startretries = 20

    $ cat prometheus.conf

    1. [program:monitor_prometheus]
    2. user = root
    3. directory = /data/shell
    4. command = /bin/bash /data/shell/monitor_prometheus.sh
    5. stdout_logfile = /var/log/supervisor/monitor_prometheus.log
    6. stdout_logfile_maxbytes = 50MB
    7. stdout_logfile_backups = 10
    8. autostart = true
    9. autorestart = true
    10. startsecs = 3
    11. startretries = 20

    $ cat /data/shell/monitor_prometheus.sh

    1. while true;do
    2. count=$(ps -ef|grep prometheus.yml| grep -v "grep" | wc -l)
    3. echo $count
    4. sleep 5
    5. if [ $count -eq 0 ]; then
    6. echo "$(date)-" >> /tmp/test.log
    7. curl 'https://oapi.dingtalk.com/robot/send?access_token=a8ca044089002471**********2a7825632631' \
    8. -H 'Content-Type: application/json' \
    9. -d '
    10. {"msgtype": "text",
    11. "text": {
    12. "content": "hz-prome Prometheus正在重启,Restarting..."
    13. }
    14. }'
    15. nohup /usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/data/prometheus &
    16. fi
    17. done

    $ cat web-hook-dingtalk.conf

    1. [program:dingtalk]
    2. directory = /usr/local/alertmanager/webhook_dingtalk/dingtalk
    3. command = /usr/local/alertmanager/webhook_dingtalk/dingtalk/prometheus-webhook-dingtalk --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=8bc2cdc7d19d2448447b40f4**********6ca6044e7c42361e"
    4. stdout_logfile = /usr/local/alertmanager/webhook_dingtalk/dingtalk/dingtalk.log
    5. stdout_logfile_maxbytes = 50MB
    6. stdout_logfile_backups = 10
    7. autostart = true
    8. autorestart = true
    9. startsecs = 3
    10. startretries = 20