1. 环境说明

2. 安装node_exporter

  • 运行 ```bash /usr/sbin/node_exporter --log.level="info" \ --web.disable-exporter-metrics \ --collector.systemd \ --collector.processes \ --collector.mountstats \ --collector.ntp.server="10.68.3.101" \ --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\d+n\d+p)\d+$" \ --collector.filesystem.ignored-mount-points="^/(dev|proc|sys|var/lib/docker/.+)($|/)" \ --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" &
  1. - 开机启动
  2. ```bash
  3. [root@monitor1 ~]# cat /etc/rc.local
  4. #!/bin/bash
  5. # THIS FILE IS ADDED FOR COMPATIBILITY PURPOSES
  6. #
  7. # It is highly advisable to create own systemd services or udev rules
  8. # to run scripts during boot instead of using this file.
  9. #
  10. # In contrast to previous versions due to parallel execution during boot
  11. # this script will NOT be run after all other services.
  12. #
  13. # Please note that you must run 'chmod +x /etc/rc.d/rc.local' to ensure
  14. # that this script will be executed during boot.
  15. /usr/sbin/node_exporter --log.level="info" \
  16. --web.disable-exporter-metrics \
  17. --collector.systemd \
  18. --collector.processes \
  19. --collector.mountstats \
  20. --collector.ntp.server="10.68.3.101" \
  21. --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$" \
  22. --collector.filesystem.ignored-mount-points="^/(dev|proc|sys|var/lib/docker/.+)($|/)" \
  23. --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" &
  24. touch /var/lock/subsys/local

2.安装influxdb

  • 安装influxdb
    1. ## 下载influxdb https://portal.influxdata.com/downloads/
    2. yum localinstall -y influxdb-1.8.0.x86_64.rpm

    2.2 创建挂载ceph块存储

  1. [root@mon1 ~]# ceph -s
  2. cluster:
  3. id: 8785bb23-6770-4f83-9235-279c7d34c76f
  4. health: HEALTH_WARN
  5. Degraded data redundancy: 487/1527 objects degraded (31.893%), 122 pgs degraded
  6. services:
  7. mon: 3 daemons, quorum mon1,mon2,mon3 (age 2m)
  8. mgr: mon1(active, since 3h), standbys: mon2, mon3
  9. osd: 9 osds: 9 up (since 4h), 9 in (since 4h)
  10. data:
  11. pools: 1 pools, 128 pgs
  12. objects: 509 objects, 1.9 GiB
  13. usage: 13 GiB used, 527 GiB / 540 GiB avail
  14. pgs: 487/1527 objects degraded (31.893%)
  15. 120 active+recovery_wait+degraded
  16. 6 active+clean
  17. 2 active+recovering+degraded
  18. io:
  19. recovery: 3.3 MiB/s, 0 objects/s
  • 创建pool

    1. ceph osd pool create influxdata 512 512
  • 检查

    1. [root@mon1 ~]# ceph osd lspools
    2. 22 influxdata
  • 创建块镜像

    1. rbd create influxdata/influx_data --size 10240 --image-feature layering
    2. ## 如果不指定 pool_name/image_name 默认使用的是rbd pool
  • 检查

    1. [root@mon1 ~]# rbd list influxdata
    2. influx_data
    3. [root@mon1 ~]# rbd info influxdata/influx_data
    4. rbd image 'influx_data':
    5. size 10 GiB in 2560 objects
    6. order 22 (4 MiB objects)
    7. snapshot_count: 0
    8. id: 1982a32f770f0
    9. block_name_prefix: rbd_data.1982a32f770f0
    10. format: 2
    11. features: layering
    12. op_features:
    13. flags:
    14. create_timestamp: Wed Jan 6 13:28:27 2021
    15. access_timestamp: Wed Jan 6 13:28:27 2021
    16. modify_timestamp: Wed Jan 6 13:28:27 2021
  • 创建账户和密码 ```bash ceph auth get-or-create client.influx mon 'allow r' osd 'allow rwx pool=influxdata' | tee /etc/ceph/ceph.client.influx.keyring

[root@mon1 ~]# cat /etc/ceph/ceph.client.influx.keyring [client.influx] key = AQCgTfVfogPQNBAAsisDrblyxPYNg5WKk1XzLg==

  1. - 拷贝到客户端
  2. ```bash
  3. [root@mon1 ~]# scp /etc/ceph/ceph.client.influx.keyring 10.68.3.91:/etc/ceph/
  • 客户端映射块设备 ```bash yum install ceph-common -y

    创建ceph.conf

    [root@monitor1 ~]# cat /etc/ceph/ceph.conf [global] mon_host = 10.68.3.121,10.68.3.122,10.68.3.123

[root@monitor1 ~]# rbd map --image influxdata/influx_data --name client.influx
/dev/rbd0

  1. - 格式化挂载使用
  2. ```bash
  3. mkfs.xfs /dev/rbd0
  4. mount /dev/rbd0 /var/lib/influxdb/
  5. [root@monitor1 ~]# df -h
  6. Filesystem Size Used Avail Use% Mounted on
  7. devtmpfs 899M 0 899M 0% /dev
  8. tmpfs 910M 0 910M 0% /dev/shm
  9. tmpfs 910M 33M 878M 4% /run
  10. tmpfs 910M 0 910M 0% /sys/fs/cgroup
  11. /dev/mapper/centos-root 14G 2.2G 12G 17% /
  12. /dev/sda1 1014M 149M 866M 15% /boot
  13. tmpfs 182M 0 182M 0% /run/user/0
  14. /dev/rbd0 10G 33M 10G 1% /var/lib/influxdb
  • 开机挂载 ```bash

[root@monitor1 ~]# cat /etc/fstab

#

/etc/fstab

Created by anaconda on Mon Jan 4 07:43:22 2021

#

Accessible filesystems, by reference, are maintained under '/dev/disk'

See man pages fstab(5), findfs(8), mount(8) and/or blkid(8) for more info

# /dev/mapper/centos-root / xfs defaults 0 0 UUID=68c4209c-d479-4804-ac96-406a547d6168 /boot xfs defaults 0 0 /dev/mapper/centos-swap swap swap defaults 0 0 /dev/rbd0 /var/lib/influxdb xfs defaults 0 0

  1. <a name="bRTk4"></a>
  2. ## 2.3 启动influxdb
  3. - 修改配置文件
  4. ```bash
  5. bind-address = "0.0.0.0:8088"
  1. systemctl enable influxdb
  2. systemctl start influxdb

3. 安装prometheus

3.1 influxdb创建表

  1. [root@monitor1 lib]# influx
  2. Connected to http://localhost:8086 version 1.8.3
  3. InfluxDB shell version: 1.8.3
  4. > create database prometheus;
  5. > create user icinga2 with password 'password';
  6. > grant all on prometheus to icinga2;
  7. ERR: user not found
  8. Warning: It is possible this error is due to not setting a database.
  9. Please set a database with the command "use <database>".
  10. > use prometheus;
  11. Using database prometheus
  12. > grant all on prometheus to icinga2;
  13. ERR: user not found
  14. > show retention policies on prometheus;
  15. name duration shardGroupDuration replicaN default
  16. ---- -------- ------------------ -------- -------
  17. autogen 0s 168h0m0s 1 true
  18. > quit
  • 或者 ```bash

    创建数据库和授权用户

    本地通过influx客户端命令连接数据库: influx -precision rfc3339

创建数据库和用户: create database prometheus create user "admin" with password 'Admin123' with all privileges use prometheus

show measurements name: measurements

name

scrape_duration_seconds scrape_samples_post_metric_relabeling scrape_samples_scraped scrape_series_added up ```

3.2 安装prometheus

  • 配置分片节点的prometheus ```bash tar xvf prometheus-2.13.1.linux-amd64.tar.gz [root@monitor1 ~]# cp prometheus-2.13.1.linux-amd64/prometheus /usr/sbin/ [root@monitor1 ~]# cp prometheus-2.13.1.linux-amd64/promtool /usr/sbin/ [root@monitor1 ~]# cp prometheus-2.13.1.linux-amd64/prometheus.yml /etc/

分发文件到其他服务器

[root@monitor1 ~]# scp /usr/sbin/prometheus monitor2:/usr/sbin/ prometheus 100% 75MB 9.4MB/s 00:08 [root@monitor1 ~]# scp /usr/sbin/prometheus monitor3:/usr/sbin/ prometheus

  1. - 配置prometheus文件
  2. ```bash
  3. [root@monitor1 ~]# cat /etc/prometheus.yml |egrep -v '\#|^$'
  4. global:
  5. alerting:
  6. alertmanagers:
  7. - static_configs:
  8. - targets:
  9. rule_files:
  10. scrape_configs:
  11. - job_name: 'prometheus'
  12. static_configs:
  13. - targets: ['10.68.3.91:9100'] # 监控的主机
  14. labels:
  15. hostname: monitor1
  16. ## 分发到其他主机
  17. [root@monitor1 ~]# scp /etc/prometheus.yml monitor2:/etc/
  18. prometheus.yml 100% 925 42.1KB/s 00:00
  19. ^C[root@monitor1 ~]# scp /etc/prometheus.yml monitor3:/etc/
  20. prometheus.yml
  • 启动服务

    1. [root@monitor1 ~]#/usr/sbin/prometheus --config.file=/etc/prometheus.yml \
    2. --log.level=info \
    3. --storage.tsdb.path=/var/lib/data/ \
    4. --web.enable-admin-api \
    5. --web.enable-lifecycle \
    6. --web.listen-address="0.0.0.0:9090" &
  • 开机启动 ```bash [root@monitor1 ~]# cat /etc/rc.local /usr/sbin/prometheus --config.file=/etc/prometheus.yml \ --log.level=info \ --storage.tsdb.path=/var/lib/data/ \ --web.enable-admin-api \ --web.enable-lifecycle \ --web.listen-address="0.0.0.0:9090" &

分发到各节点

[root@monitor1 ~]# scp /etc/rc.local monitor2:/etc/ [root@monitor1 ~]# scp /etc/rc.local monitor3:/etc/

  1. <a name="8fpG0"></a>
  2. ## 3.3 安装primary节点
  3. - 配置文件
  4. ```bash
  5. [root@monitor1 etc]# cat prometheus-primary.yml |egrep -v '#|^$'
  6. global:
  7. alerting:
  8. alertmanagers:
  9. - static_configs:
  10. - targets:
  11. remote_write:
  12. - url: "http://10.68.3.91:8086/api/v1/prom/write?db=prometheus"
  13. basic_auth:
  14. username: admin
  15. password: Admin123
  16. remote_read:
  17. - url: "http://10.68.3.93:8086/api/v1/prom/read?db=prometheus"
  18. basic_auth:
  19. username: admin
  20. password: Admin123
  21. rule_files:
  22. scrape_configs:
  23. - job_name: 'prometheus-primary'
  24. honor_labels: true
  25. metrics_path: '/federate'
  26. params:
  27. 'match[]':
  28. - '{job=~"prometheus.*"}'
  29. static_configs:
  30. - targets:
  31. - 'monitor1:9090'
  32. - 'monitor2:9090'
  33. - 'monitor3:9090'

其中,db替换为实际创建的数据库名称,username和password需要替换为上一步influxdb中创建的账密信息。 修改完成后并重启prometheus,服务正常后,influxdb中就会有相应的数据了,实现了数据的持久化。

  • 启动服务

    1. [root@monitor1 opt]# prometheus --config.file=/etc/prometheus-primary.yml \
    2. --log.level=info --web.enable-admin-api --web.enable-lifecycle \
    3. --web.listen-address="0.0.0.0:8080" &
  • 检查验证

浏览器打开: http://10.68.3.91:8080/targets
图片.png

  • 同理创建其他三个prometheus-primary节点

    1. [root@monitor1 ~]# scp /etc/prometheus-primary.yml monitor2:/etc/
    2. prometheus-primary.yml 100% 1443 59.5KB/s 00:00
    3. [root@monitor1 ~]# scp /etc/prometheus-primary.yml monitor3:/etc/
    4. prometheus-primary.yml
  • 启动

    1. [root@monitor1 ~]# scp /etc/rc.local monitor2:/etc/
    2. rc.local 100% 1433 59.1KB/s 00:00
    3. [root@monitor1 ~]# scp /etc/rc.local monitor3:/etc/
    4. rc.local
  • 验证

http://10.68.3.92:8080/targets
图片.png
http://10.68.3.93:8080/targets
图片.png

  • 检查influxdb的数据
    1. [root@monitor1 ~]# influx
    2. Connected to http://localhost:8086 version 1.8.3
    3. InfluxDB shell version: 1.8.3
    4. >
    5. > use prometheus
    6. Using database prometheus
    7. > show series ##可以看到收集到的数据
    8. > show measurements

4. 安装配置高可用和负载均衡

4.1 安装keepalived

  • 安装

    1. yum install keepalived -y
  • MASTER[3.91]配置文件 ```bash [root@monitor1 keepalived]# cat /etc/keepalived/keepalived.conf ! Configuration File for keepalived

global_defs { router_id monitor1 vrrp_skip_check_adv_addr vrrp_strict vrrp_garp_interval 0 vrrp_gna_interval 0 } vrrp_script check_nginx { script "/etc/keepalived/nginx_check.sh" interval 2 weight -20 }

vrrp_instance VI_1 { state MASTER interface ens32 virtual_router_id 51 priority 100 advert_int 1 authentication { auth_type PASS auth_pass 1111 } virtual_ipaddress { 10.68.3.90 } }

  1. - BACKUP1 节点[3.92]配置文件
  2. ```bash
  3. [root@monitor2 ~]# cat /etc/keepalived/keepalived.conf
  4. ! Configuration File for keepalived
  5. global_defs {
  6. router_id monitor2
  7. vrrp_skip_check_adv_addr
  8. vrrp_strict
  9. vrrp_garp_interval 0
  10. vrrp_gna_interval 0
  11. }
  12. vrrp_script check_nginx {
  13. script "/etc/keepalived/nginx_check.sh"
  14. interval 2
  15. weight -20
  16. }
  17. vrrp_instance VI_1 {
  18. state MASTER
  19. interface ens32
  20. mcast_src_ip 10.68.3.92
  21. virtual_router_id 51
  22. priority 99
  23. advert_int 1
  24. authentication {
  25. auth_type PASS
  26. auth_pass 1111
  27. }
  28. virtual_ipaddress {
  29. 10.68.3.90
  30. }
  31. }
  • BACKUP2 节点[3.93]配置文件 ```bash [root@monitor3 ~]# cat /etc/keepalived/keepalived.conf ! Configuration File for keepalived

global_defs { router_id monitor3 vrrp_skip_check_adv_addr vrrp_garp_interval 0 vrrp_gna_interval 0 } vrrp_script check_nginx { script "/etc/keepalived/nginx_check.sh" interval 2 weight -20 }

vrrp_instance VI_1 { state MASTER interface ens32 mcast_src_ip 10.68.3.93 virtual_router_id 51 priority 98 advert_int 1 authentication { auth_type PASS auth_pass 1111 } virtual_ipaddress { 10.68.3.90 } }

  1. - 启动服务
  2. ```bash
  3. [root@monitor1 keepalived]# systemctl start keepalived
  4. [root@monitor1 keepalived]# systemctl enable keepalived
  • 验证 ```bash

    master节点

    [root@monitor1 keepalived]# ip addr|grep inet inet 127.0.0.1/8 scope host lo inet6 ::1/128 scope host inet 10.68.3.91/24 brd 10.68.3.255 scope global noprefixroute ens32 inet 10.68.3.90/32 scope global ens32 inet6 fe80::150f:2e53:9bd0:a01b/64 scope link noprefixroute
  1. - 测试
  2. ```bash
  3. ## 关闭master到keepalived 服务,检查vip有没有漂移到backup节点

5. Alert manager高可用

5.1 二进制包

  • 下载二进制包
    ```bash cp alertmanager-0.20.0.linux-amd64/alertmanager /usr/sbin/ cp alertmanager-0.20.0.linux-amd64/alertmanager.yml /etc/

scp alertmanager-0.20.0.linux-amd64/alertmanager monitor2:/usr/sbin/ scp alertmanager-0.20.0.linux-amd64/alertmanager monitor3:/usr/sbin/

scp /etc/alertmanager.yml monitor2:/etc/ scp /etc/alertmanager.yml monitor3:/etc/

  1. <a name="6BTWs"></a>
  2. ## 5.2 启动服务
  3. ```bash
  4. ## monitor1
  5. /usr/sbin/alertmanager \
  6. --config.file /etc/alertmanager.yml \
  7. --web.listen-address="10.68.3.91:8194" \
  8. --log.level=debug \
  9. --data.retention=120h \
  10. --cluster.listen-address="10.68.3.91:6129" > /var/log/alert_manager.log &
  11. ## monitor2
  12. /usr/sbin/alertmanager \
  13. --config.file /etc/alertmanager.yml \
  14. --web.listen-address="10.68.3.92:8194" \
  15. --log.level=debug \
  16. --data.retention=120h \
  17. --cluster.peer="10.68.3.91:6129" \
  18. --cluster.listen-address="10.68.3.92:6129" 2>&1 > /var/log/alert_manager.log &
  19. ## monitor3
  20. /usr/sbin/alertmanager \
  21. --config.file /etc/alertmanager.yml \
  22. --web.listen-address="10.68.3.93:8194" \
  23. --log.level=debug \
  24. --data.retention=120h \
  25. --cluster.peer="10.68.3.91:6129" \
  26. --cluster.listen-address="10.68.3.93:6129" 2>&1 > /var/log/alert_manager.log &
  • 开机启动 ```bash

    monitor1

    [root@monitor1 ~]# cat /etc/rc.local

    #!/bin/bash

    # THIS FILE IS ADDED FOR COMPATIBILITY PURPOSES

    #

    # It is highly advisable to create own systemd services or udev rules

    # to run scripts during boot instead of using this file.

    #

    # In contrast to previous versions due to parallel execution during boot

    # this script will NOT be run after all other services.

    #

    # Please note that you must run 'chmod +x /etc/rc.d/rc.local' to ensure

    # that this script will be executed during boot.

    /usr/sbin/node_exporter --log.level="info" \ --web.disable-exporter-metrics \ --collector.systemd \ --collector.processes \ --collector.mountstats \ --collector.ntp.server="10.68.3.101" \ --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\d+n\d+p)\d+$" \ --collector.filesystem.ignored-mount-points="^/(dev|proc|sys|var/lib/docker/.+)($|/)" \ --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" &

/usr/sbin/prometheus --config.file=/etc/prometheus.yml \ --log.level=info \ --storage.tsdb.path=/var/lib/data/ \ --web.enable-admin-api \ --web.enable-lifecycle \ --web.listen-address="0.0.0.0:9090" &

prometheus --config.file=/etc/prometheus-primary.yml --log.level=info --web.enable-admin-api --web.enable-lifecycle --web.listen-address="0.0.0.0:8080" &

/usr/sbin/alertmanager \ --config.file /etc/alertmanager.yml \ --web.listen-address="10.68.3.91:8194" \ --log.level=debug \ --data.retention=120h \ --cluster.listen-address="10.68.3.91:6129" > /var/log/alert_manager.log &

touch /var/lock/subsys/local

monitor2

cat /etc/rc.local

#!/bin/bash

# THIS FILE IS ADDED FOR COMPATIBILITY PURPOSES

#

# It is highly advisable to create own systemd services or udev rules

# to run scripts during boot instead of using this file.

#

# In contrast to previous versions due to parallel execution during boot

# this script will NOT be run after all other services.

#

# Please note that you must run 'chmod +x /etc/rc.d/rc.local' to ensure

# that this script will be executed during boot.

/usr/sbin/node_exporter --log.level="info" \ --web.disable-exporter-metrics \ --collector.systemd \ --collector.processes \ --collector.mountstats \ --collector.ntp.server="10.68.3.101" \ --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\d+n\d+p)\d+$" \ --collector.filesystem.ignored-mount-points="^/(dev|proc|sys|var/lib/docker/.+)($|/)" \ --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" &

/usr/sbin/prometheus --config.file=/etc/prometheus.yml \ --log.level=info \ --storage.tsdb.path=/var/lib/data/ \ --web.enable-admin-api \ --web.enable-lifecycle \ --web.listen-address="0.0.0.0:9090" &

/usr/sbin/prometheus --config.file=/etc/prometheus-primary.yml --log.level=info --web.enable-admin-api --web.enable-lifecycle --web.listen-address="0.0.0.0:8080" &

/usr/sbin/alertmanager \ --config.file /etc/alertmanager.yml \ --web.listen-address="10.68.3.92:8194" \ --log.level=debug \ --data.retention=120h \ --cluster.peer="10.68.3.91:6129" \ --cluster.listen-address="10.68.3.92:6129" 2>&1 > /var/log/alert_manager.log &

touch /var/lock/subsys/local

monitor3

cat /etc/rc.local

#!/bin/bash

# THIS FILE IS ADDED FOR COMPATIBILITY PURPOSES

#

# It is highly advisable to create own systemd services or udev rules

# to run scripts during boot instead of using this file.

#

# In contrast to previous versions due to parallel execution during boot

# this script will NOT be run after all other services.

#

# Please note that you must run 'chmod +x /etc/rc.d/rc.local' to ensure

# that this script will be executed during boot.

/usr/sbin/node_exporter --log.level="info" \ --web.disable-exporter-metrics \ --collector.systemd \ --collector.processes \ --collector.mountstats \ --collector.ntp.server="10.68.3.101" \ --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\d+n\d+p)\d+$" \ --collector.filesystem.ignored-mount-points="^/(dev|proc|sys|var/lib/docker/.+)($|/)" \ --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" &

/usr/sbin/prometheus --config.file=/etc/prometheus.yml \ --log.level=info \ --storage.tsdb.path=/var/lib/data/ \ --web.enable-admin-api \ --web.enable-lifecycle \ --web.listen-address="0.0.0.0:9090" &

prometheus --config.file=/etc/prometheus-primary.yml --log.level=info --web.enable-admin-api --web.enable-lifecycle --web.listen-address="0.0.0.0:8080" &

/usr/sbin/alertmanager \ --config.file /etc/alertmanager.yml \ --web.listen-address="10.68.3.93:8194" \ --log.level=debug \ --data.retention=120h \ --cluster.peer="10.68.3.91:6129" \ --cluster.listen-address="10.68.3.93:6129" 2>&1 > /var/log/alert_manager.log &

touch /var/lock/subsys/local

```

5.3 检查验证

http://10.68.3.93:8194/#/status
http://10.68.3.92:8194/#/status
http://10.68.3.91:8194/#/status

图片.png