title: Deploying a Highly Available Consul Cluster # title
tags: consul # tags
date: 2020-12-29
categories: Linux miscellany # category

Notes on deploying a highly available Consul cluster.
For reference, see the official documentation.

Environment Preparation

OS          hostname  IP            role
CentOS 7.5  consul01  192.168.20.2  server
CentOS 7.5  consul02  192.168.20.3  server
CentOS 7.5  consul03  192.168.20.4  server
CentOS 7.5  consul04  192.168.20.5  client
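
The steps below reference nodes by IP. If you would rather use the hostnames above, one option (purely optional, my assumption rather than part of the original setup) is to add them to /etc/hosts on every node:

$ cat >> /etc/hosts << 'EOF'
192.168.20.2 consul01
192.168.20.3 consul02
192.168.20.4 consul03
192.168.20.5 consul04
EOF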

Deploying the Consul Cluster

Configure consul01

# Download the consul binary package
$ wget https://releases.hashicorp.com/consul/1.9.0/consul_1.9.0_linux_amd64.zip
# Create the consul home directories
$ mkdir -p /opt/consul/{bin,conf,data,logs}
# Unzip the binary into the target directory
$ unzip consul_1.9.0_linux_amd64.zip -d /opt/consul/bin/
# Sync the consul home directory to the other nodes
$ for i in 3 4 5;do rsync -az /opt/consul 192.168.20.$i:/opt/;done
# Prepare the configuration file
$ cat > /opt/consul/conf/consul.json << 'EOF'
{
    "ports": {
        "http": 8500,
        "dns": 8600,
        "serf_lan": 8301,
        "serf_wan": 8302,
        "server": 8300
    },
    "datacenter": "aspire-pro",
    "data_dir": "/opt/consul/data",
    "log_level": "INFO",
    "log_file": "/opt/consul/logs/",
    "node_name": "server-20-2",
    "server": true,
    "ui": true,
    "bind_addr": "192.168.20.2",
    "client_addr": "0.0.0.0"
}
EOF
# Start consul01
$ nohup /opt/consul/bin/consul agent -server -bootstrap -config-dir /opt/consul/conf/ &> /dev/null &
# -server: start with the server role (the default is client).
# -bootstrap: needed only the very first time the first node starts, to bootstrap cluster initialization.
# Do not pass -bootstrap when starting any node after that.
$ ss -lnpt | grep consul   # confirm the ports are listening
LISTEN 0 128 192.168.20.2:8300 *:* users:(("consul",pid=31879,fd=7))
LISTEN 0 128 192.168.20.2:8301 *:* users:(("consul",pid=31879,fd=13))
LISTEN 0 128 192.168.20.2:8302 *:* users:(("consul",pid=31879,fd=9))
LISTEN 0 128 [::]:8500 [::]:* users:(("consul",pid=31879,fd=17))
LISTEN 0 128 [::]:8600 [::]:* users:(("consul",pid=31879,fd=16))
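
nohup is fine for a walkthrough, but for a longer-lived deployment you may prefer to run the agent under systemd. A minimal unit sketch; the unit name and restart policy are my assumptions, and note it deliberately omits -bootstrap, per the comments above:

$ cat > /etc/systemd/system/consul.service << 'EOF'
[Unit]
Description=Consul agent
After=network-online.target

[Service]
# Consul reloads its configuration on SIGHUP
ExecStart=/opt/consul/bin/consul agent -server -config-dir=/opt/consul/conf/
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
$ systemctl daemon-reload && systemctl enable consul && systemctl start consul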

Configuration File Notes

Here are the commonly used configuration parameters (a quick way to validate these settings follows the list):

  • ports: the various communication ports;
  • datacenter: the data center name; any custom value works, the default is dc1;
  • data_dir: the data directory;
  • log_level: the log level;
  • log_file: the log directory; files are named consul-<timestamp>.log, and a new file is typically created after a restart or every 24 hours;
  • node_name: the name this node displays in the UI;
  • server: whether this node is a server; defaults to false. A highly available cluster needs at least three server nodes;
  • ui: whether to enable the UI on this node; generally recommended on servers;
  • bind_addr: the address internal cluster communication binds to. This IP must be reachable by every other node in the cluster. Defaults to "0.0.0.0";
  • client_addr: the address Consul binds its client interfaces to, including the HTTP and DNS servers. Defaults to "127.0.0.1".
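
Before starting an agent, the configuration directory can be sanity-checked with the built-in validate subcommand:

$ /opt/consul/bin/consul validate /opt/consul/conf/
Configuration is valid!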

Configure consul02

$ cat > /opt/consul/conf/consul.json << 'EOF'
{
    "ports": { 
        "http": 8500, 
        "dns": 8600, 
        "serf_lan": 8301, 
        "serf_wan": 8302, 
        "server": 8300 
    },
    "datacenter": "aspire-pro",
    "data_dir": "/opt/consul/data",
    "log_level": "INFO",
    "log_file": "/opt/consul/logs/",
    "node_name": "server-20-3",
    "server": true,
    "ui": true,
    "bind_addr": "192.168.20.3",
    "client_addr": "0.0.0.0"        
}
EOF

# Start consul
$ nohup /opt/consul/bin/consul agent -server -config-dir /opt/consul/conf/ &> /dev/null &
# Join the cluster via consul01
$ /opt/consul/bin/consul join 192.168.20.2
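
To confirm the join took effect, list the members from either node; at this point both servers should show up:

$ /opt/consul/bin/consul members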

Configure consul03

$ cat > /opt/consul/conf/consul.json << 'EOF'
{
    "ports": { 
        "http": 8500, 
        "dns": 8600, 
        "serf_lan": 8301, 
        "serf_wan": 8302, 
        "server": 8300 
    },
    "datacenter": "aspire-pro",
    "data_dir": "/opt/consul/data",
    "log_level": "INFO",
    "log_file": "/opt/consul/logs/",
    "node_name": "server-20-4",
    "server": true,
    "ui": true,
    "bind_addr": "192.168.20.4",
    "client_addr": "0.0.0.0"        
}
EOF

# Start consul
$ nohup /opt/consul/bin/consul agent -server -config-dir /opt/consul/conf/ &> /dev/null &
# Join the cluster via consul01
$ /opt/consul/bin/consul join 192.168.20.2

Configure consul04

$ cat > /opt/consul/conf/consul.json << 'EOF'
{
    "ports": {
        "http": 8500,
        "dns": 8600,
        "serf_lan": 8301,
        "serf_wan": 8302,
        "server": 8300
    },
    "datacenter": "aspire-pro",
    "data_dir": "/opt/consul/data",
    "log_level": "INFO",
    "log_file": "/opt/consul/logs/",
    "node_name": "client-20-5",
    "bind_addr": "192.168.20.5",
    "client_addr": "0.0.0.0",
    "retry_join": ["192.168.20.2","192.168.20.3","192.168.20.4"],
    "retry_interval": "30s",
    "rejoin_after_leave": true
}
EOF

# Just start consul
$ nohup /opt/consul/bin/consul agent -config-dir /opt/consul/conf/ &> /dev/null &
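
With client_addr set to 0.0.0.0, this client also exposes Consul's DNS interface on port 8600. A quick smoke test (assuming dig is available on the node) is to resolve the built-in consul service, which should return the server addresses:

$ dig @127.0.0.1 -p 8600 consul.service.consul +short   # expect 192.168.20.2-4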

Some parameters in the client configuration explained (a flag-based equivalent follows the list):

  • retry_join: similar to -join, but keeps retrying the connection until it succeeds. Once the node has joined a member from the list, it stops trying to join again.
  • retry_interval: the wait time between join attempts. Defaults to 30s.
  • rejoin_after_leave: allows the node to rejoin the cluster with its previous state after leaving.
  • start_join: a list of cluster addresses to join at startup.
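
For reference, the same retry behavior can be expressed as command-line flags instead of JSON, which is handy for one-off tests (equivalent to the consul04 config above):

$ /opt/consul/bin/consul agent -config-dir /opt/consul/conf/ \
    -retry-join 192.168.20.2 -retry-join 192.168.20.3 -retry-join 192.168.20.4 \
    -retry-interval 30s -rejoin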

Note: to add more clients to the cluster, configure and start them the same way as consul04 (a templating sketch follows).
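
If several clients need to be added, a small loop can stamp out per-node configs from consul04's file. A sketch, run from consul04, assuming /opt/consul was already rsynced to the new nodes as in the consul01 step; the two IPs are hypothetical examples:

$ for ip in 192.168.20.6 192.168.20.7; do
      n=${ip##*.}   # last octet, reused in the node name
      sed -e "s/192\.168\.20\.5/$ip/" -e "s/client-20-5/client-20-$n/" \
          /opt/consul/conf/consul.json | ssh $ip 'cat > /opt/consul/conf/consul.json'
  done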

Access the Consul Web UI

Visit port 8500 on any server with the UI enabled and click through as shown below to see every node in the cluster:

[Figure 1]
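
If you prefer the command line, the same node list is available from the HTTP API on any server:

$ curl -s http://192.168.20.2:8500/v1/catalog/nodes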

Restart consul01

Because consul01 was started with the -bootstrap flag, it is best to restart it once the cluster is fully formed; this also happens to verify Consul's high availability. Before running the commands below, note which node currently holds the leader role (usually consul01, since it was the first node up and used -bootstrap to bootstrap the cluster).

# Stop consul (run on consul01)
$ ps -ef | grep consul | grep -v grep | awk '{print $2}' | xargs kill -9
# Start consul01 (note: no -bootstrap flag; every subsequent restart should use this command)
$ nohup /opt/consul/bin/consul agent -server -config-dir /opt/consul/conf/ &> /dev/null &

Visit the UI again to check the current cluster roles:

[Figure 2]

Consul Operations Commands

$ consul operator raft list-peers    # view cluster state and each server's role
Node         ID                                    Address            State     Voter  RaftProtocol
server-20-3  47c0d52a-cd2e-3f42-2320-00f6f230c71f  192.168.20.3:8300  leader    true   3
server-20-4  a6e3102a-a3ac-ad5c-e960-6f6bd146e3c4  192.168.20.4:8300  follower  true   3
server-20-2  f6d8c628-47e9-7fab-bec5-6896aa7f3fd3  192.168.20.2:8300  follower  true   3


$ consul members     # view cluster and node status
Node         Address            Status  Type    Build  Protocol  DC          Segment
server-20-2  192.168.20.2:8301  alive   server  1.9.0  2         aspire-pro  <all>
server-20-3  192.168.20.3:8301  alive   server  1.9.0  2         aspire-pro  <all>
server-20-4  192.168.20.4:8301  alive   server  1.9.0  2         aspire-pro  <all>
client-20-5  192.168.20.5:8301  alive   client  1.9.0  2         aspire-pro  <default>


# Cluster get/set test
$ consul kv put key value       # set a key/value pair
Success! Data written to: key
$ consul kv get key             # get the value for a key
value
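
A few more kv subcommands worth knowing; keys can be namespaced with slashes and listed by prefix:

$ consul kv put app/config/port 8080    # slashes give keys a hierarchy
Success! Data written to: app/config/port
$ consul kv get -recurse app/           # list everything under a prefix
app/config/port:8080
$ consul kv delete key                  # clean up the test key
Success! Deleted key: key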



$ consul info      # query detailed information about the current node
agent:
        check_monitors = 0
        check_ttls = 0
        checks = 0
        services = 0
build:
        prerelease = 
        revision = a417fe51
        version = 1.9.0
consul:
        acl = disabled
        bootstrap = false
        known_datacenters = 1
        leader = false
        leader_addr = 192.168.20.3:8300
        server = true
raft:
        applied_index = 458
        # ....... some output omitted

Removing a Node from the Cluster

$ consul operator raft list-peers         # assume the cluster's server nodes look like this
Node         ID                                    Address            State     Voter  RaftProtocol
server-20-3  47c0d52a-cd2e-3f42-2320-00f6f230c71f  192.168.20.3:8300  leader    true   3
server-20-4  a6e3102a-a3ac-ad5c-e960-6f6bd146e3c4  192.168.20.4:8300  follower  true   3
server-20-2  f6d8c628-47e9-7fab-bec5-6896aa7f3fd3  192.168.20.2:8300  follower  true   3

$ consul operator raft remove-peer -id=47c0d52a-cd2e-3f42-2320-00f6f230c71f   # remove the node currently acting as leader
Removed peer with id "47c0d52a-cd2e-3f42-2320-00f6f230c71f"


$ consul operator raft list-peers   # check again: the node is not actually gone yet; the leader role has moved and the node's Voter state is now false
Node         ID                                    Address            State     Voter  RaftProtocol
server-20-4  a6e3102a-a3ac-ad5c-e960-6f6bd146e3c4  192.168.20.4:8300  follower  true   3
server-20-2  f6d8c628-47e9-7fab-bec5-6896aa7f3fd3  192.168.20.2:8300  leader    true   3
server-20-3  47c0d52a-cd2e-3f42-2320-00f6f230c71f  192.168.20.3:8300  follower  false  3

$ consul operator raft remove-peer -id=47c0d52a-cd2e-3f42-2320-00f6f230c71f    # run the remove operation once more
Removed peer with id "47c0d52a-cd2e-3f42-2320-00f6f230c71f"


$ consul operator raft list-peers        # check the servers again: the node has now been removed
Node         ID                                    Address            State     Voter  RaftProtocol
server-20-4  a6e3102a-a3ac-ad5c-e960-6f6bd146e3c4  192.168.20.4:8300  follower  true   3
server-20-2  f6d8c628-47e9-7fab-bec5-6896aa7f3fd3  192.168.20.2:8300  leader    true   3
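
Note that remove-peer is meant for forcing a dead or stuck server out of the raft quorum. For a healthy node, it is usually cleaner to have the agent leave gracefully, which removes it from both raft and the gossip pool in one step:

$ consul leave                      # run on the node that should leave
Graceful leave complete
# or, from any other node, if the target is already down:
$ consul force-leave server-20-3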