一、简介

简介

哨兵核心功能:能够后台监控redis主机是否故障,如果故障了根据投票数自动将从库转换为主库

原理

  • 分布式架构,若干个sentinel节点,每个哨兵对其余哨兵和redis节点进行监控,节点不可达则对其做下线标识。

  • 主节点down则与其他哨兵协商,大多数认为master down,选出一个哨兵完成failover

功能

  • 监控: 哨兵定期检测Redis节点和其余哨兵节点是否存活(alive)

  • 通知: 哨兵将failover节点通知应用方

  • failover:从-->主

  • 配置提供:client从哨兵获取redis拓扑

image

image

二、搭建

2.1 安装部署

主:192.168.9.78:6410
从:192.168.9.78:6411
sentinel: 26410 26411 26412

redis搭建1主1从

[root@OPS-9-78 ~]# redis-cli -p 6411 slaveof 127.0.0.1 6410
OK
[root@OPS-9-78 ~]# redis-cli -p 6411 info Replication
# Replication
role:slave
master_host:127.0.0.1
master_port:6410

安装sentinel

(1)目录初始化

mkdir -p /data/redis/{sentinel_26410,sentinel_26411,sentinel_26412}
cp -r /usr/local/redis/bin/ /data/redis/sentinel_26410/
cp -r /usr/local/redis/bin/ /data/redis/sentinel_26411/
cp -r /usr/local/redis/bin/ /data/redis/sentinel_26412/

(2)生成配置文件sentinel.conf

为每个哨兵节点新建配置文件(本文按端口命名为 redis_<端口>.conf); 哨兵必须带配置文件启动, 启动命令中的路径和文件名绝不能写错

26410节点

cat > /data/redis/sentinel_26410/redis_26410.conf <<EOF
port 26410
daemonize yes
pidfile /data/redis/sentinel_26410/redis-sentinel26410.pid
logfile "/data/redis/sentinel_26410/sentinel26410.log"
dir /data/redis/sentinel_26410/
sentinel monitor mymaster 127.0.0.1 6410 2
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000
EOF

26411节点

cat > /data/redis/sentinel_26411/redis_26411.conf <<EOF
port 26411
daemonize yes
pidfile /data/redis/sentinel_26411/redis-sentinel26411.pid
logfile "/data/redis/sentinel_26411/sentinel26411.log"
dir /data/redis/sentinel_26411/
sentinel monitor mymaster 127.0.0.1 6410 2
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000
EOF

26412节点

cat > /data/redis/sentinel_26412/redis_26412.conf <<EOF
port 26412
daemonize yes
pidfile /data/redis/sentinel_26412/redis-sentinel26412.pid
logfile "/data/redis/sentinel_26412/sentinel26412.log"
dir /data/redis/sentinel_26412/
sentinel monitor mymaster 127.0.0.1 6410 2
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000
EOF

(3)启动哨兵

方式1:

/data/redis/sentinel_26410/bin/redis-server /data/redis/sentinel_26410/redis_26410.conf --sentinel
/data/redis/sentinel_26411/bin/redis-server /data/redis/sentinel_26411/redis_26411.conf --sentinel
/data/redis/sentinel_26412/bin/redis-server /data/redis/sentinel_26412/redis_26412.conf --sentinel

方式2:

/data/redis/sentinel_26410/bin/redis-sentinel /data/redis/sentinel_26410/redis_26410.conf
/data/redis/sentinel_26411/bin/redis-sentinel /data/redis/sentinel_26411/redis_26411.conf
/data/redis/sentinel_26412/bin/redis-sentinel /data/redis/sentinel_26412/redis_26412.conf

确认配置文件

$ cat /data/redis/sentinel_26410/redis_26410.conf
sentinel known-replica mymaster 127.0.0.1 6411
sentinel known-sentinel mymaster 127.0.0.1 26412 b8ac0a4d4b94e9bcb1fba566dbe64e8f82687fa9
sentinel known-sentinel mymaster 127.0.0.1 26411 0906d5f48c088da18eca19a6ef3ba893813d9246

确认info信息

[root@OPS-9-78 redis]# redis-cli -p 26410 info Sentinel
# Sentinel
sentinel_masters:1
sentinel_tilt:0
sentinel_running_scripts:0
sentinel_scripts_queue_length:0
sentinel_simulate_failure_flags:0
master0:name=mymaster,status=ok,address=127.0.0.1:6410,slaves=1,sentinels=3

[root@OPS-9-78 redis]# redis-cli -p 26410 sentinel masters
1) 1) "name"
 2) "mymaster"
 3) "ip"
 4) "127.0.0.1"
 5) "port"
 6) "6410"
 7) "runid"
 8) "aa7828c0cbdf62432ad3b4174bdaaa6dd2c9ba4c"
 9) "flags"
 10) "master"

2.2 哨兵节点相关的配置

##动态设置参数 sentinel set
- 仅对当前哨兵节点有效
- 执行成功后立即刷新到哨兵配置文件(redis数据节点则需要执行 config rewrite 才会写入配置文件)
- 哨兵尽量配置一致
##相关配置说明##
# sentinel 监控的主节点信息
sentinel monitor <master-name> <ip> <port> <quorum>
# sentinel判定节点不可达的时间,单位毫秒
sentinel down-after-milliseconds <master-name> <times>
动态设置:sentinel set <master-name> down-after-milliseconds 30000
#故障转移超时时间,单位毫秒
sentinel failover-timeout <master-name> <times>
动态设置:sentinel set <master-name> failover-timeout 180000
#Sentinel监控的主节点配置了密码
sentinel auth-pass <master-name> <password>
动态设置:sentinel set <master-name> auth-pass password
#每次向新的主节点发起复制操作的从节点个数
sentinel parallel-syncs <master-name> <nums>
动态设置:sentinel set <master-name> parallel-syncs 2

image

image

2.3 sentinel部署技巧

1、sentinel部署分配的原则

  • Sentinel节点不应该部署在同一台物理“机器”上

  • 部署至少三个且奇数个的Sentinel节点

  • 按需选择 sentinel集群与redis集群的管控方式:1套sentinel-1套redis or 1套sentinel-多套redis

2、sentinel监控多个主节点

配置多个sentinel monitor

sentinel monitor master-test-1 192.168.9.78 6379 2

sentinel monitor master-test-2 192.168.9.78 6380 2

image

Redis Sentinel监控多个主节点

三、运维管理

3.1 sentinel API

1、sentinel masters #展示所有被监控的主节点状态以及相关的统计信息

1. sentinel masters #展示所有被监控的主节点状态以及相关的统计信息
127.0.0.1:26410> sentinel masters
1) 1) "name"
 2) "mymaster"
 3) "ip"
 4) "127.0.0.1"
 5) "port"
 6) "6410"
 7) "runid"
 8) "aa7828c0cbdf62432ad3b4174bdaaa6dd2c9ba4c"
 9) "flags"
 10) "master"
2. sentinel master <master name> #展示<master name> 的主节点状态以及相关的统计信息
127.0.0.1:26410> sentinel master mymaster
3. sentinel slaves <master name> #展示指定<master name>的从节点状态以及相关的统计信息
127.0.0.1:26410> sentinel slaves mymaster
1) 1) "name"
 2) "127.0.0.1:6411"
 3) "ip"
 4) "127.0.0.1"
 5) "port"
 6) "6411"
 7) "runid"
 8) "2fb124c588960cb07a04322af5306890636f7114"
 9) "flags"
 10) "slave"
4. sentinel sentinels <master name> #展示指定<master name>的Sentinel节点集合(不包含当前Sentinel节点)
5. sentinel get-master-addr-by-name <master name> #返回<master name>主节点的IP地址和端口
127.0.0.1:26410> sentinel get-master-addr-by-name mymaster
1) "127.0.0.1"
2) "6410"
6. sentinel reset <pattern> #对符合条件的主节点的配置进行重置
7. sentinel failover <master name> #对指定<master name>主节点进行强制故障转移(没有和其他Sentinel节点“协商”)
8. sentinel ckquorum <master name> #检测当前可达的Sentinel节点总数是否达到<quorum>的个数
/*例如 quorum=3而当前可达的Sentinel节点个数为2个,那么将无法进行故障转移,Redis Sentinel的高可用特性也将失去*/
9. sentinel flushconfig #将Sentinel节点的配置强制刷到磁盘上
10. sentinel remove <master name> #取消当前Sentinel节点对于指定<master name>主节点的监控
11. sentinel monitor <master name> <ip> <port> <quorum>
12. sentinel set <master name> #动态修改Sentinel节点配置选项
13. sentinel is-master-down-by-addr

3.2 业务连接

1. 拿到Sentinel节点集合、masterName参数
2. 客户端连接
 1)遍历Sentinel节点集合获取一个可用的sentinel
 2)通过sentinel get-master-addr-by-name master-name这个API来获取对应 主节点的相关信息
 3)验证当前获取的“主节点”是真正的主节点
 4)保持和Sentinel节点集合的联系,时刻获取关于主节点的相关信息
#python连接redis sentinel示例
from redis.sentinel import Sentinel
sentinel_list = [
 ("192.168.9.78", "26410"),
 ("192.168.9.78", "26411"),
 ("192.168.9.78", "26412")
 ]
mySentinel = Sentinel(sentinel_list)
master = mySentinel.master_for("mymaster", db=0)
slave = mySentinel.slave_for("mymaster", db=0)
# 使用master进行写的操作,使用slave进行读的操作
master.hset("key_name", "filed", "value")
slave.hget("key_name", "filed")
slave.hgetall("key_name")

四、哨兵实现原理

三个定时任务

主观下线/客观下线

Sentinel领导者选举

故障转移

4.1 三个定时任务

1、10s一次,发送info获取redis拓扑

2、2s一次,每个哨兵向redis节点的 __sentinel__:hello 频道发布自身信息及对主节点的判断,并订阅该频道,以发现新加入的哨兵、保存其信息并交换消息

3、1s一次,哨兵向所有node(哨兵和redis)发送ping,确认心跳

Sentinel节点定时执行info命令

image

Sentinel节点发布和订阅 __sentinel__:hello 频道

image

Sentinel节点向其余节点发送ping命令

image

4.2 主观下线和客观下线

1、主观下线

每隔1s对主/从节点/sentinel检测,超过down时间,哨兵判定主观下线

image

2、客观下线

主观下线发现是master,则跟其他哨兵进行投票。

投票通过(同意下线的哨兵数达到quorum),则做出客观下线判定

选择一个哨兵(Raft协议选择负责failover的哨兵),进行failover

image

Sentinel节点对主节点做客观下线

领导者Sentinel节点选举

image

节点 | 发出的同意 | 接受的同意
s1 | - | s2,s3
s2 | s1 | -
s3 | s1 | -

4.3 故障转移

1、选出主节点

2、哨兵leader对新主 slaveof no one

3、其余从节点-->新主

4、整理拓扑关系

image

五、主从切换测试

5.1 手动进行故障转移

查看redis主从拓扑

[root@OPS-9-78 redis]# redis-cli -p 26410 info Sentinel
master0:name=mymaster,status=ok,address=127.0.0.1:6410,slaves=1,sentinels=3

关闭6410

redis-cli -p 6410 shutdown
-- sentinel的日志信息如下
108297:X 06 Jun 2023 22:25:57.376 # +sdown master mymaster 127.0.0.1 6410
108297:X 06 Jun 2023 22:25:57.447 # +new-epoch 1
108297:X 06 Jun 2023 22:25:57.448 # +vote-for-leader 0906d5f48c088da18eca19a6ef3ba893813d9246 1
108297:X 06 Jun 2023 22:25:57.448 # +odown master mymaster 127.0.0.1 6410 #quorum 2/2
108297:X 06 Jun 2023 22:25:57.448 # Next failover delay: I will not start a failover before Tue Jun 6 22:31:58 2023
108297:X 06 Jun 2023 22:25:58.687 # +config-update-from sentinel 0906d5f48c088da18eca19a6ef3ba893813d9246 127.0.0.1 26411 @ mymaster 127.0.0.1 6410
108297:X 06 Jun 2023 22:25:58.687 # +switch-master mymaster 127.0.0.1 6410 127.0.0.1 6411
108297:X 06 Jun 2023 22:25:58.687 * +slave slave 127.0.0.1:6410 127.0.0.1 6410 @ mymaster 127.0.0.1 6411

故障转移的步骤

1)主库宕机

2)哨兵从从库中选择一个作为新主库,规则依次按照

  • 优先级高:slave-priority 100,值越小优先级越高

  • 偏移量最大:原主机数据最全的

  • runid最小:每个redis实例启动后都会随机生成一个40位的runid

3)哨兵向从服务器发送slaveof 新主,复制新master

4)原主服务器恢复后重新加入,自动变为新主库的从库

验证

[root@OPS-9-78 redis]# redis-cli -p 26410 info Sentinel
master0:name=mymaster,status=ok,address=127.0.0.1:6411,slaves=1,sentinels=3

启动6410节点

/data/redis/6410/bin/redis-server /data/redis/6410/redis_6410.conf

验证6410节点的状态--已自动变为6411的从节点

[root@OPS-9-78 sentinel_26410]# redis-cli -p 6410 info replication
# Replication
role:slave
master_host:127.0.0.1
master_port:6411

5.2 强制主从切换

查看redis主从拓扑(主:6411,从:6410)

[root@OPS-9-78 sentinel_26410]# redis-cli -p 26410 info Sentinel
master0:name=mymaster,status=ok,address=127.0.0.1:6411,slaves=1,sentinels=3

强制主从切换

redis-cli -p 26410 sentinel failover mymaster #对mymaster的主节点进行强制故障转移
-- sentinel日志信息如下:
108297:X 06 Jun 2023 22:42:39.425 # Executing user requested FAILOVER of 'mymaster'
108297:X 06 Jun 2023 22:42:39.425 # +new-epoch 2
108297:X 06 Jun 2023 22:42:39.425 # +try-failover master mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:39.450 # +vote-for-leader 290662692d6ee74cdbdd8c12041e98fac2b35512 2
108297:X 06 Jun 2023 22:42:39.450 # +elected-leader master mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:39.450 # +failover-state-select-slave master mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:39.513 # +selected-slave slave 127.0.0.1:6410 127.0.0.1 6410 @ mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:39.513 * +failover-state-send-slaveof-noone slave 127.0.0.1:6410 127.0.0.1 6410 @ mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:39.575 * +failover-state-wait-promotion slave 127.0.0.1:6410 127.0.0.1 6410 @ mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:40.485 # +promoted-slave slave 127.0.0.1:6410 127.0.0.1 6410 @ mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:40.485 # +failover-state-reconf-slaves master mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:40.556 # +failover-end master mymaster 127.0.0.1 6411
108297:X 06 Jun 2023 22:42:40.556 # +switch-master mymaster 127.0.0.1 6411 127.0.0.1 6410
108297:X 06 Jun 2023 22:42:40.556 * +slave slave 127.0.0.1:6411 127.0.0.1 6411 @ mymaster 127.0.0.1 6410

结果验证(主:6410,从:6411)

[root@OPS-9-78 sentinel_26410]# redis-cli -p 26410 info Sentinel
master0:name=mymaster,status=ok,address=127.0.0.1:6410,slaves=1,sentinels=3