Hi all!
I have 3 redis nodes (v.4.11, sentinel, 1 master + 2 slaves) and faced a strange problem couple of weeks ago: master suddenly switches to another host because both slaves connections are lost.
I suggested, that the reason is in client-output-buffer-limit for slaves connections, increased it, but nothing changed. I don't think that it's network related issue because other services on this VMs are stable.
Could you please give an advice, what could be the reason of the problem?
Logs, configs and info output are below
Log from master:
40190:M 12 Dec 14:10:23.083 # Connection with slave client id #131352 lost.
40190:M 12 Dec 14:10:29.927 # Connection with slave client id #163853 lost.
40190:S 12 Dec 14:10:34.012 * Before turning into a slave, using my master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer.
40190:S 12 Dec 14:10:34.012 * SLAVE OF 10.10.10.2:6381 enabled (user request from 'id=128421220 addr=10.10.10.1:24446 fd=558 name=sentinel-d8e35ae9-cmd age=10 idle=0 flags=x db=0 sub=0 psub=0 multi= 3 qbuf=0 qbuf-free=32768 obl=36 oll=0 omem=0 events=r cmd=exec')
40190:S 12 Dec 14:10:34.045 # CONFIG REWRITE executed with success.
40190:S 12 Dec 14:10:34.882 * MASTER <-> SLAVE sync started
40190:S 12 Dec 14:10:34.883 * Non blocking connect for SYNC fired the event.
40190:S 12 Dec 14:10:34.884 * Master replied to PING, replication can continue...
40190:S 12 Dec 14:10:34.885 * Trying a partial resynchronization (request 18c067489c9d03637f584dd61d9d609c5e61a309:2298722417497).
40190:S 12 Dec 14:10:34.886 * Full resync from master: 9055605628b944555920df49d918c4335b5f2d0b:2298709630285
40190:S 12 Dec 14:10:34.886 * Discarding previously cached master state.
14154:C 12 Dec 14:10:36.689 * DB saved on disk
14154:C 12 Dec 14:10:36.748 * RDB: 154 MB of memory used by copy-on-write
40190:S 12 Dec 14:10:36.885 * Background saving terminated with success
40190:S 12 Dec 14:10:39.703 * MASTER <-> SLAVE sync: receiving 943754945 bytes from master
40190:S 12 Dec 14:10:44.794 * MASTER <-> SLAVE sync: Flushing old data
40190:S 12 Dec 14:12:25.056 * MASTER <-> SLAVE sync: Loading DB in memory
40190:S 12 Dec 14:12:59.022 * MASTER <-> SLAVE sync: Finished with success
40190:S 12 Dec 14:12:59.022 * 10000 changes in 60 seconds. Saving...
Log from one of the slaves:
10735:M 12 Dec 14:10:23.080 # Setting secondary replication ID to 18c067489c9d03637f584dd61d9d609c5e61a309, valid up to offset: 2298709619422. New replication ID is 9055605628b944555920df49d918c4335b5f2d0b
10735:M 12 Dec 14:10:23.080 # Connection with master lost.
10735:M 12 Dec 14:10:23.080 * Caching the disconnected master state.
10735:M 12 Dec 14:10:23.080 * Discarding previously cached master state.
10735:M 12 Dec 14:10:23.080 * MASTER MODE enabled (user request from 'id=131363 addr=10.10.10.2:42426 fd=19 name= age=781175 idle=0 flags=x db=0 sub=0 psub=0 multi=3 qbuf=0 qbuf-free=32768 obl=36 oll=0 omem=0 events=r cmd=exec') 10735:M 12 Dec 14:10:23.097 # CONFIG REWRITE executed with success.
10735:M 12 Dec 14:10:24.326 * Slave 10.10.10.1:6380 asks for synchronization 10735:M 12 Dec 14:10:24.326 * Partial resynchronization not accepted: Requested offset for second ID was 2298710003403, but I can reply up to 2298709619422
10735:M 12 Dec 14:10:24.326 * Starting BGSAVE for SYNC with target: disk
10735:M 12 Dec 14:10:24.413 * Background saving started by pid 45786
10735:M 12 Dec 14:10:34.885 * Slave 10.10.10.3:6382 asks for synchronization 10735:M 12 Dec 14:10:34.885 * Partial resynchronization not accepted: Requested offset for second ID was 2298722417497, but I can reply up to 2298709619422
10735:M 12 Dec 14:10:34.885 * Waiting for end of BGSAVE for SYNC
45786:C 12 Dec 14:10:39.470 * DB saved on disk
redis.conf
bind 10.10.10.1
protected-mode yes
port 6382
tcp-backlog 511
timeout 0
tcp-keepalive 300
daemonize no
supervised systemd
pidfile "/var/run/redis/redis.pid"
loglevel notice
logfile "/var/log/redis/redis.log"
databases 16
save 900 1
save 300 10
save 60 10000
stop-writes-on-bgsave-error yes
rdbcompression yes
rdbchecksum yes
dbfilename "dump.rdb"
dir "/var/lib/redis"
slave-serve-stale-data yes
slave-read-only yes
repl-diskless-sync no
repl-diskless-sync-delay 5
repl-disable-tcp-nodelay no
slave-priority 100
appendonly no
appendfilename "appendonly.aof"
appendfsync everysec
no-appendfsync-on-rewrite no
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb
aof-load-truncated yes
lua-time-limit 5000
slowlog-log-slower-than 10000
slowlog-max-len 128
latency-monitor-threshold 0
notify-keyspace-events ""
hash-max-ziplist-entries 512
hash-max-ziplist-value 64
list-max-ziplist-size -2
list-compress-depth 0
set-max-intset-entries 512
zset-max-ziplist-entries 128
zset-max-ziplist-value 64
hll-sparse-max-bytes 3000
activerehashing yes
client-output-buffer-limit normal 0 0 0
client-output-buffer-limit slave 1000000kb 1000000kb 0
client-output-buffer-limit pubsub 32mb 8mb 60
hz 10
aof-rewrite-incremental-fsync yes
redis-sentinel.conf
bind 10.10.10.3
port 16382
sentinel myid 58ea906be61fb05e6fa14f95b504b47b51a754bb
sentinel deny-scripts-reconfig yes
sentinel monitor im-redis-cluster 10.10.10.3 6382 2
sentinel down-after-milliseconds im-redis-cluster 5000
dir "/"
maxclients 4064
sentinel failover-timeout im-redis-cluster 10000
sentinel config-epoch im-redis-cluster 68
sentinel leader-epoch im-redis-cluster 68
sentinel known-slave im-redis-cluster 10.10.10.2 6381
sentinel known-slave im-redis-cluster 10.10.10.1 6380
sentinel known-sentinel im-redis-cluster 10.10.10.1 16380 d8e35ae957fcd2bd110082c63fb53c1508d130b1
sentinel known-sentinel im-redis-cluster 10.10.10.2 16381 7da4777423c7c27fe1d053a737e1cdb29d44c064
sentinel current-epoch 68
INFO
# Server
redis_version:4.0.11
redis_git_sha1:00000000
redis_git_dirty:0
redis_build_id:bf99aef9569f9a49
redis_mode:standalone
os:Linux 3.10.0-514.16.1.el7.x86_64 x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:7.3.1
process_id:40190
run_id:1738cb50f6a22e675ef52a5179b16a7b962b97af
tcp_port:6382
uptime_in_seconds:1922929
uptime_in_days:22
hz:10
lru_clock:1725481
executable:/usr/bin/redis-server
config_file:/etc/redis/redis.conf
# Clients
connected_clients:709
client_longest_output_list:5
client_biggest_input_buf:147
blocked_clients:21
# Memory
used_memory:3030200720
used_memory_human:2.82G
used_memory_rss:2448142336
used_memory_rss_human:2.28G
used_memory_peak_human:4.18G
used_memory_peak_perc:67.56%
used_memory_overhead:146898953
used_memory_startup:786576
used_memory_dataset:2883301767
used_memory_dataset_perc:95.18%
total_system_memory:33560887296
total_system_memory_human:31.26G
used_memory_lua:130048
used_memory_lua_human:127.00K
maxmemory:0
maxmemory_human:0B
maxmemory_policy:noeviction
mem_fragmentation_ratio:0.81
mem_allocator:jemalloc-4.0.3
active_defrag_running:0
lazyfree_pending_objects:0
# Persistence
loading:0
rdb_changes_since_last_save:349101
rdb_bgsave_in_progress:0
rdb_last_save_time:1545229295
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:23
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:159281152
aof_enabled:0
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:-1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:0
# Stats
total_connections_received:131904200
total_commands_processed:11228834066
instantaneous_ops_per_sec:16453
total_net_input_bytes:3478041956927
total_net_output_bytes:5876168155463
instantaneous_input_kbps:7567.01
instantaneous_output_kbps:36588.43
rejected_connections:0
sync_full:7
sync_partial_ok:0
sync_partial_err:7
expired_keys:38492426
expired_stale_perc:8.57
expired_time_cap_reached_count:0
evicted_keys:0
keyspace_hits:5423342169
keyspace_misses:35443446
pubsub_channels:40117
pubsub_patterns:62
latest_fork_usec:97519
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
# Replication
role:master
connected_slaves:2
slave0:ip=10.10.10.2,port=6381,state=online,offset=3439821078815,lag=0
slave1:ip=10.10.10.3,port=6380,state=online,offset=3439820626219,lag=0
master_replid:8be873e62c983d5d482c0bdf1e5d1afc312a2e75
master_replid2:9055605628b944555920df49d918c4335b5f2d0b
master_repl_offset:3439824975811
second_repl_offset:3393464791547
repl_backlog_active:1
repl_backlog_size:1048576
repl_backlog_first_byte_offset:3439823927236
repl_backlog_histlen:1048576
# CPU
used_cpu_sys:44282.73
used_cpu_user:95441.03
used_cpu_sys_children:37332.18
used_cpu_user_children:292517.78
# Cluster
cluster_enabled:0
# Keyspace
db0:keys=1002479,expires=261897,avg_ttl=91790875
db1:keys=749050,expires=748989,avg_ttl=3444468
db3:keys=12,expires=12,avg_ttl=643424402
db4:keys=2230,expires=2230,avg_ttl=77709518
db5:keys=2,expires=1,avg_ttl=259246