-
Notifications
You must be signed in to change notification settings - Fork 612
Closed
Labels
Description
Describe the bug
node from multi-master eat memory:
root@kdb11-prod-xprj:~# tail -f /var/log/keydb/keydb-server.log
3846534:3846558:S 25 Jan 2023 16:46:54.913 * Background AOF rewrite terminated with success
3846534:3846558:S 25 Jan 2023 16:46:54.913 * Residual parent diff successfully flushed to the rewritten AOF (0.00 MB)
3846534:3846558:S 25 Jan 2023 16:46:54.913 * Background AOF rewrite finished successfully
3846534:3846558:S 25 Jan 2023 16:46:55.821 * Connecting to MASTER kdb12-prod-xprj:8379
3846534:3846558:S 25 Jan 2023 16:46:55.823 * MASTER <-> REPLICA sync started
3846534:3846558:S 25 Jan 2023 16:46:55.823 * Non blocking connect for SYNC fired the event.
3846534:3846558:S 25 Jan 2023 16:46:55.824 * Master replied to PING, replication can continue...
3846534:3846558:S 25 Jan 2023 16:46:55.824 * Trying a partial resynchronization (request 930eaaec2a0e61d5f610acd0c9e15972f7f8498e:1).
3846534:3846558:S 25 Jan 2023 16:46:55.824 * Successful partial resynchronization with master.
3846534:3846558:S 25 Jan 2023 16:46:55.824 * MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization.
3846534:3846558:S 25 Jan 2023 16:48:45.933 * REPLICAOF rdb05-prod-xprj:6379 enabled (user request from 'id=112 addr=127.0.0.1:52604 laddr=127.0.0.1:8379 fd=93 name= age=66 idle=0 flags=N db=0 sub=0 psub=0 multi=-1 qbuf=0 qbuf-free=40954 argv-mem=37 obl=0 oll=0 omem=0 tot-mem=61501 events=r cmd=replicaof user=default redir=-1')
3846534:3846558:S 25 Jan 2023 16:48:46.736 * Connecting to MASTER rdb05-prod-xprj:6379
3846534:3846558:S 25 Jan 2023 16:48:46.739 * MASTER <-> REPLICA sync started
3846534:3846558:S 25 Jan 2023 16:48:46.739 * Non blocking connect for SYNC fired the event.
3846534:3846558:S 25 Jan 2023 16:48:46.740 * Master does not support REPLPING, sending PING instead...
3846534:3846558:S 25 Jan 2023 16:48:46.740 * Non blocking connect for SYNC fired the event.
3846534:3846558:S 25 Jan 2023 16:48:46.740 * Master replied to PING, replication can continue...
3846534:3846558:S 25 Jan 2023 16:48:46.740 * Partial resynchronization not possible (no cached master)
3846534:3846558:S 25 Jan 2023 16:48:47.567 * Full resync from master: 4a70620209e6bf172c84a305d3eea1eb012ee7fc:4155977550183
3846534:3846558:S 25 Jan 2023 16:49:52.001 * MASTER <-> REPLICA sync: receiving 2745236467 bytes from master to disk
3846534:3846558:S 25 Jan 2023 16:50:32.660 * MASTER <-> REPLICA sync: Loading DB in memory
3846534:3846558:S 25 Jan 2023 16:50:35.391 * Loading RDB produced by version 6.0.18
3846534:3846558:S 25 Jan 2023 16:50:35.391 * RDB age 108 seconds
3846534:3846558:S 25 Jan 2023 16:50:35.392 * RDB memory usage when created 6100.30 Mb
3846534:3846558:S 25 Jan 2023 16:50:35.412 # Replication backlog is too small, resizing to: 2097152 bytes
3846534:3846558:S 25 Jan 2023 16:50:35.431 # Replication backlog is too small, resizing to: 4194304 bytes
3846534:3846558:S 25 Jan 2023 16:50:35.640 # Replication backlog is too small, resizing to: 8388608 bytes
3846534:3846558:S 25 Jan 2023 16:50:35.760 # Replication backlog is too small, resizing to: 16777216 bytes
3846534:3846558:S 25 Jan 2023 16:50:35.997 # Replication backlog is too small, resizing to: 33554432 bytes
3846534:3846558:S 25 Jan 2023 16:50:36.351 # Replication backlog is too small, resizing to: 67108864 bytes
3846534:3846558:S 25 Jan 2023 16:50:36.976 # Replication backlog is too small, resizing to: 134217728 bytes
3846534:3846558:S 25 Jan 2023 16:50:38.127 # Replication backlog is too small, resizing to: 268435456 bytes
3846534:3846558:S 25 Jan 2023 16:50:42.034 # Replication backlog is too small, resizing to: 536870912 bytes
3846534:3846558:S 25 Jan 2023 16:50:46.993 # Replication backlog is too small, resizing to: 1073741824 bytes
3846534:3846558:S 25 Jan 2023 16:50:56.598 # Replication backlog is too small, resizing to: 2147483648 bytes
3846534:3846558:S 25 Jan 2023 16:51:16.214 # Replication backlog is too small, resizing to: 4294967296 bytes
3846534:3846558:S 25 Jan 2023 16:51:56.143 # Replication backlog is too small, resizing to: 8589934592 bytes
3846534:3846558:S 25 Jan 2023 16:52:14.264 # Done loading RDB, keys loaded: 0, keys expired: 0.
3846534:3846558:S 25 Jan 2023 16:52:14.264 * RDB contains information on 0 masters
3846534:3846558:S 25 Jan 2023 16:52:14.387 * MASTER <-> REPLICA sync: Finished with success
3846534:3846558:S 25 Jan 2023 16:52:14.659 * Background append only file rewriting started by pid 3847165
3846534:3846558:S 25 Jan 2023 16:53:22.648 * AOF rewrite child asks to stop sending diffs.
3847165:3846558:C 25 Jan 2023 16:53:22.649 * Parent agreed to stop sending diffs. Finalizing AOF...
3847165:3846558:C 25 Jan 2023 16:53:22.649 * Concatenating 1322.85 MB of AOF diff received from parent.
3846534:3846558:S 25 Jan 2023 16:53:26.656 * Background AOF buffer size: 80 MB
3847165:3846558:C 25 Jan 2023 16:53:28.859 * SYNC append only file rewrite performed
3847165:3846558:C 25 Jan 2023 16:53:29.114 * AOF rewrite: 3738 MB of memory used by copy-on-write
3846534:3846558:S 25 Jan 2023 16:53:29.747 * Background AOF rewrite terminated with success
3846534:3846558:S 25 Jan 2023 16:53:29.800 * Residual parent diff successfully flushed to the rewritten AOF (131.47 MB)
3846534:3846558:S 25 Jan 2023 16:53:29.800 * Background AOF rewrite finished successfully
3846534:3846558:S 25 Jan 2023 16:53:29.917 * 50000 changes in 300 seconds. Saving...
3846534:3846558:S 25 Jan 2023 16:53:29.919 * Background saving started
3846534:3847327:S 25 Jan 2023 16:54:29.884 * DB saved on disk
3846534:3847327:S 25 Jan 2023 16:54:30.119 * RDB: 14888 MB of memory used by copy-on-write
3846534:3847327:S 25 Jan 2023 16:54:30.501 # tombstones removed: 998280, remain: 0
3846534:3846558:S 25 Jan 2023 16:54:30.511 * Background saving terminated with success
3846534:3846558:S 25 Jan 2023 16:56:16.277 * 10000000 changes in 60 seconds. Saving...
3846534:3846558:S 25 Jan 2023 16:56:16.279 * Background saving started
3846534:3846558:S 25 Jan 2023 16:57:07.762 # Replication backlog is too small, resizing to: 17179869184 bytes
3846534:3847625:S 25 Jan 2023 16:57:15.430 * DB saved on disk
3846534:3847625:S 25 Jan 2023 16:57:15.590 * RDB: 14960 MB of memory used by copy-on-write
3846534:3847625:S 25 Jan 2023 16:57:15.836 # tombstones removed: 934748, remain: 0
3846534:3846558:S 25 Jan 2023 16:57:15.875 * Background saving terminated with success
3846534:3846558:S 25 Jan 2023 16:57:15.977 * Starting automatic rewriting of AOF on 101% growth
3846534:3846558:S 25 Jan 2023 16:57:17.615 * Background append only file rewriting started by pid 3847739
3846534:3846558:S 25 Jan 2023 16:57:21.711 * Background AOF buffer size: 80 MB
3846534:3846558:S 25 Jan 2023 16:57:26.485 * Background AOF buffer size: 180 MB
3846534:3846558:S 25 Jan 2023 16:57:31.239 * Background AOF buffer size: 280 MB
3846534:3846558:S 25 Jan 2023 16:57:35.807 * Background AOF buffer size: 380 MB
3846534:3846558:S 25 Jan 2023 16:57:40.417 * Background AOF buffer size: 480 MB
3846534:3846558:S 25 Jan 2023 16:57:45.046 * Background AOF buffer size: 580 MB
3846534:3846558:S 25 Jan 2023 16:57:49.479 * Background AOF buffer size: 680 MB
3846534:3846558:S 25 Jan 2023 16:57:57.666 * Background AOF buffer size: 780 MB
3846534:3846558:S 25 Jan 2023 16:58:02.084 * Background AOF buffer size: 880 MB
3846534:3846558:S 25 Jan 2023 16:58:06.601 # Background AOF buffer size: 980 MB
3846534:3846558:S 25 Jan 2023 16:58:10.977 * Background AOF buffer size: 1080 MB
...
3846534:3846558:S 25 Jan 2023 17:23:27.996 * Background AOF buffer size: 39080 MB
3846534:3846558:S 25 Jan 2023 17:23:30.470 # Client id=22 addr=10.102.176.69:56502 laddr=10.103.100.128:8379 fd=86 name= age=2198 idle=0 flags=S db=0 sub=0 psub=0 multi=-1 qbuf=0 qbuf-free=40954 argv-mem=0 obl=0 oll=0 omem=17179868936 tot-mem=17179930400 events=rw cmd=replconf user=default redir=-1 scheduled to be closed ASAP due to exceeding output buffer hard limit.
3846534:3846558:S 25 Jan 2023 17:23:30.470 # Connection with replica 10.102.176.69:8379 lost.
3846534:3846558:S 25 Jan 2023 17:23:30.922 * Reclaiming 17178820608 replication backlog bytes
3846534:3846558:S 25 Jan 2023 17:23:32.681 * Background AOF buffer size: 39180 MB
...
3846534:3846558:S 25 Jan 2023 17:25:59.782 * Background AOF buffer size: 43080 MB
3846534:3846558:S 25 Jan 2023 17:26:03.350 * Background AOF buffer size: 43180 MB
3846534:3846558:S 25 Jan 2023 17:26:06.919 * Background AOF buffer size: 43280 MB
...
3846534:3846558:S 25 Jan 2023 17:31:44.902 * Background AOF buffer size: 55680 MB
3846534:3846558:S 25 Jan 2023 17:31:46.567 * Replica 10.101.231.210:8379 asks for synchronization
3846534:3846558:S 25 Jan 2023 17:31:46.567 * Unable to partial resync with replica 10.101.231.210:8379 for lack of backlog (Replica request was: 43953388965).
3846534:3846558:S 25 Jan 2023 17:31:46.567 * No BGSAVE in progress, but another BG operation is active. BGSAVE for replication delayed
3846534:3846558:S 25 Jan 2023 17:31:49.057 * Background AOF buffer size: 55780 MB
...
3846534:3846558:S 25 Jan 2023 17:33:29.547 * Background AOF buffer size: 58180 MB
3846534:3846558:S 25 Jan 2023 17:33:32.509 * Replica 10.102.176.69:8379 asks for synchronization
3846534:3846558:S 25 Jan 2023 17:33:32.509 * Unable to partial resync with replica 10.102.176.69:8379 for lack of backlog (Replica request was: 47508970853).
3846534:3846558:S 25 Jan 2023 17:33:32.509 * No BGSAVE in progress, but another BG operation is active. BGSAVE for replication delayed
3846534:3846558:S 25 Jan 2023 17:33:33.467 * Background AOF buffer size: 58280 MB
...
3846534:3846558:S 25 Jan 2023 17:39:28.604 * Background AOF buffer size: 66780 MB
3846534:3846558:S 25 Jan 2023 17:39:32.707 * Background AOF buffer size: 66880 MB
3846534:3846558:S 25 Jan 2023 17:39:36.911 # Background AOF buffer size: 66980 MB
3846534:3846558:S 25 Jan 2023 17:39:40.935 * Background AOF buffer size: 67080 MB
To reproduce
- create 5 empty nodes in multi-master configuration
- add one node as slave to standalone keydb:
root@kdb11-prod-xprj:~# keydb-cli -p 8379
Message of the day:
KeyDB has now joined Snap! See the announcement at: https://docs.keydb.dev/news
127.0.0.1:8379> REPLICAOF rdb05-prod-xprj 6379
- look to logs
Expected behavior
Sync in normal progress and memory is not consumed excessively
Additional information
root@kdb11-prod-xprj:~# egrep -v '^#|^$' /etc/keydb/keydb.conf
bind * -::*
protected-mode yes
port 8379
tcp-backlog 511
timeout 0
tcp-keepalive 300
daemonize yes
pidfile /var/run/keydb/keydb-server.pid
loglevel notice
logfile /var/log/keydb/keydb-server.log
databases 128
always-show-logo yes
set-proc-title yes
proc-title-template "{title} {listen-addr} {server-mode}"
save 900 10000
save 300 50000
save 60 10000000
stop-writes-on-bgsave-error yes
rdbcompression yes
rdbchecksum yes
dbfilename dump.rdb
rdb-del-sync-files no
dir /var/lib/keydb
replica-serve-stale-data yes
replica-read-only yes
repl-diskless-sync no
repl-diskless-sync-delay 5
repl-diskless-load disabled
repl-timeout 600
repl-disable-tcp-nodelay no
replica-priority 100
acllog-max-len 128
lazyfree-lazy-eviction no
lazyfree-lazy-expire no
lazyfree-lazy-server-del no
replica-lazy-flush no
lazyfree-lazy-user-del no
lazyfree-lazy-user-flush no
oom-score-adj no
oom-score-adj-values 0 200 800
disable-thp yes
appendonly yes
appendfilename "appendonly.aof"
appendfsync everysec
no-appendfsync-on-rewrite no
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb
aof-load-truncated yes
aof-use-rdb-preamble yes
lua-time-limit 5000
slowlog-log-slower-than 10000
slowlog-max-len 128
latency-monitor-threshold 0
notify-keyspace-events ""
hash-max-ziplist-entries 512
hash-max-ziplist-value 64
list-max-ziplist-size -2
list-compress-depth 0
set-max-intset-entries 512
zset-max-ziplist-entries 128
zset-max-ziplist-value 64
hll-sparse-max-bytes 3000
stream-node-max-bytes 4096
stream-node-max-entries 100
activerehashing yes
client-output-buffer-limit normal 0 0 0
client-output-buffer-limit replica 16384mb 16384mb 60
client-output-buffer-limit pubsub 32mb 8mb 60
hz 10
dynamic-hz yes
aof-rewrite-incremental-fsync yes
rdb-save-incremental-fsync yes
jemalloc-bg-thread yes
server-threads 14
active-replica yes
multi-master yes
replicaof kdb12-prod-xprj 8379
replicaof kdb13-prod-xprj 8379
replicaof kdb14-prod-xprj 8379
replicaof kdb15-prod-xprj 8379
root@rdb05-prod-xprj:~# egrep -v '^#|^$' /etc/keydb/keydb.conf
bind 0.0.0.0
protected-mode yes
port 6379
tcp-backlog 10000
timeout 0
tcp-keepalive 300
daemonize yes
supervised no
pidfile /var/run/keydb/keydb-server.pid
loglevel notice
logfile /var/log/keydb/keydb-server.log
databases 16
always-show-logo yes
save 900 2000
save 300 20000
save 60 5000000
stop-writes-on-bgsave-error yes
rdbcompression yes
rdbchecksum yes
dbfilename dump.rdb
dir /var/lib/keydb
replica-serve-stale-data yes
replica-read-only yes
repl-diskless-sync no
repl-diskless-sync-delay 5
repl-disable-tcp-nodelay no
replica-priority 100
maxclients 80000
lazyfree-lazy-eviction no
lazyfree-lazy-expire no
lazyfree-lazy-server-del no
replica-lazy-flush no
appendonly no
appendfilename "appendonly.aof"
appendfsync everysec
no-appendfsync-on-rewrite no
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb
aof-load-truncated yes
aof-use-rdb-preamble yes
lua-time-limit 5000
slowlog-log-slower-than 10000
slowlog-max-len 128
latency-monitor-threshold 0
notify-keyspace-events ""
hash-max-ziplist-entries 512
hash-max-ziplist-value 64
list-max-ziplist-size -2
list-compress-depth 0
set-max-intset-entries 512
zset-max-ziplist-entries 128
zset-max-ziplist-value 64
hll-sparse-max-bytes 3000
stream-node-max-bytes 4096
stream-node-max-entries 100
activerehashing yes
client-output-buffer-limit normal 0 0 0
client-output-buffer-limit replica 256mb 64mb 60
client-output-buffer-limit pubsub 32mb 8mb 60
hz 10
dynamic-hz yes
aof-rewrite-incremental-fsync yes
rdb-save-incremental-fsync yes
server-threads 14
root@rdb05-prod-xprj:~# keydb-cli info
# Server
redis_version:6.0.18
redis_git_sha1:00000000
redis_git_dirty:0
redis_build_id:2fb5bbc6b3ce6860
redis_mode:standalone
os:Linux 5.15.35-1-pve x86_64
arch_bits:64
multiplexing_api:epoll
atomicvar_api:atomic-builtin
gcc_version:6.3.0
process_id:156
run_id:fc9986e056efb40e4680c6b2d5a763be880e9b35
tcp_port:6379
uptime_in_seconds:21173154
uptime_in_days:245
hz:40
configured_hz:10
lru_clock:13723784
executable:/usr/bin/keydb-server
config_file:/etc/keydb/keydb.conf
# Clients
connected_clients:4945
client_recent_max_input_buffer:768
client_recent_max_output_buffer:23720
blocked_clients:0
tracking_clients:0
clients_in_timeout_table:0
current_client_thread:0
thread_0_clients:354
thread_1_clients:354
thread_2_clients:353
thread_3_clients:354
thread_4_clients:353
thread_5_clients:353
thread_6_clients:353
thread_7_clients:353
thread_8_clients:354
thread_9_clients:353
thread_10_clients:353
thread_11_clients:353
thread_12_clients:353
thread_13_clients:353
# Memory
used_memory:6394176632
used_memory_human:5.96G
used_memory_rss:8727318528
used_memory_rss_human:8.13G
used_memory_peak:60115190368
used_memory_peak_human:55.99G
used_memory_peak_perc:10.64%
used_memory_overhead:1116899920
used_memory_startup:61160424
used_memory_dataset:5277276712
used_memory_dataset_perc:83.33%
allocator_allocated:6395309624
allocator_active:7677943808
allocator_resident:8834015232
total_system_memory:2151648985088
total_system_memory_human:1.96T
used_memory_lua:37888
used_memory_lua_human:37.00K
used_memory_scripts:0
used_memory_scripts_human:0B
number_of_cached_scripts:0
maxmemory:0
maxmemory_human:0B
maxmemory_policy:noeviction
allocator_frag_ratio:1.20
allocator_frag_bytes:1282634184
allocator_rss_ratio:1.15
allocator_rss_bytes:1156071424
rss_overhead_ratio:0.99
rss_overhead_bytes:-106696704
mem_fragmentation_ratio:1.36
mem_fragmentation_bytes:2332877888
mem_not_counted_for_evict:0
mem_replication_backlog:14680064
mem_clients_slaves:20512
mem_clients_normal:101391752
mem_aof_buffer:0
mem_allocator:jemalloc-5.1.0
active_defrag_running:0
lazyfree_pending_objects:0
# Persistence
loading:0
rdb_changes_since_last_save:4711772
rdb_bgsave_in_progress:0
rdb_last_save_time:1674668148
rdb_last_bgsave_status:ok
rdb_last_bgsave_time_sec:65
rdb_current_bgsave_time_sec:-1
rdb_last_cow_size:2592280576
aof_enabled:0
aof_rewrite_in_progress:0
aof_rewrite_scheduled:0
aof_last_rewrite_time_sec:-1
aof_current_rewrite_time_sec:-1
aof_last_bgrewrite_status:ok
aof_last_write_status:ok
aof_last_cow_size:0
module_fork_in_progress:0
module_fork_last_cow_size:0
# Stats
total_connections_received:126662278
total_commands_processed:2340631566442
instantaneous_ops_per_sec:140397
total_net_input_bytes:505285039502195
total_net_output_bytes:584367299776002
instantaneous_input_kbps:28391.68
instantaneous_output_kbps:58164.20
rejected_connections:0
sync_full:122
sync_partial_ok:0
sync_partial_err:13
expired_keys:70106127
expired_stale_perc:0.00
expired_time_cap_reached_count:0
expire_cycle_cpu_milliseconds:0
evicted_keys:0
keyspace_hits:1274639566828
keyspace_misses:57745615848
pubsub_channels:0
pubsub_patterns:0
latest_fork_usec:913215
migrate_cached_sockets:0
slave_expires_tracked_keys:0
active_defrag_hits:0
active_defrag_misses:0
active_defrag_key_hits:0
active_defrag_key_misses:0
tracking_total_keys:0
tracking_total_items:0
tracking_total_prefixes:0
unexpected_error_replies:0
total_reads_processed:2282427921072
total_writes_processed:2283187334179
# Replication
role:master
connected_slaves:1
slave0:ip=10.103.100.128,port=8379,state=online,offset=4227425854864,lag=0
master_replid:4a70620209e6bf172c84a305d3eea1eb012ee7fc
master_replid2:0000000000000000000000000000000000000000
master_repl_offset:4227427718100
second_repl_offset:-1
repl_backlog_active:1
repl_backlog_size:13082698
repl_backlog_first_byte_offset:4227414635403
repl_backlog_histlen:13082698
# CPU
used_cpu_sys:45224135.042087
used_cpu_user:67970384.790068
used_cpu_sys_children:1880048.877208
used_cpu_user_children:6937351.817358
server_threads:14
long_lock_waits:29608570
# Modules
# Cluster
cluster_enabled:0
# Keyspace
db0:keys=16780191,expires=57,avg_ttl=2649
20:18 $ for h in {11..15}; do ssh root@kdb$h-prod-xprj "hostname; keydb-cli info keyspace"; done
kdb11-prod-xprj
# Keyspace
db0:keys=16792729,expires=15206649,avg_ttl=9223358066324521984,cached_keys=16792729
kdb12-prod-xprj
# Keyspace
db0:keys=16793036,expires=15206764,avg_ttl=9223358066314103808,cached_keys=16793036
kdb13-prod-xprj
# Keyspace
db0:keys=16793513,expires=15206903,avg_ttl=9223358066318473216,cached_keys=16793513
kdb14-prod-xprj
# Keyspace
db0:keys=16782045,expires=102,avg_ttl=7689,cached_keys=16782045
kdb15-prod-xprj
# Keyspace
db0:keys=16792729,expires=15206649,avg_ttl=9223358066324520960,cached_keys=16792729