Hi,
Following the conversation regarding GC on large queues, I'm testing a switch from quorum queues to stream queues.
On a fresh 3-node cluster I'm trying to create a stream queue, and whatever I do it always ends up with a stream queue that is not replicated. On the management interface there is a "+0" in red saying "cluster in minority", and the rabbitmq-queues stream_status command shows that the replicas are not in sync:
│ role │ node │ epoch │ offset │ committed_offset │ first_offset │ readers │ segments │
│ replica │ rabbit@admin-25726 │ 0 │ -1 │ 0 │ 0 │ 0 │ 1 │
│ replica │ rabbit@admin-25729 │ 0 │ -1 │ 0 │ 0 │ 0 │ 1 │
│ writer │ rabbit@admin-26586 │ 1 │ 999 │ -1 │ 0 │ 0 │ 1 │
It takes some time before any logs appear.
I have some warnings on the leader saying that connecting to the other nodes is impossible (but the connection is possible: the firewall is open, netcat can connect, etc.):
2023-06-23 12:12:53.277242+00:00 [warning] <0.14580.4> could not connect osiris to replica at [{X,X,X,X},
2023-06-23 12:12:53.277242+00:00 [warning] <0.14580.4> "admin-25726",
2023-06-23 12:12:53.277242+00:00 [warning] <0.14580.4> "admin-25726"]
I also have some stack traces on the replicas (which would explain the previous warning):
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ** Generic server <0.7397.6> terminating
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ** Last message in was {continue,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> #{epoch => 1,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> event_formatter =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {rabbit_stream_queue,format_osiris_event,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> [{resource,<<"bench-stream">>,queue,<<"solo">>}]},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> leader_node =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> 'rabbit@admin-26586',
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> leader_pid => <13016.9995.4>,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> max_segment_size_bytes => 500000000,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> name => "bench-stream_solo_1687521587334265739",
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> nodes =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ['rabbit@admin-26586',
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> 'rabbit@admin-25729',
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> 'rabbit@admin-25726'],
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> reference =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {resource,<<"bench-stream">>,queue,<<"solo">>},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> replica_nodes =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ['rabbit@admin-25729',
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> 'rabbit@admin-25726'],
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> retention => []}}
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ** When Server state == "osiris_replica:format_status/1 crashed"
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ** Reason for termination ==
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> ** {{case_clause,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {error,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {child,undefined,#Ref<0.2535608207.818413569.52138>,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {osiris_replica_reader,start_link,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> <<75,63,29,176,141,242,237,94,180,126,90,211,10,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> 5,187,93,129,204,25,31,31,132,251,169,176,89,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> 29,220,149,111,170,100>>,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> hosts =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> [{X,X,X,X},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> "admin-25729",
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> "admin-25729"],
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> leader_pid => <13016.9995.4>,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> name => <<"bench-stream_solo_1687521587334265739">>,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> port => 6478,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> reference =>
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {resource,<<"bench-stream">>,queue,<<"solo">>},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> replica_pid => <0.7397.6>,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> start_offset => {0,empty},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> transport => tcp}]},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> temporary,false,5000,worker,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> [osiris_replica_reader]}}}},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> [{osiris_replica_reader,start,2,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> [{file,"src/osiris_replica_reader.erl"},{line,108}]},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {osiris_replica,handle_continue,2,
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> [{file,"src/osiris_replica.erl"},{line,246}]},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {gen_server,try_dispatch,4,[{file,"gen_server.erl"},{line,1120}]},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {gen_server,loop,7,[{file,"gen_server.erl"},{line,862}]},
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6> {proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,240}]}]}
2023-06-23 12:19:26.494124+00:00 [error] <0.7397.6>
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> crasher:
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> initial call: osiris_replica:init/1
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> pid: <0.7397.6>
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> registered_name: []
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> exception error: no case clause matching
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {error,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {child,undefined,#Ref<0.2535608207.818413569.52138>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {osiris_replica_reader,start_link,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> <<75,63,29,176,141,242,237,94,180,126,90,211,10,5,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> 187,93,129,204,25,31,31,132,251,169,176,89,29,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> 220,149,111,170,100>>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> hosts =>
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> [{X,X,X,X},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> "admin-25729",
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> "admin-25729"],
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> leader_pid => <13016.9995.4>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> name =>
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> <<"bench-stream_solo_1687521587334265739">>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> port => 6478,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> reference =>
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {resource,<<"bench-stream">>,queue,<<"solo">>},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> replica_pid => <0.7397.6>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> start_offset => {0,empty},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> transport => tcp}]},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> temporary,false,5000,worker,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> [osiris_replica_reader]}}}
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> in function osiris_replica_reader:start/2 (src/osiris_replica_reader.erl, line 108)
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> in call from osiris_replica:handle_continue/2 (src/osiris_replica.erl, line 246)
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> in call from gen_server:try_dispatch/4 (gen_server.erl, line 1120)
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> in call from gen_server:loop/7 (gen_server.erl, line 862)
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> ancestors: [osiris_server_sup,osiris_sup,<0.2708.0>]
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> message_queue_len: 1
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> messages: [{'$gen_call',{<13016.14581.4>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> [alias|
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> #Ref<13016.1976662978.281346050.241907>]},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> await}]
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> links: [<0.7398.6>,#Port<0.21652>,<0.2712.0>]
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> dictionary: [{rand_seed,{#{bits => 58,jump => #Fun<rand.3.34006561>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> next => #Fun<rand.0.34006561>,type => exsss,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> uniform => #Fun<rand.1.34006561>,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> uniform_n => #Fun<rand.2.34006561>},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> [69162788875047421|99612874571196158]}}]
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> trap_exit: true
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> status: running
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> heap_size: 28690
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> stack_size: 28
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> reductions: 56824
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> neighbours:
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> neighbour:
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> pid: <0.7398.6>
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> registered_name: []
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> initial_call: {erlang,apply,2}
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> current_function: {prim_inet,accept0,3}
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> ancestors: []
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> message_queue_len: 0
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> links: [<0.7397.6>]
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> trap_exit: false
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> status: waiting
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> heap_size: 233
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> stack_size: 10
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> reductions: 71
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> current_stacktrace: [{prim_inet,accept0,3,[]},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {inet_tcp,accept,1,[{file,"inet_tcp.erl"},{line,219}]},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {osiris_replica,accept,4,
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> [{file,"src/osiris_replica.erl"},
2023-06-23 12:19:26.494676+00:00 [error] <0.7397.6> {line,313}]}]
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> supervisor: {local,osiris_server_sup}
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> errorContext: child_terminated
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> reason: {{case_clause,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> {error,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> {child,undefined,#Ref<0.2535608207.818413569.52138>,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> {osiris_replica_reader,start_link,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> <<75,63,29,176,141,242,237,94,180,126,90,211,10,5,187,93,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> 129,204,25,31,31,132,251,169,176,89,29,220,149,111,170,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> 100>>,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> hosts =>
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> [{X,X,X,X},
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> "admin-25729",
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> "admin-25729"],
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> leader_pid => <13016.9995.4>,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> name => <<"bench-stream_solo_1687521587334265739">>,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> port => 6478,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> reference =>
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> {resource,<<"bench-stream">>,queue,<<"solo">>},
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> replica_pid => <0.7397.6>,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> start_offset => {0,empty},
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> transport => tcp}]},
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> temporary,false,5000,worker,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> [osiris_replica_reader]}}}},
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> [{osiris_replica_reader,start,2,
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> [{file,"src/osiris_replica_reader.erl"},{line,108}]},
2023-06-23 12:19:26.495475+00:00 [error] <0.2712.0> {osiris_replica,handle_continue,2,
I tried many things, up to a full reset of the cluster, but it always ends the same way. I must be missing something obvious, but I don't know what.
If someone has a clue it would make my day :-)
Thanks
++ Jerome