Hi All,
We're observing a SIGSEGV in gRPC 1.20. We're implementing the gNMI proto as a reference API.
The SIGSEGV emanates from a thread that is simply blocking on CompletionQueue::Next(). In other threads, we are actively invoking stream-stream and unary-unary RPCs (Subscribe/Set).
The SIGSEGV traces into gRPC core, in ev_epollex_linux.cc: kick_one_worker() reads 'specific_worker->pollable_obj', which is NULL, so the subsequent lock of p->mu dereferences the member offset of a null pointer (hence mu=0x78 in the backtrace below).
Does anyone have insight into what causes this pollable_obj to become null? Is this resource shared across threads? Each of our threads manages its own private completion queue.
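For reference, each worker thread blocks in a drain loop roughly like the sketch below (illustrative only; Handler is a placeholder type, not our actual code):

#include <grpcpp/grpcpp.h>

// Hypothetical event-handler interface; our real tag types differ.
struct Handler {
  virtual void Proceed(bool ok) = 0;
  virtual ~Handler() = default;
};

// Blocks on this thread's private CompletionQueue until shutdown.
// This is where the crashing thread sits (frame #19 below).
void DrainLoop(grpc::CompletionQueue* cq) {
  void* tag = nullptr;
  bool ok = false;
  while (cq->Next(&tag, &ok)) {
    static_cast<Handler*>(tag)->Proceed(ok);
  }
}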
Any ideas where we should be looking?
Thanks,
Bryce
static grpc_error* kick_one_worker(grpc_pollset_worker* specific_worker) {
  GPR_TIMER_SCOPE("kick_one_worker", 0);
  pollable* p = specific_worker->pollable_obj;  // <---- NULL!
  grpc_core::MutexLock lock(&p->mu);
  GPR_ASSERT(specific_worker != nullptr);
  if (specific_worker->kicked) {
    if (grpc_polling_trace.enabled()) {
      gpr_log(GPR_INFO, "PS:%p kicked_specific_but_already_kicked", p);
    }
    GRPC_STATS_INC_POLLSET_KICKED_AGAIN();
    return GRPC_ERROR_NONE;
  }
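For what it's worth, the mu=0x78 in frame #1 is exactly what a null p would produce: &p->mu evaluates to the member offset of mu within pollable. A stand-in sketch (the real layout lives in ev_epollex_linux.cc; the 0x78 offset is taken from the trace, not from the source):

#include <cstddef>
#include <cstdio>

// Stand-in for gRPC's pollable; the only assumption is that mu sits
// 0x78 bytes in, which is what the trace shows (gpr_mu_lock(mu=0x78)).
struct pollable_like {
  char preceding_fields[0x78];
  long mu;  // stand-in for gpr_mu
};

int main() {
  // With p == nullptr, &p->mu is just this offset: 0x78 == 120.
  std::printf("offset of mu: %zu\n", offsetof(pollable_like, mu));
}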
(gdb) bt
#0 0x0000007fb95a8d34 in pthread_mutex_lock () from /lib/aarch64-linux-gnu/libpthread.so.0
#1 0x0000007fb8d73a0c in gpr_mu_lock (mu=mu@entry=0x78) at src/core/lib/gpr/sync_posix.cc:103
#2 0x0000007fb8dfb59c in grpc_core::MutexLock::MutexLock (mu=0x78, this=<synthetic pointer>) at ./src/core/lib/gprpp/mutex_lock.h:30
#3 kick_one_worker (specific_worker=specific_worker@entry=0x7f697eeca8) at src/core/lib/iomgr/ev_epollex_linux.cc:694
#4 0x0000007fb8dfbf40 in pollset_kick_all (pollset=<optimized out>) at src/core/lib/iomgr/ev_epollex_linux.cc:793
#5 0x0000007fb8dfbf9c in pollset_shutdown (pollset=0x7f780267c8, closure=<optimized out>) at src/core/lib/iomgr/ev_epollex_linux.cc:866
#6 0x0000007fb8d8f80c in cq_end_op_for_pluck (cq=0x7f780266d0, tag=0x7f697eef20, error=0x0, done=0x7fb8d8bdf8 <finish_batch_completion(void*, grpc_cq_completion*)>, done_arg=0x7f78137de0, storage=0x7f78137e30) at src/core/lib/surface/completion_queue.cc:790
#7 0x0000007fb8d8c278 in receiving_trailing_metadata_ready (bctlp=0x7f78137de0, error=<optimized out>) at src/core/lib/surface/call.cc:1480
#8 0x0000007fb8d79b68 in exec_ctx_run (closure=<optimized out>, error=0x0) at src/core/lib/iomgr/exec_ctx.cc:40
#9 0x0000007fb8d79b68 in exec_ctx_run (closure=<optimized out>, error=0x0) at src/core/lib/iomgr/exec_ctx.cc:40
#10 0x0000007fb8da6768 in grpc_closure_run (error=0x0, c=0x7f78137970) at ./src/core/lib/iomgr/closure.h:259
#11 grpc_core::SubchannelCall::RecvTrailingMetadataReady (arg=<optimized out>, error=<optimized out>) at src/core/ext/filters/client_channel/subchannel.cc:289
#12 0x0000007fb8d79b68 in exec_ctx_run (closure=<optimized out>, error=0x0) at src/core/lib/iomgr/exec_ctx.cc:40
#13 0x0000007fb8d79b68 in exec_ctx_run (closure=<optimized out>, error=0x0) at src/core/lib/iomgr/exec_ctx.cc:40
#14 0x0000007fb8d79ddc in exec_ctx_run (error=0x0, closure=<optimized out>) at src/core/lib/iomgr/exec_ctx.cc:148
#15 grpc_core::ExecCtx::Flush (this=0x7f837fd680) at src/core/lib/iomgr/exec_ctx.cc:148
#16 0x0000007fb8dfc880 in pollset_work (pollset=0x55965fa108, worker_hdl=<optimized out>, deadline=<optimized out>) at ./src/core/lib/iomgr/exec_ctx.h:213
#17 0x0000007fb8d9058c in cq_next (cq=0x55965fa040, deadline=..., reserved=<optimized out>) at src/core/lib/surface/completion_queue.cc:1021
#18 0x0000007fb8d732b4 in grpc::CompletionQueue::AsyncNextInternal (this=0x55965f2fe8, tag=0x7f837fd778, ok=0x7f837fd777, deadline=...) at src/cpp/common/completion_queue_cc.cc:48
#19 0x0000007fb8d593c4 in grpc::CompletionQueue::Next(void**, bool*) () at src/cpp/common/completion_queue_cc.cc:91
#20 0x0000007fb8d509d8 in Client::CrudService::AsyncNotifyChannelStateMonitor() () at src/cpp/common/completion_queue_cc.cc:91
#21 0x0000007fb8d5dce8 in void std::__invoke_impl<void, void (Client::CrudService::*)(), Client::CrudService*>(std::__invoke_memfun_deref, void (Client::CrudService::*&&)(), Client::CrudService*&&) () at src/cpp/common/completion_queue_cc.cc:91
#22 0x0000007fb8d5bf3c in std::__invoke_result<void (Client::CrudService::*)(), Client::CrudService*>::type std::__invoke<void (Client::CrudService::*)(), Client::CrudService*>(void (Client::CrudService::*&&)(), Client::CrudService*&&) ()
at src/cpp/common/completion_queue_cc.cc:91
#23 0x0000007fb8d63434 in decltype (__invoke((_S_declval<0ul>)(), (_S_declval<1ul>)())) std::thread::_Invoker<std::tuple<void (Client::CrudService::*)(), Client::CrudService*> >::_M_invoke<0ul, 1ul>(std::_Index_tuple<0ul, 1ul>) () at src/cpp/common/completion_queue_cc.cc:91
#24 0x0000007fb8d6332c in std::thread::_Invoker<std::tuple<void (Client::CrudService::*)(), Client::CrudService*> >::operator()() () at src/cpp/common/completion_queue_cc.cc:91
#25 0x0000007fb8d63220 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (Client::CrudService::*)(), Client::CrudService*> > >::_M_run() () at src/cpp/common/completion_queue_cc.cc:91
#26 0x0000007fb7a471f4 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
(gdb)