An futex wait deadlock when in topology's open method in multiprocessing environment

135 views
Skip to first unread message

shahuwang

unread,
May 24, 2016, 3:55:20 AM5/24/16
to mongodb-user
Hi, all:
   My program randomly hang , and I use strace to find out what happened, the result is:

   
futex(0x20a0b70, FUTEX_WAIT_PRIVATE, 0, NULL


  In gdb, with py-bt, part of all frames :

#5 Frame 0x7f2e5a86d3f0, for file /usr/local/lib/python2.7/dist-packages/pymongo/topology.py, line 59, in open (self=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<thread.lock at remote 0x7f2e5b7fe350>, acquire=<built-in method acquire of thread.lock object at remote 0x7f2e5b7fe350>, _Condition__waiters=[], release=<built-in method release of thread.lock object at remote 0x7f2e5b7fe350>) at remote 0x7f2e5a86a590>) at remote 0x7f2e5a86a550>, _interval=10, _stopped=False, _min_interval=<float at remote 0x1fed2b0>) at remote 0x7f2e5a86a510>, _settings=<TopologySettings(_monitor_class=<type at remote 0x20db090>, _seeds=set([(...)]), _pool_options=<PoolOptions at remote 0x7f2e5a8b0fa0>, _local_threshold_ms=15, _pool_class=<class...(truncated)
   
with self._lock:
#8 Frame 0x7f2e595c2c20, for file /usr/local/lib/python2.7/dist-packages/pymongo/mongo_client.py, line 658, in _get_topology (self=<MongoClient(_MongoClient__kill_cursors_queue=[], _MongoClient__all_credentials={}, _MongoClient__default_database_name=None, _MongoClient__lock=<thread.lock at remote 0x7f2e5b7fe290>, _topology=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<thread.lock at remote 0x7f2e5b7fe350>, acquire=<built-in method acquire of thread.lock object at remote 0x7f2e5b7fe350>, _Condition__waiters=[], release=<built-in method release of thread.lock object at remote 0x7f2e5b7fe350>) at remote 0x7f2e5a86a590>) at remote 0x7f2e5a86a550>, _interval=10, _stopped=False, _min_interval=<float at remote 0x1fed2b0>) at rem...(truncated)
   
self._topology.open()
#11 Frame 0x7f2e5a8799d8, for file /usr/local/lib/python2.7/dist-packages/pymongo/mongo_client.py, line 663, in _get_socket (self=<MongoClient(_MongoClient__kill_cursors_queue=[], _MongoClient__all_credentials={}, _MongoClient__default_database_name=None, _MongoClient__lock=<thread.lock at remote 0x7f2e5b7fe290>, _topology=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<thread.lock at remote 0x7f2e5b7fe350>, acquire=<built-in method acquire of thread.lock object at remote 0x7f2e5b7fe350>, _Condition__waiters=[], release=<built-in method release of thread.lock object at remote 0x7f2e5b7fe350>) at remote 0x7f2e5a86a590>) at remote 0x7f2e5a86a550>, _interval=10, _stopped=False, _min_interval=<float at remote 0x1fed2b0>) at remot...(truncated)
    server
= self._get_topology().select_server(selector)
#18 Frame 0x3934a40, for file /usr/lib/python2.7/contextlib.py, line 17, in __enter__ (self=<GeneratorContextManager(gen=<generator at remote 0x7f2e58c0cc80>) at remote 0x7f2e58bdb2d0>)
   
return self.gen.next()
#25 Frame 0x21bc7a0, for file /usr/local/lib/python2.7/dist-packages/pymongo/collection.py, line 1074, in __create_index (self=<Collection(_BaseObject__read_preference=<Primary at remote 0x7f2e5ba55520>, _Collection__name=u'dead_lock', _Collection__database=<Database(_Database__client=<MongoClient(_MongoClient__kill_cursors_queue=[], _MongoClient__all_credentials={}, _MongoClient__default_database_name=None, _MongoClient__lock=<thread.lock at remote 0x7f2e5b7fe290>, _topology=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<thread.lock at remote 0x7f2e5b7fe350>, acquire=<built-in method acquire of thread.lock object at remote 0x7f2e5b7fe350>, _Condition__waiters=[], release=<built-in method release of thread.lock object at rem...(truncated)
   
with self._socket_for_writes() as sock_info:
#28 Frame 0x7f2e5a87cd70, for file /usr/local/lib/python2.7/dist-packages/pymongo/collection.py, line 1163, in create_index (self=<Collection(_BaseObject__read_preference=<Primary at remote 0x7f2e5ba55520>, _Collection__name=u'dead_lock', _Collection__database=<Database(_Database__client=<MongoClient(_MongoClient__kill_cursors_queue=[], _MongoClient__all_credentials={}, _MongoClient__default_database_name=None, _MongoClient__lock=<thread.lock at remote 0x7f2e5b7fe290>, _topology=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<thread.lock at remote 0x7f2e5b7fe350>, acquire=<built-in method acquire of thread.lock object at remote 0x7f2e5b7fe350>, _Condition__waiters=[], release=<built-in method release of thread.lock object at ...(truncated)
   
self.__create_index(keys, kwargs)
#33 Frame 0x21bc310, for file /usr/local/lib/python2.7/dist-packages/mongoengine/document.py, line 749, in ensure_indexes (cls=<TopLevelDocumentMetaclass(__module__='__main__', _superclasses=(), _subclasses=('DeadLock',), run_once=<function at remote 0x7f2e5a866aa0>, objects=<QuerySetManager at remote 0x7f2e5a86abd0>, _collection=<Collection(_BaseObject__read_preference=<Primary at remote 0x7f2e5ba55520>, _Collection__name=u'dead_lock', _Collection__database=<Database(_Database__client=<MongoClient(_MongoClient__kill_cursors_queue=[], _MongoClient__all_credentials={}, _MongoClient__default_database_name=None, _MongoClient__lock=<thread.lock at remote 0x7f2e5b7fe290>, _topology=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<t...(truncated)
    collection
.create_index(fields, background=background, **opts)
#36 Frame 0x7f2e595bf050, for file lock.py, line 29, in run_once (self=<DeadLock at remote 0x7f2e5a843e50>, i=27130)
   
self.ensure_indexes()








In gdb, with bt command, the first few frames are:

#0  0x00007f2e5e293680 in sem_wait () from /lib/x86_64-linux-gnu/libpthread.so.0
#1  0x0000000000547fa4 in PyThread_acquire_lock (lock=0x20a0b70, waitflag=<optimized out>) at ../Python/thread_pthread.h:324
#2  0x00000000005608ae in lock_PyThread_acquire_lock.71666 (self=0x7f2e5b7fe2f0, args=<optimized out>) at ../Modules/threadmodule.c:52
#3  0x00000000004dce08 in PyObject_Call (kw=0x0, arg=(), func=<built-in method __enter__ of thread.lock object at remote 0x7f2e5b7fe2f0>) at ../Objects/abstract.c:2529
#4  PyObject_CallFunctionObjArgs (callable=callable@entry=<built-in method __enter__ of thread.lock object at remote 0x7f2e5b7fe2f0>) at ../Objects/abstract.c:2760
#5  0x000000000049b417 in PyEval_EvalFrameEx (
    f
=f@entry=Frame 0x7f2e5a86d3f0, for file /usr/local/lib/python2.7/dist-packages/pymongo/topology.py, line 59, in open (self=<Topology(_servers={('localhost', 27017): <Server(_monitor=<Monitor(_server_description=<ServerDescription at remote 0x7f2e5a8427e0>, _executor=<PeriodicExecutor(_thread=<weakproxy at remote 0x7f2e5a85ea48>, _target=<function at remote 0x7f2e5a84d0c8>, _event=<Event(_flag=False, _cond=<_Condition(_Verbose__verbose=False, _Condition__lock=<thread.lock at remote 0x7f2e5b7fe350>, acquire=<built-in method acquire of thread.lock object at remote 0x7f2e5b7fe350>, _Condition__waiters=[], release=<built-in method release of thread.lock object at remote 0x7f2e5b7fe350>) at remote 0x7f2e5a86a590>) at remote 0x7f2e5a86a550>, _interval=10, _stopped=False, _min_interval=<float at remote 0x1fed2b0>) at remote 0x7f2e5a86a510>, _settings=<TopologySettings(_monitor_class=<type at remote 0x20db090>, _seeds=set([(...)]), _pool_options=<PoolOptions at remote 0x7f2e5a8b0fa0>, _local_threshold_ms=15, _pool_class=<class...(truncated),
    throwflag
=throwflag@entry=0) at ../Python/ceval.c:2555


with py-list command, the result is :

 54        def open(self):
 
55            """Start monitoring, or restart after a fork.
  56    
  57            No effect if called multiple times.
  58            """

 
>59            with self._lock:
 
60                self._ensure_opened()
 
61    
 
62        def select_servers(self,
 
63                           selector,
 
64                           server_selection_timeout=None,

with info thr command, the result is:
 
   Id   Target Id         Frame
* 1    Thread 0x7f2e5e69b740 (LWP 25578) "python" 0x00007f2e5e293680 in sem_wait () from /lib/x86_64-linux-gnu/libpthread.so.0

with p *(pthread_mutex_t*)$rdi command, the result is:

$1 = {__data = {__lock = 0, __count = 128, __owner = 1, __nusers = 0, __kind = -1951721312, __spins = 32728, __elision = 0, __list = {__prev = 0x7fd885e429c0, __next = 0x7fd88ba55360}}, 
  __size = "\000\000\000\000\200\000\000\000\001\000\000\000\000\000\000\000\240\030\253\213\330\177\000\000\300)\344\205\330\177\000\000`S\245\213\330\177\000", __align = 549755813888}


Here is the code to reproduce the deadlock,  after loop to 27130, it hang:


# -*-coding: utf-8 -*-
import mongoengine as m
from multiprocessing import Process, Value
import logging

logger
= logging.getLogger("DeadLock")
logger
.setLevel(logging.DEBUG)
ch
= logging.StreamHandler()
ch
.setLevel(logging.DEBUG)
formatter
= logging.Formatter(
   
"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch
.setFormatter(formatter)
logger
.addHandler(ch)
m
.connect("test", alias="test")

class DeadLock(m.Document):
    name
= m.StringField(default="DeadLock")

    meta
= {
       
"db_alias": "test",
       
"indexes": [
           
{"fields": ["name"], "unique": False}
       
],
       
"allow_inheritance": True
   
}

   
def run_once(self, i):
       
self.ensure_indexes()
        logger
.info("%s, %d", self.name, i)

DeadLock(name="DeadLock1").save()
DeadLock(name="DeadLock2").save()

def run():
    dls
= DeadLock.objects()
   
for i in range(1000000):
       
for dl in dls:
            dl
.reload()
            logger
.info("reload")
            ret
= Value("b", True)


           
def run2(ret):
                dl
.run_once(i)
                ret
.value = True


            p
= Process(target=run2, args=(ret, ))
            p
.start()
            p
.join()

if __name__ == "__main__":
    run
()



OS: Ubuntu 14.04 LTS
Python: 2.7.6
mongoengine: 1.10.0
pymongo: 3.0.3
mongo: 3.0.12
Reply all
Reply to author
Forward
0 new messages