We are battling problems where Mongo steadily grows in memory use until the the machine runs out of memory and the kernel decides to kill the process.
We are seeing it happen on both mongodb 3.0.9 and 3.2.6. We are running with WiredTiger on both and they are in a replicset with 3 members. Mongo 3.2 takes a lot longer but it still happens.
As I write this right now our mongo 3.2 server (according to top) is taking up 84.4% of the system memory:
We haven't changed any memory settings so it should stay around ~50%. According to db.serverStatus() WiredTiger maximum size is:
So where do I go from here? How can I diagnose what is taking up memory and how can I stop mongo memory usage from growing?
rsjfe2:PRIMARY> db.serverStatus()
{
"advisoryHostFQDNs" : [
"ip-10-0-5-143.eu-west-1.compute.internal"
],
"version" : "3.2.6",
"process" : "mongod",
"pid" : NumberLong(5316),
"uptime" : 1966330,
"uptimeMillis" : NumberLong(1966329923),
"uptimeEstimate" : 1941973,
"localTime" : ISODate("2016-06-01T15:51:22.238Z"),
"asserts" : {
"regular" : 0,
"warning" : 0,
"msg" : 0,
"user" : 402,
"rollovers" : 0
},
"connections" : {
"current" : 997,
"available" : 50203,
"totalCreated" : NumberLong(128435)
},
"extra_info" : {
"note" : "fields vary by platform",
"page_faults" : 3420
},
"globalLock" : {
"totalTime" : NumberLong("1966329920000"),
"currentQueue" : {
"total" : 0,
"readers" : 0,
"writers" : 0
},
"activeClients" : {
"total" : 1027,
"readers" : 0,
"writers" : 1
}
},
"locks" : {
"Global" : {
"acquireCount" : {
"r" : NumberLong(1227231620),
"w" : NumberLong(135596462),
"R" : NumberLong(3582),
"W" : NumberLong(3587)
},
"acquireWaitCount" : {
"r" : NumberLong(4),
"W" : NumberLong(2)
},
"timeAcquiringMicros" : {
"r" : NumberLong(57066),
"W" : NumberLong(1472)
}
},
"Database" : {
"acquireCount" : {
"r" : NumberLong(544673404),
"w" : NumberLong(135585696),
"R" : NumberLong(1146133),
"W" : NumberLong(10766)
},
"acquireWaitCount" : {
"r" : NumberLong(6),
"w" : NumberLong(42253),
"R" : NumberLong(68975),
"W" : NumberLong(1)
},
"timeAcquiringMicros" : {
"r" : NumberLong(647),
"w" : NumberLong(355404993),
"R" : NumberLong(428210919),
"W" : NumberLong(892)
}
},
"Collection" : {
"acquireCount" : {
"r" : NumberLong(159920428),
"w" : NumberLong(67595408)
}
},
"Metadata" : {
"acquireCount" : {
"w" : NumberLong(67982390)
}
},
"oplog" : {
"acquireCount" : {
"r" : NumberLong(385211759),
"w" : NumberLong(67990288)
}
}
},
"network" : {
"bytesIn" : NumberLong("6434397102795"),
"bytesOut" : NumberLong("26604070213927"),
"numRequests" : NumberLong(615123634)
},
"opcounters" : {
"insert" : 81,
"query" : 147426296,
"update" : 59754153,
"delete" : 5157628,
"getmore" : 181998504,
"command" : 232064713
},
"opcountersRepl" : {
"insert" : 812,
"query" : 0,
"update" : 6252,
"delete" : 440,
"getmore" : 0,
"command" : 0
},
"repl" : {
"hosts" : [
],
"setName" : "rsjfe2",
"setVersion" : 7,
"ismaster" : true,
"secondary" : false,
"electionId" : ObjectId("7fffffff0000000000000000"),
"rbid" : 136503380
},
"storageEngine" : {
"name" : "wiredTiger",
"supportsCommittedReads" : true,
"persistent" : true
},
"tcmalloc" : {
"generic" : {
"current_allocated_bytes" : NumberLong("10665104672"),
"heap_size" : NumberLong("16490340352")
},
"tcmalloc" : {
"pageheap_unmapped_bytes" : NumberLong("2724331520"), "max_total_thread_cache_bytes" : NumberLong(1073741824),
"current_total_thread_cache_bytes" : 310903840,
"central_cache_free_bytes" : 84209344,
"transfer_cache_free_bytes" : 68818944,
"thread_cache_free_bytes" : 310903840,
"aggressive_memory_decommit" : 0
},
"formattedString" : "------------------------------------------------\nMALLOC: 10665104672 (10171.0 MiB) Bytes in use by application\nMALLOC: + 2636972032 ( 2514.8 MiB) Bytes in page heap freelist\nMALLOC: + 84209344 ( 80.3 MiB) Bytes in central cache freelist\nMALLOC: + 68818944 ( 65.6 MiB) Bytes in transfer cache freelist\nMALLOC: + 310903840 ( 296.5 MiB) Bytes in thread cache freelists\nMALLOC: + 29454496 ( 28.1 MiB) Bytes in malloc metadata\nMALLOC: ------------\nMALLOC: = 13795463328 (13156.4 MiB) Actual memory used (physical + swap)\nMALLOC: + 2724331520 ( 2598.1 MiB) Bytes released to OS (aka unmapped)\nMALLOC: ------------\nMALLOC: = 16519794848 (15754.5 MiB) Virtual address space used\nMALLOC:\nMALLOC: 113055 Spans in use\nMALLOC: 1059 Thread heaps in use\nMALLOC: 8192 Tcmalloc page size\n------------------------------------------------\nCall ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).\nBytes released to the OS take up virtual address space but no physical memory.\n"
},
"wiredTiger" : {
"uri" : "statistics:",
"LSM" : {
"application work units currently queued" : 0,
"merge work units currently queued" : 0,
"rows merged in an LSM tree" : 0,
"sleep for LSM checkpoint throttle" : 0,
"sleep for LSM merge throttle" : 0,
"switch work units currently queued" : 0,
"tree maintenance operations discarded" : 0,
"tree maintenance operations executed" : 0,
"tree maintenance operations scheduled" : 0,
"tree queue hit maximum" : 0
},
"async" : {
"current work queue length" : 0,
"maximum work queue length" : 0,
"number of allocation state races" : 0,
"number of flush calls" : 0,
"number of operation slots viewed for allocation" : 0,
"number of times operation allocation failed" : 0,
"number of times worker found no work" : 0,
"total allocations" : 0,
"total compact calls" : 0,
"total insert calls" : 0,
"total remove calls" : 0,
"total search calls" : 0,
"total update calls" : 0
},
"block-manager" : {
"blocks pre-loaded" : 4556,
"blocks read" : 127660899,
"blocks written" : 99738488,
"bytes read" : NumberLong("2338766262272"),
"bytes written" : NumberLong("2446105161728"),
"mapped blocks read" : 0,
"mapped bytes read" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("5665691392486"),
"bytes written from cache" : NumberLong("10621888306533"),
"checkpoint blocked page eviction" : 15121,
"eviction currently operating in aggressive mode" : 0,
"eviction server candidate queue empty when topping up" : 1396910,
"eviction server candidate queue not empty when topping up" : 1563984,
"eviction server evicting pages" : 1,
"eviction server populating queue, but not evicting pages" : 2960893,
"eviction server unable to reach eviction goal" : 0,
"eviction worker thread evicting pages" : 128299431,
"failed eviction of pages that exceeded the in-memory maximum" : 85911,
"hazard pointer blocked page eviction" : 160568,
"in-memory page passed criteria to be split" : 1622871,
"in-memory page splits" : 736566,
"internal pages evicted" : 4634330,
"internal pages split during eviction" : 3391,
"leaf pages split during eviction" : 1441106,
"lookaside table insert calls" : 0,
"lookaside table remove calls" : 0,
"maximum page size at eviction" : 107761047,
"modified pages evicted" : 48893584,
"page split during eviction deepened the tree" : 0,
"page written requiring lookaside records" : 0,
"pages currently held in the cache" : 69308,
"pages evicted because they exceeded the in-memory maximum" : 113607,
"pages evicted because they had chains of deleted items" : 740923,
"pages evicted by application threads" : 0,
"pages read into cache" : 126796905,
"pages read into cache requiring lookaside entries" : 0,
"pages selected for eviction unable to be evicted" : 960333,
"pages walked for eviction" : 216198002388,
"pages written from cache" : 98010810,
"pages written requiring in-memory restoration" : 151,
"percentage overhead" : 8,
"tracked bytes belonging to internal pages in the cache" : 96988262,
"tracked bytes belonging to leaf pages in the cache" : 6690587404,
"tracked bytes belonging to overflow pages in the cache" : 0,
"tracked dirty bytes in the cache" : 176719917,
"tracked dirty pages in the cache" : 582,
"unmodified pages evicted" : 78929821
},
"connection" : {
"auto adjusting condition resets" : 9067162,
"auto adjusting condition wait calls" : 21456268,
"files currently open" : 54,
"memory re-allocations" : 135923156,
"pthread mutex condition wait calls" : 622239639,
"pthread mutex shared lock read-lock calls" : 7484281,
"pthread mutex shared lock write-lock calls" : 3304319,
"total read I/Os" : 127683466,
"total write I/Os" : 124367096
},
"cursor" : {
"cursor create calls" : 258743,
"cursor insert calls" : 222315537,
"cursor next calls" : 861917779,
"cursor prev calls" : 169,
"cursor remove calls" : 99791132,
"cursor reset calls" : 1183530736,
"cursor restarted searches" : 68895,
"cursor search calls" : 780827213,
"cursor search near calls" : 271649736,
"cursor update calls" : 0,
"truncate calls" : 3949
},
"data-handle" : {
"connection data handles currently active" : 51,
"connection sweep candidate became referenced" : 0,
"connection sweep dhandles closed" : 0,
"connection sweep dhandles removed from hash list" : 1577631,
"connection sweep time-of-death sets" : 1578505,
"connection sweeps" : 196629,
"session dhandles swept" : 364107,
"session sweep attempts" : 32607
},
"log" : {
"busy returns attempting to switch slots" : 670543,
"consolidated slot closures" : 23154716,
"consolidated slot join transitions" : 23154716,
"consolidated slot joins" : 68101654,
"consolidated slot unbuffered writes" : 2950109,
"log bytes of payload data" : NumberLong("2256730787505"),
"log bytes written" : NumberLong("2261105677056"),
"log files manually zero-filled" : 0,
"log flush operations" : 19046727,
"log force write operations" : 4554806803,
"log force write operations skipped" : 4537197060,
"log records compressed" : 59994987,
"log records not compressed" : 8062945,
"log records too small to compress" : 43715,
"log release advances write LSN" : 32356,
"log scan operations" : 5,
"log scan records requiring two reads" : 2,
"log server thread advances write LSN" : 23122045,
"log server thread write LSN walk skipped" : 475278080,
"log sync operations" : 14516243,
"log sync_dir operations" : 21572,
"log write operations" : 68101644,
"logging bytes consolidated" : NumberLong("2261102901376"),
"maximum log file size" : 104857600,
"number of pre-allocated log files to create" : 4,
"pre-allocated log files not ready and missed" : 3,
"pre-allocated log files prepared" : 21573,
"pre-allocated log files used" : 21569,
"records processed by log scan" : 11,
"total in-memory size of compressed records" : NumberLong("13089677619072"),
"total log buffer size" : 33554432,
"total size of compressed records" : NumberLong("2254825075716"),
"written slots coalesced" : 315,
"yields waiting for previous log file close" : 0
},
"reconciliation" : {
"fast-path pages deleted" : 33177995,
"page reconciliation calls" : 62584500,
"page reconciliation calls for eviction" : 35461270,
"pages deleted" : 6675350,
"split bytes currently awaiting free" : 161255,
"split objects currently awaiting free" : 12
},
"session" : {
"open cursor count" : 1099,
"open session count" : 44
},
"thread-yield" : {
"page acquire busy blocked" : 1206,
"page acquire eviction blocked" : 152416,
"page acquire locked blocked" : 758687,
"page acquire read blocked" : 569299,
},
"transaction" : {
"number of named snapshots created" : 0,
"number of named snapshots dropped" : 0,
"transaction begins" : 620408601,
"transaction checkpoint currently running" : 0,
"transaction checkpoint generation" : 32352,
"transaction checkpoint max time (msecs)" : 1045,
"transaction checkpoint min time (msecs)" : 6,
"transaction checkpoint most recent time (msecs)" : 305,
"transaction checkpoint total time (msecs)" : 7963466,
"transaction checkpoints" : 32352,
"transaction failures due to cache overflow" : 0,
"transaction range of IDs currently pinned" : 23,
"transaction range of IDs currently pinned by a checkpoint" : 0,
"transaction range of IDs currently pinned by named snapshots" : 0,
"transaction sync calls" : 0,
"transactions committed" : 68043620,
"transactions rolled back" : 552344056
},
"concurrentTransactions" : {
"write" : {
"out" : 1,
"available" : 127,
"totalTickets" : 128
},
"read" : {
"out" : 0,
"available" : 128,
"totalTickets" : 128
}
}
},
"writeBacksQueued" : false,
"mem" : {
"bits" : 64,
"resident" : 12883,
"virtual" : 17493,
"supported" : true,
"mapped" : 0,
"mappedWithJournal" : 0
},
"metrics" : {
"commands" : {
"_isSelf" : {
"failed" : NumberLong(0),
"total" : NumberLong(3)
},
"buildInfo" : {
"failed" : NumberLong(0),
"total" : NumberLong(11)
},
"collStats" : {
"failed" : NumberLong(0),
"total" : NumberLong(32743)
},
"count" : {
"failed" : NumberLong(0),
"total" : NumberLong(2660074)
},
"dbStats" : {
"failed" : NumberLong(0),
"total" : NumberLong(1146125)
},
"delete" : {
"failed" : NumberLong(0),
"total" : NumberLong(5157628)
},
"find" : {
"failed" : NumberLong(0),
"total" : NumberLong(137920083)
},
"findAndModify" : {
"failed" : NumberLong(0),
"total" : NumberLong(1777966)
},
"getLog" : {
"failed" : NumberLong(0),
"total" : NumberLong(11)
},
"getMore" : {
"failed" : NumberLong(0),
"total" : NumberLong(181913479)
},
"insert" : {
"failed" : NumberLong(0),
"total" : NumberLong(81)
},
"isMaster" : {
"failed" : NumberLong(0),
"total" : NumberLong(11295053)
},
"listCollections" : {
"failed" : NumberLong(0),
"total" : NumberLong(5)
},
"listIndexes" : {
"failed" : NumberLong(0),
"total" : NumberLong(8424)
},
"ping" : {
"failed" : NumberLong(0),
"total" : NumberLong(785898)
},
"replSetFresh" : {
"failed" : NumberLong(0),
"total" : NumberLong(1)
},
"replSetGetConfig" : {
"failed" : NumberLong(0),
"total" : NumberLong(3)
},
"replSetGetStatus" : {
"failed" : NumberLong(0),
"total" : NumberLong(1146200)
},
"replSetHeartbeat" : {
"failed" : NumberLong(0),
"total" : NumberLong(1965278)
},
"replSetReconfig" : {
"failed" : NumberLong(0),
"total" : NumberLong(1)
},
"replSetUpdatePosition" : {
"failed" : NumberLong(4),
"total" : NumberLong(211246905)
},
"serverStatus" : {
"failed" : NumberLong(0),
"total" : NumberLong(1)
},
"update" : {
"failed" : NumberLong(0),
"total" : NumberLong(48476414)
},
"whatsmyuri" : {
"failed" : NumberLong(0),
"total" : NumberLong(11)
}
},
"cursor" : {
"timedOut" : NumberLong(2),
"open" : {
"noTimeout" : NumberLong(0),
"pinned" : NumberLong(1),
"total" : NumberLong(2)
}
},
"document" : {
"deleted" : NumberLong(7874247),
"inserted" : NumberLong(162),
"returned" : NumberLong(349497992),
"updated" : NumberLong(60176785)
},
"getLastError" : {
"wtime" : {
"num" : 0,
"totalMillis" : 0
},
"wtimeouts" : NumberLong(0)
},
"operation" : {
"fastmod" : NumberLong(0),
"idhack" : NumberLong(5482186),
"scanAndOrder" : NumberLong(0),
"writeConflicts" : NumberLong(19221)
},
"queryExecutor" : {
"scanned" : NumberLong(222230532),
"scannedObjects" : NumberLong(212811256)
},
"record" : {
"moves" : NumberLong(0)
},
"repl" : {
"executor" : {
"counters" : {
"eventCreated" : 8,
"eventWait" : 8,
"cancels" : 210943035,
"waits" : 71325533,
"scheduledNetCmd" : 1965362,
"scheduledDBWork" : 1,
"scheduledXclWork" : 0,
"scheduledWorkAt" : 207891871,
"scheduledWork" : 282616398,
"schedulingFailures" : 0
},
"queues" : {
"networkInProgress" : 0,
"dbWorkInProgress" : 0,
"exclusiveInProgress" : 0,
"sleepers" : 3,
"ready" : 0,
"free" : 18
},
"unsignaledEvents" : 3,
"eventWaiters" : 0,
"shuttingDown" : false,
"networkInterface" : "NetworkInterfaceASIO inShutdown: 0"
},
"apply" : {
"batches" : {
"num" : 3581,
"totalMillis" : 514
},
"ops" : NumberLong(7504)
},
"buffer" : {
"count" : NumberLong(0),
"maxSizeBytes" : 268435456,
"sizeBytes" : NumberLong(0)
},
"network" : {
"bytes" : NumberLong(507676771),
"getmores" : {
"num" : 3580,
"totalMillis" : 0
},
"ops" : NumberLong(7505),
"readersCreated" : NumberLong(5)
},
"preload" : {
"docs" : {
"num" : 0,
"totalMillis" : 0
},
"indexes" : {
"num" : 0,
"totalMillis" : 0
}
}
},
"storage" : {
"freelist" : {
"search" : {
"bucketExhausted" : NumberLong(0),
"requests" : NumberLong(0),
"scanned" : NumberLong(0)
}
}
},
"ttl" : {
"deletedDocuments" : NumberLong(0),
"passes" : NumberLong(32771)
}
},
"ok" : 1
}