I am running a testbed environment with four Mesos slaves and a highly available (HA) Mesos master setup. After a small outage, none of the Mesos slaves are registering with the Mesos master.
root@mslave03:/var/log/mesos# tail -400 mesos-slave.mslave03.invalid-user.log.INFO.20141221-114742.42960
Log file created at: 2014/12/21 11:47:42
Running on machine: mslave03
Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
I1221 11:47:42.689198 42960 logging.cpp:172] INFO level logging started!
I1221 11:47:42.689863 42960 main.cpp:142] Build: 2014-11-22 05:29:57 by root
I1221 11:47:42.689898 42960 main.cpp:144] Version: 0.21.0
I1221 11:47:42.689913 42960 main.cpp:147] Git tag: 0.21.0
I1221 11:47:42.689928 42960 main.cpp:151] Git SHA: ab8fa655d34e8e15a4290422df38a18db1c09b5b
I1221 11:47:42.800703 42960 containerizer.cpp:100] Using isolation: cgroups/cpu,cgroups/mem
I1221 11:47:42.840673 42960 linux_launcher.cpp:94] Using /sys/fs/cgroup/freezer as the freezer hierarchy for the Linux launcher
I1221 11:47:42.843667 42960 main.cpp:165] Starting Mesos slave
I1221 11:47:42.847918 42960 slave.cpp:289] Slave resources: cpus(*):2; mem(*):2922; disk(*):10618; ports(*):[31000-32000]
I1221 11:47:42.848075 42960 slave.cpp:318] Slave hostname: 192.168.1.155
I1221 11:47:42.848111 42960 slave.cpp:319] Slave checkpoint: true
I1221 11:47:42.853911 42983 group.cpp:313] Group process (group(1)@192.168.1.155:5051) connected to ZooKeeper
I1221 11:47:42.853987 42983 group.cpp:790] Syncing group operations: queue size (joins, cancels, datas) = (0, 0, 0)
I1221 11:47:42.854020 42983 group.cpp:385] Trying to create path '/mesos' in ZooKeeper
I1221 11:47:42.859082 42983 detector.cpp:138] Detected a new leader: (id='17')
I1221 11:47:42.860409 42982 state.cpp:33] Recovering state from '/tmp/mesos/meta'
I1221 11:47:42.860496 42979 group.cpp:659] Trying to get '/mesos/info_0000000017' in ZooKeeper
I1221 11:47:42.861368 42983 status_update_manager.cpp:197] Recovering status update manager
I1221 11:47:42.866406 42979 docker.cpp:767] Recovering Docker containers
I1221 11:47:42.871999 42985 containerizer.cpp:281] Recovering containerizer
root@mslave03:/var/log/mesos#
I1221 12:02:46.761798 1212 hierarchical_allocator_process.hpp:375] Activated framework 20141219-205159-16842879-5050-1177-0000
I1221 12:02:46.762552 1211 hierarchical_allocator_process.hpp:405] Deactivated framework 20141219-205159-16842879-5050-1177-0000
I1221 12:02:47.762332 1216 hierarchical_allocator_process.hpp:375] Activated framework 20141219-205159-16842879-5050-1177-0000
I1221 12:02:47.762930 1210 hierarchical_allocator_process.hpp:405] Deactivated framework 20141219-205159-16842879-5050-1177-0000
root@mmaster01:/etc/zookeeper/conf# mesos state
{
"lost_tasks": 0,
"build_user": "root",
"build_time": 1416634197,
"finished_tasks": 0,
"cluster": "Cluster01",
"unregistered_frameworks": [],
"id": "20141221-110839-2516691136-5050-1171",
"git_sha": "ab8fa655d34e8e15a4290422df38a18db1c09b5b",
"build_date": "2014-11-22 05:29:57",
"hostname": "192.168.1.150",
"version": "0.21.0",
"log_dir": "/var/log/mesos",
"killed_tasks": 0,
"deactivated_slaves": 0,
"failed_tasks": 0,
"start_time": 1419178119.63356,
"git_tag": "0.21.0",
"staged_tasks": 0,
"completed_frameworks": [],
"elected_time": 1419178138.05447,
"orphan_tasks": [],
"activated_slaves": 0,
"frameworks": [
{
"tasks": [],
"name": "marathon-0.7.6",
"used_resources": {
"mem": 0,
"disk": 0,
"cpus": 0
},
"webui_url": "",
"hostname": "mmaster02.home",
"checkpoint": true,
"offers": [],
"failover_timeout": 604800,
"completed_tasks": [],
"role": "*",
"resources": {
"mem": 0,
"disk": 0,
"cpus": 0
},
"active": false,
"unregistered_time": 0,
"registered_time": 1419178138.17168,
"reregistered_time": 1419181691.09023,
"id": "20141219-205159-16842879-5050-1177-0000",
"offered_resources": {
"mem": 0,
"disk": 0,
"cpus": 0
},
"user": "root"
}
],
"flags": {
"help": "false",
"ip": "192.168.1.150",
"whitelist": "*",
"cluster": "Cluster01",
"recovery_slave_removal_limit": "100%",
"port": "5050",
"logbufsecs": "0",
"authenticate": "false",
"work_dir": "/var/lib/mesos",
"slave_reregister_timeout": "10mins",
"authenticators": "crammd5",
"hostname": "192.168.1.150",
"authenticate_slaves": "false",
"framework_sorter": "drf",
"version": "false",
"log_dir": "/var/log/mesos",
"logging_level": "INFO",
"log_auto_initialize": "true",
"registry_strict": "false",
"registry_fetch_timeout": "1mins",
"root_submissions": "true",
"webui_dir": "/usr/local/share/mesos/webui",
"registry": "replicated_log",
"allocation_interval": "1secs",
"zk_session_timeout": "10secs",
"quorum": "1",
"user_sorter": "drf",
"quiet": "false",
"registry_store_timeout": "5secs",
"initialize_driver_logging": "true"
},
"started_tasks": 0,
"slaves": []
}
root@mmaster01:/etc/zookeeper/conf#