Yves,
Indeed, the --retry looks a bit strange.
I've modified the LSB script to:
stop)
    log_daemon_msg "Stopping $DESC" "$NAME"
    if start-stop-daemon --stop --quiet --pidfile $PID --retry=TERM/600/KILL/5; then
        log_end_msg 0
    else
        start-stop-daemon --stop --oknodo --exec $DAEMON --name $NAME --retry=TERM/600/KILL/5
        log_end_msg $?
    fi
    ;;
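For reference, my reading of that --retry schedule (from the start-stop-daemon man page):

    --retry=TERM/600/KILL/5
        # send SIGTERM, wait up to 600 seconds for the daemon to exit,
        # then send SIGKILL and wait up to 5 more seconds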
I've also modified my corosync configuration (added the timeout as you said, and also a timeout on promote, because it was too short: 20s => 600s):
node zabbix-node1 \
    attributes standby="off"
node zabbix-testnode3 \
    attributes standby="on"
primitive p_mysql ocf:percona:mysql \
    params config="/etc/mysql/my.cnf" log="/var/log/mysql/error.log" pid="/var/lib/mysql/mysqld.pid" socket="/var/run/mysqld/mysqld.sock" replication_user="repl_user" replication_passwd="slavepw" max_slave_lag="60" evict_outdated_slaves="false" binary="/usr/sbin/mysqld" test_user="test_user" test_passwd="testpw" \
    op monitor interval="5s" role="Master" OCF_CHECK_LEVEL="1" \
    op monitor interval="2s" role="Slave" OCF_CHECK_LEVEL="1" \
    op start interval="0" timeout="600s" \
    op stop interval="0" timeout="600s" \
    op promote interval="0" timeout="600s"
primitive vip ocf:heartbeat:IPaddr2 \
    params ip="172.24.195.51" nic="bond0" \
    op monitor interval="15" \
    meta target-role="Started"
primitive zabbix lsb:zabbix-server \
    op start interval="0" timeout="60" delay="5s" \
    op monitor interval="30s" \
    op stop interval="0" timeout="600s"
ms ms_MySQL p_mysql \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" globally-unique="false" target-role="Started" is-managed="true"
colocation vip_and_zabbix_on_master inf: vip zabbix ms_MySQL:Master
order ms_MySQL_promote_before_zabbix inf: ms_MySQL:promote zabbix
order zabbix_before_vip inf: zabbix vip:start
property $id="cib-bootstrap-options" \
    dc-version="1.1.6-9971ebba4494012a93c03b40a2c58ec0eb60f50c" \
    cluster-infrastructure="openais" \
    expected-quorum-votes="2" \
    stonith-enabled="false" \
    no-quorum-policy="ignore" \
    last-lrm-refresh="1381309001"
property $id="mysql_replication" \
    p_mysql_REPL_INFO="zabbix-testnode3|mysql-bin.000002|267851"
rsc_defaults $id="rsc-options" \
    resource-stickiness="100"
But now I have a 'duplicate entry' problem in the database.
It seems like the 'zabbix sync' goes fine, but after the sync MySQL is still flushing all its pages (see the output below), and at that moment the DB on the other node is already up (and writing to the new master database).
I think this is what creates the 'duplicate entry' errors I see in the log, and the DB gets corrupt.
Shouldn't the promote wait until the demote is finished? Or how does it work internally?
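(In case it helps, I assume I could look at the transition the DC computes, e.g. with "crm_simulate -L" or in the pengine logs, to see in which order the demote/stop/promote actions get scheduled, but I'm not sure that's the right way to inspect it.)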
PS: MySQL page flushing can take up to 10 minutes on the slowest server.
It's during this flushing (on the slave that is going into standby) that the other DB is already up and accepting queries, but in my opinion it has not yet processed all queries from the 'old' master.
131009 15:37:29 [Note] /usr/sbin/mysqld: Normal shutdown
131009 15:37:29 [Note] Event Scheduler: Purging the queue. 0 events
131009 15:37:29 InnoDB: Starting shutdown...
131009 15:37:31 InnoDB: Waiting for 211 pages to be flushed
131009 15:38:29 InnoDB: Waiting for master thread to be suspended
131009 15:38:33 InnoDB: Waiting for 208 pages to be flushed
131009 15:39:29 InnoDB: Waiting for master thread to be suspended
131009 15:39:34 InnoDB: Waiting for 209 pages to be flushed
131009 15:40:29 InnoDB: Waiting for master thread to be suspended
131009 15:40:35 InnoDB: Waiting for 195 pages to be flushed
131009 15:41:00 InnoDB: Shutdown completed; log sequence number 218746000465
Thanks!
Erik
On Tuesday, October 8, 2013 16:13:06 UTC+2, yves wrote: