Add another test for reconnect, and fix failure/race condition in pt-slave-delay

This commit is contained in:
Baron Schwartz
2012-06-04 11:33:06 -04:00
parent 9aaafe8478
commit 56e20d269a
2 changed files with 40 additions and 40 deletions

View File

@@ -2117,44 +2117,43 @@ sub main {
$now = time(); $now = time();
# TODO: this is a race condition. See 0xdeadbeef below. # If the database connection is gone, we must live on!
if ( !$slave_dbh || !$slave_dbh->ping() ) { # Try 10 times, for about 2 minutes, to reconnect to the slave,
# Try 10 times, for about 2 minutes, to reconnect to the slave, # increasing wait time from 3 to 15 seconds.
# increasing wait time from 3 to 15 seconds. $o->set('ask-pass', 0); # don't ask again
$o->set('ask-pass', 0); # don't ask again my $tries = 10;
my $tries = 10; my $rt = new Retry();
my $rt = new Retry(); $rt->retry(
$rt->retry( tries => $tries,
tries => $tries, retry_on_die => 1,
retry_on_die => 1, wait => sub {
wait => sub { my ( %args ) = @_;
my ( %args ) = @_; return unless $oktorun;
return unless $oktorun; my $t = min($args{tryno} * 3, 15);
my $t = min($args{tryno} * 3, 15); info("Lost connection, sleeping $t seconds "
info("Could not reconnect to slave, sleeping $t seconds " . "and trying " . ($tries-$args{tryno}) . " more times")
. "and trying " . ($tries-$args{tryno}) . " more times") if $tries - $args{tryno};
if $tries - $args{tryno}; sleep $t;
sleep $t; info("Trying to reconnect");
}, eval {
try => sub {
return unless $oktorun;
info("Lost connection to slave, trying to reconnect");
$slave_dbh = get_dbh($dp, $slave_dsn); $slave_dbh = get_dbh($dp, $slave_dsn);
return $slave_dbh; };
}, },
on_success => sub { try => sub {
info("Reconnected to slave"); return unless $oktorun;
}, $status = $slave_dbh->selectrow_hashref("SHOW SLAVE STATUS");
on_failure => sub { return $status;
return unless $oktorun; },
die "Failed to reconnect to slave"; on_success => sub {
}, info("Reconnected to slave");
); },
last unless $oktorun; # might have gotten interrupt while waiting on_failure => sub {
} return unless $oktorun;
# 0xdeadbeef (see above): just because we reconnected in the above Retry die "Failed to reconnect to slave";
# does not mean we have a connection here! },
$status = $slave_dbh->selectrow_hashref("SHOW SLAVE STATUS"); );
last unless $oktorun; # might have gotten interrupt while waiting
if ( !$status || ! %$status ) { if ( !$status || ! %$status ) {
die "No SLAVE STATUS found"; die "No SLAVE STATUS found";
} }

View File

@@ -45,10 +45,11 @@ my $output;
# the child should restart the slave, and the tool should report # the child should restart the slave, and the tool should report
# that it reconnected and did some work, ending with "Setting slave # that it reconnected and did some work, ending with "Setting slave
# to run normally". # to run normally".
diag('Running...');
my $pid = fork(); my $pid = fork();
if ( $pid ) { if ( $pid ) {
# parent # parent
$output = `$cmd --interval 1 --run-time 8 2>&1`; $output = `$cmd --interval 1 --run-time 4 2>&1`;
like( like(
$output, $output,
qr/Lost connection.+?Reconnected to slave.+Setting slave to run/ms, qr/Lost connection.+?Reconnected to slave.+Setting slave to run/ms,
@@ -71,11 +72,11 @@ waitpid ($pid, 0);
$pid = fork(); $pid = fork();
if ( $pid ) { if ( $pid ) {
# parent. Note the --database mysql # parent. Note the --database mysql
$output = `$cmd --database mysql --interval 1 --run-time 8 2>&1`; $output = `$cmd --database mysql --interval 1 --run-time 4 2>&1`;
like( like(
$output, $output,
qr/Lost connection.+?Reconnected to slave.+Setting slave to run/ms, qr/Lost connection.+?Reconnected to slave.+Setting slave to run/ms,
"Reconnect to slave" "Reconnect to slave when KILL'ed"
); );
} }
else { else {