diff --git a/bin/pt-slave-delay b/bin/pt-slave-delay index 99947767..8fd21dfc 100755 --- a/bin/pt-slave-delay +++ b/bin/pt-slave-delay @@ -2117,44 +2117,43 @@ sub main { $now = time(); - # TODO: this is a race condition. See 0xdeadbeef below. - if ( !$slave_dbh || !$slave_dbh->ping() ) { - # Try 10 times, for about 2 minutes, to reconnect to the slave, - # increasing wait time from 3 to 15 seconds. - $o->set('ask-pass', 0); # don't ask again - my $tries = 10; - my $rt = new Retry(); - $rt->retry( - tries => $tries, - retry_on_die => 1, - wait => sub { - my ( %args ) = @_; - return unless $oktorun; - my $t = min($args{tryno} * 3, 15); - info("Could not reconnect to slave, sleeping $t seconds " - . "and trying " . ($tries-$args{tryno}) . " more times") - if $tries - $args{tryno}; - sleep $t; - }, - try => sub { - return unless $oktorun; - info("Lost connection to slave, trying to reconnect"); + # If the database connection is gone, we must live on! + # Try 10 times, for about 2 minutes, to reconnect to the slave, + # increasing wait time from 3 to 15 seconds. + $o->set('ask-pass', 0); # don't ask again + my $tries = 10; + my $rt = new Retry(); + $rt->retry( + tries => $tries, + retry_on_die => 1, + wait => sub { + my ( %args ) = @_; + return unless $oktorun; + my $t = min($args{tryno} * 3, 15); + info("Lost connection, sleeping $t seconds " + . "and trying " . ($tries-$args{tryno}) . " more times") + if $tries - $args{tryno}; + sleep $t; + info("Trying to reconnect"); + eval { $slave_dbh = get_dbh($dp, $slave_dsn); - return $slave_dbh; - }, - on_success => sub { - info("Reconnected to slave"); - }, - on_failure => sub { - return unless $oktorun; - die "Failed to reconnect to slave"; - }, - ); - last unless $oktorun; # might have gotten interrupt while waiting - } - # 0xdeadbeef (see above): just because we reconnected in the above Retry - # does not mean we have a connection here! - $status = $slave_dbh->selectrow_hashref("SHOW SLAVE STATUS"); + }; + }, + try => sub { + return unless $oktorun; + $status = $slave_dbh->selectrow_hashref("SHOW SLAVE STATUS"); + return $status; + }, + on_success => sub { + info("Reconnected to slave"); + }, + on_failure => sub { + return unless $oktorun; + die "Failed to reconnect to slave"; + }, + ); + last unless $oktorun; # might have gotten interrupt while waiting + if ( !$status || ! %$status ) { die "No SLAVE STATUS found"; } diff --git a/t/pt-slave-delay/auto_restart.t b/t/pt-slave-delay/auto_restart.t index 3e26b22f..11ba3acc 100644 --- a/t/pt-slave-delay/auto_restart.t +++ b/t/pt-slave-delay/auto_restart.t @@ -45,10 +45,11 @@ my $output; # the child should restart the slave, and the tool should report # that it reconnected and did some work, ending with "Setting slave # to run normally". +diag('Running...'); my $pid = fork(); if ( $pid ) { # parent - $output = `$cmd --interval 1 --run-time 8 2>&1`; + $output = `$cmd --interval 1 --run-time 4 2>&1`; like( $output, qr/Lost connection.+?Reconnected to slave.+Setting slave to run/ms, @@ -71,11 +72,11 @@ waitpid ($pid, 0); $pid = fork(); if ( $pid ) { # parent. Note the --database mysql - $output = `$cmd --database mysql --interval 1 --run-time 8 2>&1`; + $output = `$cmd --database mysql --interval 1 --run-time 4 2>&1`; like( $output, qr/Lost connection.+?Reconnected to slave.+Setting slave to run/ms, - "Reconnect to slave" + "Reconnect to slave when KILL'ed" ); } else {