Wait for replicas in wait_for_last_checksum(). Add 'Server shutdown in progress' to Cxn::lost_connection().

This commit is contained in:
Daniel Nichter
2015-06-24 19:33:01 -07:00
parent 7764955209
commit df2dc82d11
3 changed files with 47 additions and 18 deletions

.gitignore — vendored, Normal file, 1 addition
View File

@@ -0,0 +1 @@
*.swp

View File

@@ -3619,7 +3619,8 @@ sub lost_connection {
my ($self, $e) = @_; my ($self, $e) = @_;
return 0 unless $e; return 0 unless $e;
return $e =~ m/MySQL server has gone away/ return $e =~ m/MySQL server has gone away/
|| $e =~ m/Lost connection to MySQL server/; || $e =~ m/Lost connection to MySQL server/
|| $e =~ m/Server shutdown in progress/;
} }
sub dbh { sub dbh {
@@ -10299,6 +10300,7 @@ sub main {
slaves => $slaves, slaves => $slaves,
max_chunk => $max_chunk, max_chunk => $max_chunk,
check_pr => $check_pr, check_pr => $check_pr,
have_time => $have_time,
OptionParser => $o, OptionParser => $o,
); );
@@ -11102,7 +11104,7 @@ sub check_slave_tables {
warn ts($msg); warn ts($msg);
$have_warned = 1; $have_warned = 1;
} }
sleep 2; # wait between failed reconnects attempts sleep 2; # wait between failed reconnect attempts
} }
next; # try again next; # try again
} # eval error } # eval error
@@ -11334,11 +11336,11 @@ sub have_more_chunks {
sub wait_for_last_checksum { sub wait_for_last_checksum {
my (%args) = @_; my (%args) = @_;
my @required_args = qw(tbl repl_table slaves max_chunk OptionParser); my @required_args = qw(tbl repl_table slaves max_chunk have_time OptionParser);
foreach my $arg ( @required_args ) { foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless defined $args{$arg}; die "I need a $arg argument" unless defined $args{$arg};
} }
my ($tbl, $repl_table, $slaves, $max_chunk, $o) = @args{@required_args}; my ($tbl, $repl_table, $slaves, $max_chunk, $have_time, $o) = @args{@required_args};
my $check_pr = $args{check_pr}; my $check_pr = $args{check_pr};
# Requiring "AND master_crc IS NOT NULL" avoids a race condition # Requiring "AND master_crc IS NOT NULL" avoids a race condition
@@ -11354,8 +11356,11 @@ sub wait_for_last_checksum {
my $n_slaves = scalar @$slaves - 1; my $n_slaves = scalar @$slaves - 1;
my @chunks; my @chunks;
my %skip_slave; my %skip_slave;
while ( $oktorun && ($chunks[0] || 0) < $max_chunk ) { my %have_warned;
my $checked_all;
while ( $oktorun && $have_time->() && (!$checked_all || (($chunks[0] || 0) < $max_chunk)) ) {
@chunks = (); @chunks = ();
$checked_all = 1;
for my $i ( 0..$n_slaves ) { for my $i ( 0..$n_slaves ) {
my $slave = $slaves->[$i]; my $slave = $slaves->[$i];
if ( $skip_slave{$i} ) { if ( $skip_slave{$i} ) {
@@ -11363,16 +11368,35 @@ sub wait_for_last_checksum {
'due to previous error it caused'); 'due to previous error it caused');
next; next;
} }
PTDEBUG && _d('Getting last checksum on', $slave->name());
eval { eval {
my ($chunk) = $slave->dbh()->selectrow_array($sql); my ($chunk) = $slave->dbh()->selectrow_array($sql);
PTDEBUG && _d($slave->name(), 'max chunk:', $chunk); PTDEBUG && _d($slave->name(), 'max chunk:', $chunk);
push @chunks, $chunk || 0; push @chunks, $chunk || 0;
}; };
if (my $e = $EVAL_ERROR) {
PTDEBUG && _d($e);
if ( $slave->lost_connection($e) ) {
if ( !$have_warned{$i} && $o->get('quiet') < 2 ) {
warn ts("Lost connection to " . $slave->name() . " while "
. "waiting for the last checksum of table "
. "$tbl->{db}.$tbl->{tbl} to replicate. Will reconnect "
. "and try again. No more warnings for this replica will "
. "be printed.\n");
$have_warned{$i}++;
}
eval { $slave->connect() };
if ( $EVAL_ERROR ) { if ( $EVAL_ERROR ) {
PTDEBUG && _d($EVAL_ERROR);
sleep 1; # wait between failed reconnect attempts
}
$checked_all = 0;
}
else {
if ( $o->get('quiet') < 2 ) { if ( $o->get('quiet') < 2 ) {
warn ts("Error waiting for the last checksum of table " warn ts("Error waiting for the last checksum of table "
. "$tbl->{db}.$tbl->{tbl} to replicate to " . "$tbl->{db}.$tbl->{tbl} to replicate to "
. "replica " . $slave->name() . ": $EVAL_ERROR\n" . "replica " . $slave->name() . ": $e\n"
. "Check that the replica is running and has the " . "Check that the replica is running and has the "
. "replicate table $repl_table. Checking the replica " . "replicate table $repl_table. Checking the replica "
. "for checksum differences will probably cause " . "for checksum differences will probably cause "
@@ -11380,9 +11404,11 @@ sub wait_for_last_checksum {
} }
$tbl->{checksum_results}->{errors}++; $tbl->{checksum_results}->{errors}++;
$skip_slave{$i} = 1; $skip_slave{$i} = 1;
}
next; next;
} }
} }
# If we have no chunks, which can happen if the slaves # If we have no chunks, which can happen if the slaves
# were skipped due to errors, then @chunks will be empty # were skipped due to errors, then @chunks will be empty
# and nothing of the following applies. In fact, it # and nothing of the following applies. In fact, it

View File

@@ -199,9 +199,11 @@ sub lost_connection {
my ($self, $e) = @_; my ($self, $e) = @_;
return 0 unless $e; return 0 unless $e;
return $e =~ m/MySQL server has gone away/ return $e =~ m/MySQL server has gone away/
|| $e =~ m/Lost connection to MySQL server/; || $e =~ m/Lost connection to MySQL server/
|| $e =~ m/Server shutdown in progress/;
# The 1st pattern means that MySQL itself died or was stopped. # The 1st pattern means that MySQL itself died or was stopped.
# The 2nd pattern means that our cxn was killed (KILL <id>). # The 2nd pattern means that our cxn was killed (KILL <id>).
# The 3rd pattern means MySQL is about to shut down.
} }
# Sub: dbh # Sub: dbh