diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index cd8ffc0e..feb51eef 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -5822,7 +5822,6 @@ sub main { if ( ($expl->{key} || '') ne $nibble_iter->nibble_index() ) { MKDEBUG && _d('Chunk', $args{nibbleno}, 'of table', "$tbl->{db}.$tbl->{tbl} not using chunk index, skipping"); - $tbl->{nibble_time} = 0; return 0; # next boundary } @@ -5835,7 +5834,6 @@ sub main { && $oversize_chunk ) { MKDEBUG && _d('Chunk', $args{nibbleno}, 'of table', "$tbl->{db}.$tbl->{tbl} is too large, skipping"); - $tbl->{nibble_time} = 0; return 0; # next boundary } } @@ -5867,12 +5865,18 @@ sub main { my $chunk = $nibble_iter->nibble_number(); # Nibble time will be zero if the chunk was skipped. - if ( $tbl->{nibble_time} == 0 ) { + if ( !defined $tbl->{nibble_time} ) { MKDEBUG && _d('Skipping chunk', $chunk); $tbl->{checksum_results}->{skipped}++; return; } + # Max chunk number that worked. This may be less than the total + # number of chunks if, for example, chunk 16 of 16 times out, but + # chunk 15 worked. The max chunk is used for checking for diffs + # on the slaves, in the done callback. + $tbl->{max_chunk} = $chunk; + # Fetch the checksum that we just executed from the replicate table. $fetch_sth->execute(@{$tbl}{qw(db tbl)}, $chunk); my ($crc, $cnt) = $fetch_sth->fetchrow_array(); @@ -5951,14 +5955,14 @@ sub main { my (%args) = @_; my $tbl = $args{tbl}; my $nibble_iter = $args{NibbleIterator}; - my $max_chunk = $nibble_iter->nibble_number(); + my $max_chunk = $tbl->{max_chunk}; # Don't need to do anything here if we're just --explain'ing. return if $o->get('explain'); # Wait for all slaves to run all checksum chunks, # then check for differences. 
- if ( $o->get('replicate-check') && scalar @$slaves ) { + if ( $max_chunk && $o->get('replicate-check') && scalar @$slaves ) { MKDEBUG && _d('Checking slave diffs'); my $check_pr; @@ -6318,8 +6322,7 @@ sub exec_nibble { if ( $o->get('quiet') < 2 ) { warn "$error\n"; } - $tbl->{checksum_results}->{errors}++; - return 0; # zero nibble time, skip this nibble + return; # skip this nibble } # This die will be caught by the eval inside the TABLE loop. diff --git a/t/pt-table-checksum/error_handling.t b/t/pt-table-checksum/error_handling.t index 50d895d9..88cd61c5 100644 --- a/t/pt-table-checksum/error_handling.t +++ b/t/pt-table-checksum/error_handling.t @@ -23,7 +23,7 @@ if ( !$master_dbh ) { plan skip_all => 'Cannot connect to sandbox master'; } else { - plan tests => 2; + plan tests => 6; } # The sandbox servers run with lock_wait_timeout=3 and it's not dynamic @@ -59,6 +59,57 @@ is( "Only one warning for MySQL error 1265" ); +# ############################################################################ +# Lock wait timeout +# ############################################################################ +$master_dbh->do('use sakila'); +$master_dbh->do('begin'); +$master_dbh->do('select * from city for update'); + +$output = output( + sub { pt_table_checksum::main(@args, qw(-t sakila.city)) }, + stderr => 1, + trf => sub { return PerconaTest::normalize_checksum_results(@_) }, +); + +like( + $output, + qr/Lock wait timeout exceeded/, + "Catches lock wait timeout" +); + +like( + $output, + qr/^0 0 0 1 1 sakila.city/m, + "Skips chunk that times out" +); + +# Lock wait timeout for sandbox servers is 3s, so sleep 4 then commit +# to release the lock. That should allow the checksum query to finish. 
+my ($id) = $master_dbh->selectrow_array('select connection_id()'); +system("(sleep 4 ; /tmp/12345/use -e 'KILL $id' >/dev/null) &"); + +$output = output( + sub { pt_table_checksum::main(@args, qw(-t sakila.city)) }, + stderr => 1, + trf => sub { return PerconaTest::normalize_checksum_results(@_) }, +); + +unlike( + $output, + qr/Lock wait timeout exceeded/, + "Lock wait timeout retried" +); + +like( + $output, + qr/^0 0 600 1 0 sakila.city/m, + "Checksum retried after lock wait timeout" +); + +# Reconnect to master since we just killed ourself. +$master_dbh = $sb->get_dbh_for('master'); + # ############################################################################# # Done. # ############################################################################# diff --git a/t/pt-table-checksum/retry_timeouts.t b/t/pt-table-checksum/retry_timeouts.t deleted file mode 100644 index 11299eab..00000000 --- a/t/pt-table-checksum/retry_timeouts.t +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env perl - -BEGIN { - die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n" - unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH}; - unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib"; -}; - -use strict; -use warnings FATAL => 'all'; -use English qw(-no_match_vars); -use Test::More; - -use PerconaTest; -use Sandbox; -require "$trunk/bin/pt-table-checksum"; - -my $vp = new VersionParser(); -my $dp = new DSNParser(opts=>$dsn_opts); -my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp); -my $dbh = $sb->get_dbh_for('master'); - -if ( !$dbh ) { - plan skip_all => 'Cannot connect to sandbox master'; -} -else { - plan tests => 3; -} - -my $output; -my $cnf = '/tmp/12345/my.sandbox.cnf'; -my @args = ('-F', $cnf, 'h=127.1', qw(--replicate test.checksums --create-replicate-table -t sakila.city)); - -$sb->create_dbs($dbh, ['test']); - -$dbh->do('use sakila'); -$dbh->do('begin'); -$dbh->do('select * from city for update'); - -# city table is now locked until we commit. 
The child proc is going -# to wait 3 seconds for innodb_lock_wait_timeout, then it should try -# again. So if we commit at 4 seconds, the child should succeed and -# the checksum will appear in test.checksums. - -my $pid = fork(); -if ( !$pid ) { - # child - my $output = output( - sub { pt_table_checksum::main(@args) }, - stderr => 1, - ); - exit 0; -} - -sleep 4; -$dbh->do('commit'); - -waitpid ($pid, 0); # reap child - -my $row = $dbh->selectrow_hashref('select * from test.checksums'); -ok( - $row && $row->{db} eq 'sakila' && $row->{tbl} eq 'city', - "Checksum after lock wait timeout" -); - - -# Repeat the test but this time let the retry fail to see that the -# failure is captured. -my $outfile = '/tmp/mk-table-checksum-output.txt'; -diag(`rm -rf $outfile >/dev/null`); - -$dbh->do('truncate table test.checksums'); - -$dbh->do('begin'); -$dbh->do('select * from city for update'); - -$pid = fork(); -if ( !$pid ) { - # child - my $output = output( - sub { pt_table_checksum::main(@args) }, - stderr => 1, - file => $outfile, - ); - exit 0; -} - -sleep 8; -$dbh->do('commit'); - -waitpid ($pid, 0); # reap child - -$row = $dbh->selectrow_hashref('select * from test.checksums'); -ok( - !defined $row, - "No checksum due to lock wait timeout" -); - -$output = `cat $outfile`; -like( - $output, - qr/Lock wait timeout exceeded/i, - "Lock wait timeout exceeded error captured" -); - -diag(`rm -rf $outfile >/dev/null`); - -# ############################################################################# -# Done. -# ############################################################################# -$sb->wipe_clean($dbh); -exit;