Merge ~percona-toolkit-dev/percona-toolkit/fix-ptc-checksum-tbl-bug-1008778.

This commit is contained in:
Daniel Nichter
2012-06-05 13:08:21 -04:00
5 changed files with 136 additions and 31 deletions

View File

@@ -6274,6 +6274,7 @@ sub main {
check_repl_table(
dbh => $master_dbh,
repl_table => $repl_table,
slaves => $slaves,
OptionParser => $o,
TableParser => $tp,
Quoter => $q,
@@ -7210,11 +7211,11 @@ sub print_checksum_diffs {
sub check_repl_table {
my ( %args ) = @_;
my @required_args = qw(dbh repl_table OptionParser TableParser Quoter);
my @required_args = qw(dbh repl_table slaves OptionParser TableParser Quoter);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $repl_table, $o, $tp, $q) = @args{@required_args};
my ($dbh, $repl_table, $slaves, $o, $tp, $q) = @args{@required_args};
PTDEBUG && _d('Checking --replicate table', $repl_table);
# If the repl db doesn't exist, auto-create it, maybe.
@@ -7265,6 +7266,43 @@ sub check_repl_table {
. "$repl_table.\n" unless $have_tbl_privs;
}
# Check and wait for the repl table to appear on all slaves.
# https://bugs.launchpad.net/percona-toolkit/+bug/1008778
if ( scalar @$slaves ) {
my $waiting_for;
my $pr;
if ( $o->get('progress') ) {
$pr = new Progress(
jobsize => scalar @$slaves,
spec => $o->get('progress'),
callback => sub {
print STDERR "Waiting for the --replicate table to replicate to "
. $waiting_for->name() . "...\n";
},
);
$pr->start();
}
foreach my $slave ( @$slaves ) {
PTDEBUG && _d('Checking if', $slave->name(), 'has repl table');
$waiting_for = $slave;
my $slave_has_repl_table = $tp->check_table(
dbh => $slave->dbh(),
db => $db,
tbl => $tbl,
);
while ( !$slave_has_repl_table ) {
$pr->update(sub { return 0; }) if $pr;
sleep 0.5;
$slave_has_repl_table = $tp->check_table(
dbh => $slave->dbh(),
db => $db,
tbl => $tbl,
);
}
}
}
return; # success, repl table is ready to go
}

View File

@@ -743,7 +743,7 @@ sub _d {
# This is because otherwise, errors thrown during cleanup
# would be skipped.
sub full_output {
my ( $code ) = @_;
my ( $code, %args ) = @_;
die "I need a code argument" unless $code;
my (undef, $file) = tempfile();
@@ -756,6 +756,19 @@ sub full_output {
my $status;
warn $file;
if (my $pid = fork) {
if ( my $t = $args{wait_for} ) {
# Wait for t seconds then kill the child.
sleep $t;
my $tries = 3;
# Most tools require 2 interrupts to make them stop.
while ( kill(0, $pid) && $tries-- ) {
kill SIGTERM, $pid;
sleep 0.10;
}
# Child didn't respond to SIGTERM? Then kill -9 it.
kill SIGKILL, $pid if kill(0, $pid);
sleep 0.25;
}
waitpid($pid, 0);
$status = $?;
}

View File

@@ -39,10 +39,9 @@ elsif ( !$slave2_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave2';
}
else {
plan tests => 4;
plan tests => 5;
}
# The sandbox servers run with lock_wait_timeout=3 and it's not dynamic
# so we need to specify --lock-wait-timeout=3 else the tool will die.
# And --max-load "" prevents waiting for status variables. Setting
@@ -99,6 +98,40 @@ is(
# Now wait until the SQL thread is started again.
wait_until_slave_running($slave1_dbh, $slave2_dbh);
# #############################################################################
# Wait for --replicate table to replicate.
# https://bugs.launchpad.net/percona-toolkit/+bug/1008778
# #############################################################################

# Start from a state where the percona database exists on neither the master
# nor slave2 (presumably this is the db that holds the --replicate table --
# confirm against the tool's default).  wait_until() polls the callback until
# it returns true, i.e. until slave2 no longer lists the database.
$master_dbh->do("DROP DATABASE IF EXISTS percona");
wait_until(sub {
   my $dbs = $slave2_dbh->selectall_arrayref("SHOW DATABASES");
   return !grep { $_->[0] eq 'percona' } @$dbs;
});

# Load the DSN table that --recursion-method dsn=...,t=dsns.dsns reads below.
$sb->load_file('master', "t/pt-table-checksum/samples/dsn-table.sql");

# Stop replication on slave2 so that nothing created on the master from here
# on can reach it.  Wait for the I/O thread to report 'No': waiting for 'Yes'
# after STOP SLAVE can never succeed and only burns wait_until()'s timeout.
$slave2_dbh->do("STOP SLAVE");
wait_until(sub {
   my $ss = $slave2_dbh->selectrow_hashref("SHOW SLAVE STATUS");
   return $ss->{slave_io_running} eq 'No';
});

# Run the tool in a child process.  It is expected to block waiting for the
# --replicate table to show up on the stopped slave, so full_output() is told
# to kill it after a few seconds and hand back whatever was printed.
($output) = PerconaTest::full_output(
   sub { pt_table_checksum::main(@args, qw(-t sakila.country),
      "--recursion-method", "dsn=F=/tmp/12345/my.sandbox.cnf,t=dsns.dsns");
   },
   wait_for => 3,  # wait this many seconds then kill that ^
);

like(
   $output,
   qr/Waiting for the --replicate table to replicate to h=127.1,P=12347/,
   "--progress for --replicate table (bug 1008778)"
);

# Restore replication on slave2 for the tests that follow.
$slave2_dbh->do("START SLAVE");
wait_until_slave_running($slave2_dbh);
# #############################################################################
# Done.
# #############################################################################

View File

@@ -0,0 +1,15 @@
-- Rebuild the dsns database from scratch: one table, dsns.dsns, holding
-- one DSN string per row.  Rows are linked through parent_id
-- (row 2 -> parent 1, row 3 -> parent 2).
DROP DATABASE IF EXISTS dsns;
CREATE DATABASE dsns;
USE dsns;

CREATE TABLE dsns (
  id        INT          NOT NULL AUTO_INCREMENT,
  parent_id INT          DEFAULT NULL,
  dsn       VARCHAR(255) NOT NULL,
  PRIMARY KEY (id)
);

-- The master's own row is deliberately left out:
--   (1, NULL, 'h=127.1,P=12345,u=msandbox,p=msandbox')
INSERT INTO dsns (id, parent_id, dsn) VALUES
  (2, 1, 'h=127.1,P=12346,u=msandbox,p=msandbox'),
  (3, 2, 'h=127.1,P=12347,u=msandbox,p=msandbox');

View File

@@ -11,6 +11,8 @@ use warnings FATAL => 'all';
use English qw(-no_match_vars);
use Test::More;
$ENV{PERCONA_TOOLKIT_TEST_USE_DSN_NAMES} = 1;
use PerconaTest;
use Sandbox;
shift @INC; # our unshift (above)
@@ -33,41 +35,50 @@ elsif ( !$slave2_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave2';
}
else {
plan tests => 5;
plan tests => 4;
}
# The sandbox servers run with lock_wait_timeout=3 and it's not dynamic
# so we need to specify --lock-wait-timeout=3 else the tool will die.
# And --max-load "" prevents waiting for status variables.
my $master_dsn = 'h=127.1,P=12345,u=msandbox,p=msandbox';
my @args = ($master_dsn, qw(--lock-wait-timeout 3), '--max-load', '');
my @args = ($master_dsn, qw(--lock-wait-timeout 3), '--max-load', '',
'--progress', 'time,1');
my $output;
my $row;
my $exit_status;
wait_until( # slaves aren't lagging
sub {
$row = $slave1_dbh->selectrow_hashref('show slave status');
return 0 if $row->{Seconds_Behind_Master};
$row = $slave2_dbh->selectrow_hashref('show slave status');
return 0 if $row->{Seconds_Behind_Master};
return 1;
}
) or die "Slaves are still lagging";
# Create the checksum table, else stopping the slave below
# will cause the tool to wait forever for the --replicate
# table to replicate to the stopped slave.
pt_table_checksum::main(@args, qw(-t sakila.city --quiet));
# ############################################################################
# --check-slave-lag
# ############################################################################
# Stop slave1.
PerconaTest::wait_until_no_lag($slave1_dbh, $slave2_dbh);
$slave1_dbh->do('stop slave sql_thread');
$row = $slave1_dbh->selectrow_hashref('show slave status');
is(
$row->{slave_sql_running},
'No',
'Stopped slave SQL thread on slave1'
wait_until(sub {
my $ss = $slave1_dbh->selectrow_hashref("SHOW SLAVE STATUS");
return $ss->{slave_sql_running} eq 'Yes';
});
# Try to checksum, but since slave1 is stopped, the tool should
# wait for it to stop "lagging".
($output) = PerconaTest::full_output(
sub { pt_table_checksum::main(@args, qw(-t sakila.city)) },
wait_for => 3,
);
like(
$output,
qr/Replica h=127.0.0.1,P=12346 is stopped/,
"Waits for stopped replica"
);
# Checksum but only use slave2 to check for lag.
$exit_status = pt_table_checksum::main(@args, qw(-t sakila.city --quiet),
qw(--no-replicate-check), '--check-slave-lag', 'P=12347');
@@ -84,15 +95,10 @@ is(
"Checksummed table"
);
# Start slave2 sql_thread and stop slave1 sql_thread and test that
# mk-table-checksum is really checking and waiting for just --slave-lag-dbh.
$slave1_dbh->do('start slave sql_thread');
$row = $slave1_dbh->selectrow_hashref('show slave status');
is(
$row->{slave_sql_running},
'Yes',
'Started slave SQL thread on slave1'
) or BAIL_OUT("Failed to restart SQL thread on slave2 (12347)");
$slave1_dbh->do('START SLAVE sql_thread');
$slave2_dbh->do('STOP SLAVE');
$slave2_dbh->do('START SLAVE');
PerconaTest::wait_until_slave_running($slave1_dbh, $slave2_dbh);
# #############################################################################
# Done.