add sanity checks and slave-lag synchronization to prevent replication breakage, false sandbox breakage, etc.

This commit is contained in:
Baron Schwartz
2012-06-06 04:47:45 -04:00
parent c2bb24befc
commit 9865ad451d
4 changed files with 82 additions and 40 deletions

View File

@@ -730,6 +730,13 @@ sub get_master_binlog_pos {
return $ms->{position}; return $ms->{position};
} }
# Returns the exec_master_log_pos value reported by SHOW SLAVE STATUS on
# the given handle, for comparison against a master binlog position.
#
# Args:    $dbh - database handle connected to the slave to inspect.
# Returns: the exec_master_log_pos column of the first status row, or
#          undef when the statement returns no row.
#
# NOTE(review): the lower-case key assumes the handle reports lower-cased
# column names (as get_master_binlog_pos also does) -- confirm against the
# code that creates the handle.
sub get_slave_pos_relative_to_master {
   my $dbh    = shift;
   my $status = $dbh->selectrow_hashref('SHOW SLAVE STATUS') || {};
   return $status->{exec_master_log_pos};
}
sub _d { sub _d {
my ($package, undef, $line) = caller 0; my ($package, undef, $line) = caller 0;
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }

View File

@@ -186,26 +186,8 @@ sub wipe_clean {
} }
my $slave2_dbh = $self->get_dbh_for('slave2'); my $slave2_dbh = $self->get_dbh_for('slave2');
my $ok = PerconaTest::wait_until( $self->wait_for_slaves();
sub {
my $dbs = $slave2_dbh->selectall_arrayref("SHOW DATABASES");
if ( grep { $_->[0] !~ m/$test_dbs/ } @$dbs ) {
PTDEVDEBUG && _d('Waiting for databases to drop', Dumper($dbs));
return 0;
}
return 1;
}
);
$slave2_dbh->disconnect; $slave2_dbh->disconnect;
if ( !$ok ) {
# If this happen, chances are ok() is going to throw
# ERROR: Databases are left on slave1: foo
# Or maybe not if by chance the DROP statement replicates
# between now and then.
Test::More::diag("WARNING: Timeout in Sandbox::wipe_clean() "
. "waiting for databases to drop");
}
return; return;
} }
@@ -314,6 +296,8 @@ sub leftover_databases {
sub ok { sub ok {
my ($self) = @_; my ($self) = @_;
my @errors; my @errors;
# First, wait for all slaves to be caught up to their masters.
$self->wait_for_slaves();
push @errors, $self->master_is_ok('master'); push @errors, $self->master_is_ok('master');
push @errors, $self->slave_is_ok('slave1', 'master'); push @errors, $self->slave_is_ok('slave1', 'master');
push @errors, $self->slave_is_ok('slave2', 'slave1', 1); push @errors, $self->slave_is_ok('slave2', 'slave1', 1);
@@ -327,6 +311,23 @@ sub ok {
return !@errors; return !@errors;
} }
# Dings a heartbeat on the master, and waits until slave2 catches up fully to
# that.
#
# The heartbeat is the integer column `a` of row id=1 in
# percona_test.sentinel.  The value written is normally the current epoch
# second, but it is always made strictly greater than the value the master
# already holds.  Otherwise two calls within the same second would write an
# identical value, and the second call would see the slave "caught up"
# immediately, without the slave having applied anything written between
# the two calls.
#
# Returns whatever PerconaTest::wait_until returns for the polling callback.
# NOTE(review): the trailing (undef, 1000) arguments are passed through
# unchanged; presumably the poll interval and the time limit -- confirm
# against PerconaTest::wait_until.
sub wait_for_slaves {
   my $self       = shift;
   my $master_dbh = $self->get_dbh_for('master');
   my $slave2_dbh = $self->get_dbh_for('slave2');
   my $select_sql = "select a from percona_test.sentinel where id = 1";

   # Choose a heartbeat value the slave cannot already have.
   my ($current) = $master_dbh->selectrow_array($select_sql);
   my $now       = time();
   if ( defined $current && $current >= $now ) {
      $now = $current + 1;
   }

   $master_dbh->do("update percona_test.sentinel set a=$now where id = 1");

   return PerconaTest::wait_until(
      sub {
         my ($then) = $slave2_dbh->selectrow_array($select_sql);
         # A missing row (undef) just means "not replicated yet".
         return defined $then && $now == $then;
      }, undef, 1000
   );
}
sub _d { sub _d {
my ($package, undef, $line) = caller 0; my ($package, undef, $line) = caller 0;
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }

View File

@@ -284,13 +284,26 @@ case $opt in
if [ $? -eq 0 -a "$MYSQL_VERSION" '>' "4.1" ]; then if [ $? -eq 0 -a "$MYSQL_VERSION" '>' "4.1" ]; then
echo -n "Loading sakila database... " echo -n "Loading sakila database... "
./load-sakila-db 12345 ./load-sakila-db 12345
../util/checksum-test-dataset
exit_status=$((exit_status | $?)) exit_status=$((exit_status | $?))
if [ $? -ne 0 ]; then if [ $exit_status -ne 0 ]; then
echo "FAILED" echo "FAILED"
else else
echo "OK" echo "OK"
fi fi
../util/checksum-test-dataset
# Create a sentinel row on the 12345 server and wait until the same value is
# readable through the 12347 server, i.e. until replication has delivered it.
# The wait is bounded so that broken replication makes start-up fail with a
# non-zero exit_status instead of hanging forever.
now=$(date +%s)
/tmp/12345/use -e 'create table percona_test.sentinel(id int primary key, a int unsigned)'
/tmp/12345/use -e "insert into percona_test.sentinel(id, a) values(1, $now)"
echo -n "Waiting for replication to finish..."
found=""
waited=0
max_wait=300   # seconds; one poll per second
while [ $waited -lt $max_wait ]; do
   # Errors are discarded: the table may simply not exist on 12347 yet.
   found=$(/tmp/12347/use -ss -e 'select a from percona_test.sentinel where id = 1' 2>/dev/null)
   if [ "$found" = "$now" ]; then
      break
   fi
   echo -n '.'
   sleep 1
   waited=$((waited + 1))
done
if [ "$found" = "$now" ]; then
   echo ' OK'
else
   echo ' FAILED'
   exit_status=$((exit_status | 1))
fi
fi fi
fi fi
if [ $exit_status -eq 0 ]; then if [ $exit_status -eq 0 ]; then

View File

@@ -55,7 +55,7 @@ my $row;
# You must call this sub if the master 12345 or slave1 12346 is restarted, # You must call this sub if the master 12345 or slave1 12346 is restarted,
# else a slave might notice that its master went away and enter the "trying # else a slave might notice that its master went away and enter the "trying
# to reconnect" state, and then replication will break as the tests continue. # to reconnect" state, and then replication will break as the tests continue.
sub reset_slaves { sub restart_slave_threads {
$slave1_dbh->do('STOP SLAVE'); $slave1_dbh->do('STOP SLAVE');
$slave2_dbh->do('STOP SLAVE'); $slave2_dbh->do('STOP SLAVE');
$slave1_dbh->do('START SLAVE'); $slave1_dbh->do('START SLAVE');
@@ -67,6 +67,7 @@ sub reset_slaves {
# ############################################################################# # #############################################################################
# Add a replication filter to the slaves. # Add a replication filter to the slaves.
diag('Stopping 12346 and 12347 to reconfigure them with replication filters');
diag(`/tmp/12347/stop >/dev/null`); diag(`/tmp/12347/stop >/dev/null`);
diag(`/tmp/12346/stop >/dev/null`); diag(`/tmp/12346/stop >/dev/null`);
for my $port ( qw(12346 12347) ) { for my $port ( qw(12346 12347) ) {
@@ -93,16 +94,16 @@ is(
like( like(
$output, $output,
qr/h=127.0.0.1,P=12346/, qr/h=127.0.0.1,P=12346/,
"Warns about replication fitler on slave1" "Warns about replication filter on slave1"
); );
like( like(
$output, $output,
qr/h=127.0.0.1,P=12347/, qr/h=127.0.0.1,P=12347/,
"Warns about replication fitler on slave2" "Warns about replication filter on slave2"
); );
# Disable the check. # Disable the check and run again
$output = output( $output = output(
sub { pt_table_checksum::main(@args, qw(-t sakila.country), sub { pt_table_checksum::main(@args, qw(-t sakila.country),
qw(--no-check-replication-filters)) }, qw(--no-check-replication-filters)) },
@@ -112,10 +113,18 @@ $output = output(
like( like(
$output, $output,
qr/sakila\.country$/, qr/sakila\.country$/,
"--no-check-replication-filters" "--no-check-replication-filters didn't cause warning, and the tool ran"
);
cmp_ok(
PerconaTest::get_master_binlog_pos($master_dbh),
'>',
$pos,
"Did checksum with replication filter"
); );
# Remove the replication filter from the slave. # Remove the replication filter from the slave.
diag('Restarting the slaves again to remove the replication filters');
diag(`/tmp/12347/stop >/dev/null`); diag(`/tmp/12347/stop >/dev/null`);
diag(`/tmp/12346/stop >/dev/null`); diag(`/tmp/12346/stop >/dev/null`);
for my $port ( qw(12346 12347) ) { for my $port ( qw(12346 12347) ) {
@@ -131,17 +140,23 @@ $slave2_dbh = $sb->get_dbh_for('slave2');
# Write some results to master and slave for dbs mysql and sakila. # Write some results to master and slave for dbs mysql and sakila.
$sb->wipe_clean($master_dbh); $sb->wipe_clean($master_dbh);
pt_table_checksum::main(@args, qw(--chunk-time 0 --chunk-size 100), $output = output(
'-t', 'mysql.user,sakila.city', qw(--quiet)); sub {
pt_table_checksum::main(@args, qw(--chunk-time 0 --chunk-size 100),
'-t', 'mysql.user,sakila.city', qw(--quiet));
},
stderr => 1,
);
PerconaTest::wait_for_table($slave1_dbh, 'percona.checksums', "db='sakila' and tbl='city' and chunk=6"); PerconaTest::wait_for_table($slave1_dbh, 'percona.checksums', "db='sakila' and tbl='city' and chunk=6");
# Add a replication filter to the master: ignore db mysql. # Add a replication filter to the master: ignore db mysql.
$master_dbh->disconnect(); $master_dbh->disconnect();
diag('Restarting 12345 to add binlog_ignore_db filter');
diag(`/tmp/12345/stop >/dev/null`); diag(`/tmp/12345/stop >/dev/null`);
diag(`cp /tmp/12345/my.sandbox.cnf /tmp/12345/orig.cnf`); diag(`cp /tmp/12345/my.sandbox.cnf /tmp/12345/orig.cnf`);
diag(`echo "binlog-ignore-db=mysql" >> /tmp/12345/my.sandbox.cnf`); diag(`echo "binlog-ignore-db=mysql" >> /tmp/12345/my.sandbox.cnf`);
diag(`/tmp/12345/start >/dev/null`); diag(`/tmp/12345/start >/dev/null`);
reset_slaves(); restart_slave_threads();
$master_dbh = $sb->get_dbh_for('master'); $master_dbh = $sb->get_dbh_for('master');
# Checksum the tables again in 1 chunk. Since db percona isn't being # Checksum the tables again in 1 chunk. Since db percona isn't being
@@ -158,7 +173,7 @@ $row = $slave1_dbh->selectall_arrayref("select db,tbl,chunk from percona.checksu
is_deeply( is_deeply(
$row, $row,
[[qw(sakila city 1)]], [[qw(sakila city 1)]],
"binlog-ignore-db" "binlog-ignore-db and --empty-replicate-table"
) or print STDERR Dumper($row); ) or print STDERR Dumper($row);
$master_dbh->do("use percona"); $master_dbh->do("use percona");
@@ -177,12 +192,13 @@ wait_until(
# Restore original config. Then add a binlog-do-db filter so master # Restore original config. Then add a binlog-do-db filter so master
# will only replicate statements when USE mysql is in effect. # will only replicate statements when USE mysql is in effect.
$master_dbh->disconnect(); $master_dbh->disconnect();
diag('Restarting master to reconfigure with binlog-do-db filter only');
diag(`/tmp/12345/stop >/dev/null`); diag(`/tmp/12345/stop >/dev/null`);
diag(`cp /tmp/12345/orig.cnf /tmp/12345/my.sandbox.cnf`); diag(`cp /tmp/12345/orig.cnf /tmp/12345/my.sandbox.cnf`);
diag(`echo "binlog-do-db=mysql" >> /tmp/12345/my.sandbox.cnf`); diag(`echo "binlog-do-db=mysql" >> /tmp/12345/my.sandbox.cnf`);
diag(`/tmp/12345/start >/dev/null`); diag(`/tmp/12345/start >/dev/null`);
$master_dbh = $sb->get_dbh_for('master'); $master_dbh = $sb->get_dbh_for('master');
reset_slaves(); restart_slave_threads();
$output = output( $output = output(
sub { pt_table_checksum::main(@args, qw(--no-check-replication-filters), sub { pt_table_checksum::main(@args, qw(--no-check-replication-filters),
@@ -203,7 +219,7 @@ is_deeply(
"binlog-do-do, without --replicate-database" "binlog-do-do, without --replicate-database"
) or print STDERR Dumper($row); ) or print STDERR Dumper($row);
# Now force --replicate-database test and the checksums should not replicate. # Now force --replicate-database sakila and the checksums should not replicate.
$master_dbh->do("use mysql"); $master_dbh->do("use mysql");
$master_dbh->do("truncate table percona.checksums"); $master_dbh->do("truncate table percona.checksums");
wait_until( wait_until(
@@ -218,7 +234,12 @@ $pos = PerconaTest::get_master_binlog_pos($master_dbh);
pt_table_checksum::main(@args, qw(--quiet --no-check-replication-filters), pt_table_checksum::main(@args, qw(--quiet --no-check-replication-filters),
qw(-t mysql.user --replicate-database sakila --no-replicate-check)); qw(-t mysql.user --replicate-database sakila --no-replicate-check));
sleep 1; my $pos_after = PerconaTest::get_master_binlog_pos($master_dbh);
wait_until(
sub {
$pos_after <= PerconaTest::get_slave_pos_relative_to_master($slave1_dbh);
}
);
$row = $slave1_dbh->selectall_arrayref("select * from percona.checksums where db='mysql' AND tbl='user'"); $row = $slave1_dbh->selectall_arrayref("select * from percona.checksums where db='mysql' AND tbl='user'");
ok( ok(
@@ -237,19 +258,19 @@ is(
# ############################################################################# # #############################################################################
# Restore the original config. # Restore the original config.
diag('Restoring original sandbox server configuration');
$master_dbh->disconnect(); $master_dbh->disconnect();
diag(`/tmp/12345/stop >/dev/null`); diag(`/tmp/12345/stop >/dev/null`);
diag(`mv /tmp/12345/orig.cnf /tmp/12345/my.sandbox.cnf`); diag(`mv /tmp/12345/orig.cnf /tmp/12345/my.sandbox.cnf`);
diag(`/tmp/12345/start >/dev/null`); diag(`/tmp/12345/start >/dev/null`);
$master_dbh = $sb->get_dbh_for('master'); $master_dbh = $sb->get_dbh_for('master');
# Reset the slaves and clear the binlogs. # Get the master's binlog pos so we can check its binlogs for USE statements
diag(`$trunk/sandbox/test-env reset`); $row = $master_dbh->selectrow_hashref('show master status');
pt_table_checksum::main(@args, qw(--quiet)); pt_table_checksum::main(@args, qw(--quiet));
$row = $master_dbh->selectrow_hashref('show master status'); $output = `$ENV{PERCONA_TOOLKIT_SANDBOX}/bin/mysqlbinlog /tmp/12345/data/$row->{file} --start-position=$row->{position} | grep 'use ' | grep -v '^# Warning' | sort -u`;
$output = `$ENV{PERCONA_TOOLKIT_SANDBOX}/bin/mysqlbinlog /tmp/12345/data/$row->{file} | grep 'use ' | grep -v '^# Warning' | sort -u`;
is( is(
$output, $output,
@@ -261,12 +282,12 @@ use sakila/*!*/;
"USE each table's database (binlog dump)" "USE each table's database (binlog dump)"
); );
# Clear the binlogs. # Get the master's binlog pos so we can check its binlogs for USE statements
diag(`$trunk/sandbox/test-env reset`); $row = $master_dbh->selectrow_hashref('show master status');
pt_table_checksum::main(@args, qw(--quiet --replicate-database percona)); pt_table_checksum::main(@args, qw(--quiet --replicate-database percona));
$output = `$ENV{PERCONA_TOOLKIT_SANDBOX}/bin/mysqlbinlog /tmp/12345/data/$row->{file} | grep 'use ' | grep -v '^# Warning'`; $output = `$ENV{PERCONA_TOOLKIT_SANDBOX}/bin/mysqlbinlog /tmp/12345/data/$row->{file} --start-position=$row->{position} | grep 'use ' | grep -v '^# Warning'`;
is( is(
$output, $output,
"use percona/*!*/; "use percona/*!*/;