Merge Kenny's pt-slave-restart-gtid-support branch.

This commit is contained in:
Daniel Nichter
2014-05-28 15:25:08 -07:00
7 changed files with 419 additions and 13 deletions

View File

@@ -4766,6 +4766,12 @@ use sigtrap qw(handler finish untrapped normal-signals);
use Percona::Toolkit;
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
use Data::Dumper;
local $Data::Dumper::Indent = 1;
local $Data::Dumper::Sortkeys = 1;
local $Data::Dumper::Quotekeys = 0;
$OUTPUT_AUTOFLUSH = 1;
my $o;
@@ -4976,10 +4982,36 @@ sub watch_server {
$start_sql .= " UNTIL RELAY_LOG_FILE = '$file', RELAY_LOG_POS = $pos";
}
my $set_skip = $dbh->prepare("SET GLOBAL SQL_SLAVE_SKIP_COUNTER = "
. $o->get('skip-count'));
my $start = $dbh->prepare($start_sql);
my $stop = $dbh->prepare('STOP SLAVE');
my $start = $dbh->prepare($start_sql);
my $stop = $dbh->prepare('STOP SLAVE');
# ########################################################################
# Detect if GTID is enabled. Skipping an event is done differently.
# ########################################################################
# When MySQL 5.6.5 or higher is used and gtid is enabled, skipping a
# transaction is not possible with SQL_SLAVE_SKIP_COUNTER
my $skip_event;
my $have_gtid = 0;
if ( VersionParser->new($dbh) >= '5.6.5' ) {
my $row = $dbh->selectrow_arrayref('SELECT @@GLOBAL.gtid_mode');
PTDEBUG && _d('@@GLOBAL.gtid_mode:', $row->[0]);
if ( $row && $row->[0] eq 'ON' ) {
$have_gtid = 1;
}
}
PTDEBUG && _d('Have GTID:', $have_gtid);
# If GTID is enabled, slave_parallel_workers should be == 0.
# It's currently not possible to know what GTID event the failed trx is.
if ( $have_gtid ) {
my $threads = $dbh->selectrow_hashref(
'SELECT @@GLOBAL.slave_parallel_workers AS threads');
if ( $threads->{threads} > 0 ) {
die "Cannot skip transactions properly because GTID is enabled "
. "and slave_parallel_workers > 0. See 'GLOBAL TRANSACTION IDS' "
. "in the tool's documentation.\n";
}
}
# ########################################################################
# Lookup tables of things to do when a problem is detected.
@@ -4989,7 +5021,7 @@ sub watch_server {
[ qr/Could not parse relay log event entry/ => 'refetch_relay_log' ],
[ qr/Incorrect key file for table/ => 'repair_table' ],
# This must be the last one. It's a catch-all rule: skip and restart.
[ qr/./ => 'skip' ],
[ qr/./ => ($have_gtid ? 'skip_gtid' : 'skip') ],
);
# ########################################################################
@@ -5012,9 +5044,61 @@ sub watch_server {
},
skip => sub {
   # Handler that skips events by setting the global
   # SQL_SLAVE_SKIP_COUNTER to the value of --skip-count.
   #   $stat - slave status passed by the caller (not used here)
   #   $dbh  - database handle the statement is run on
   # Returns whatever the statement handle's execute() returns.
   my ( $stat, $dbh ) = @_;
   PTDEBUG && _d('Found non-relay-log error');
   my $skip_sql = "SET GLOBAL SQL_SLAVE_SKIP_COUNTER = "
                . $o->get('skip-count');
   my $skip_sth = $dbh->prepare($skip_sql);
   $skip_sth->execute();
},
skip_gtid => sub {
   # Handler that skips transactions on a GTID-enabled slave by committing
   # empty transactions for the GTIDs that follow the highest transaction
   # number already executed for the selected UUID.
   #   $stat - hashref of slave status; master_uuid and executed_gtid_set
   #           are read from it
   #   $dbh  - database handle the statements are run on
   # Dies if no UUID can be determined or if no executed transaction
   # numbers are found for that UUID.
   my ( $stat, $dbh ) = @_;

   # Get master_uuid from SHOW SLAVE STATUS if a UUID is not specified
   # with --master-uuid.
   my $gtid_uuid = $o->get('master-uuid');
   if ( !$gtid_uuid ) {
      $gtid_uuid = $stat->{master_uuid};
      die "No master_uuid" unless $gtid_uuid;  # shouldn't happen
   }

   # We need the highest transaction in the executed_gtid_set,
   # and then we need to increase it by 1 (the one we want to skip).
   # Notes:
   #   - does not work with parallel replication
   #   - it skips the next transaction from the master_uuid
   #     (when a slaveB is replicating from slaveA,
   #     the master_uuid is its own master, slaveA)
   # The UUID is quoted so it is always matched literally, even when it
   # comes from the command line.
   my ($gtid_exec_ids) = ($stat->{executed_gtid_set} || '')
                       =~ m/\Q$gtid_uuid\E([0-9:-]*)/;

   # Check before modifying the value: it is undef when the UUID does not
   # appear in executed_gtid_set at all.
   die "No executed GTIDs" unless $gtid_exec_ids;

   # Reduce every interval "N-M" to its upper bound "M".  The lower bound
   # can have any number of digits (e.g. ":10-25"), so match [0-9]+ and
   # not just a single digit.
   $gtid_exec_ids =~ s/:[0-9]+-/:/g;

   # The captured string starts with ':', so split() yields an empty
   # leading field; drop empty fields and sort numerically so the highest
   # transaction number is last.
   my @gtid_exec_sorted = sort { $a <=> $b }
                          grep { defined($_) && length($_) }
                          split(/:/, $gtid_exec_ids);
   die "No executed GTIDs" unless @gtid_exec_sorted;
   my $gtid_exec_last = $gtid_exec_sorted[-1];

   PTDEBUG && _d("\n",
      "GTID: master_uuid:", $gtid_uuid, "\n",
      "GTID: executed_gtid_set:", $gtid_exec_ids, "\n",
      "GTID: max for master_uuid:", $gtid_exec_sorted[-1], "\n",
      "GTID: last executed gtid:", $gtid_uuid, ":", $gtid_exec_last);

   # Set the session's next gtid, write an empty transaction.
   # Repeated --skip-count times, each time for the next higher number.
   my $skipped = 0;
   while ( $skipped++ < $o->get('skip-count') ) {
      my $gtid_next = $gtid_exec_last + $skipped;
      my $sql       = "SET GTID_NEXT='$gtid_uuid:$gtid_next'";
      PTDEBUG && _d($sql);
      my $sth = $dbh->prepare($sql);
      $sth->execute();
      $dbh->begin_work();
      $dbh->commit();
   }

   # Set the session back to the automatically generated GTID_NEXT.
   $dbh->do("SET GTID_NEXT='AUTOMATIC'");
},
repair_table => sub {
my ( $stat, $dbh ) = @_;
PTDEBUG && _d('Found corrupt table');
@@ -5301,6 +5385,28 @@ sleep time, whichever is less.
=back
=head1 GLOBAL TRANSACTION IDS
As of Percona Toolkit 2.2.8, pt-slave-restart supports Global Transaction IDs
introduced in MySQL 5.6.5. It's important to keep in mind that:
=over
=item *
pt-slave-restart cannot skip transactions when multiple replication threads
are being used (slave_parallel_workers > 0), and it exits with an error in
that case. The tool cannot determine the GTID of the transaction that failed
in a specific slave thread.
=item *
The default behavior is to skip the next transaction from the slave's master.
Writes can originate on different servers, each with their own UUID.
See L<"--master-uuid">.
=back
=head1 EXIT STATUS
An exit status of 0 (sometimes also called a return value or return code)
@@ -5555,6 +5661,26 @@ type: int; default: 1
Number of statements to skip when restarting the slave.
=item --master-uuid
type: string
When using GTID, an empty transaction should be created in order to skip it.
If writes are coming from different nodes in the replication tree above, it is
not possible to know which event from which UUID to skip.
By default, transactions from the slave's master (C<'Master_UUID'> from
C<SHOW SLAVE STATUS>) are skipped.
For example, with
master1 -> slave1 -> slave2
When skipping events on slave2 that were written to master1, you must specify
the UUID of master1, else the tool will use the UUID of slave1 by default.
See L<"GLOBAL TRANSACTION IDS">.
=item --sleep
type: int; default: 1

View File

@@ -379,6 +379,7 @@ sub verify_test_data {
# Diff the two sets of checksums: host to master (ref).
my @diffs;
foreach my $c ( @checksums ) {
next unless $c->{checksum};
if ( $c->{checksum} ne $ref->{$c->{table}}->{checksum} ) {
push @diffs, $c->{table};
}

View File

@@ -113,6 +113,14 @@ make_sandbox() {
echo "query_cache_size=$QUERY_CACHE_SIZE" >> /tmp/$port/my.sandbox.cnf
fi
if [ -n "$GTID" ]; then
echo "gtid_mode=on" >> /tmp/$port/my.sandbox.cnf
echo "enforce_gtid_consistency" >> /tmp/$port/my.sandbox.cnf
fi
if [ -n "$REPLICATION_THREADS" ]; then
echo "slave_parallel_workers=$REPLICATION_THREADS" >> /tmp/$port/my.sandbox.cnf
fi
if [ -n "$EXTRA_DEFAULTS_FILE" ]; then
cat "$EXTRA_DEFAULTS_FILE" >> /tmp/$port/my.sandbox.cnf
fi

View File

@@ -315,13 +315,16 @@ case $opt in
fi
if [ $? -eq 0 -a "$MYSQL_VERSION" '>' "4.1" ]; then
echo -n "Loading sakila database... "
./load-sakila-db 12345 "${2:-""}"
exit_status=$((exit_status | $?))
if [ $exit_status -ne 0 ]; then
echo "FAILED"
else
echo "OK"
SAKILA=${SAKILA:-1}
if [ $SAKILA -eq 1 ]; then
echo -n "Loading sakila database... "
./load-sakila-db 12345 "${2:-""}"
exit_status=$((exit_status | $?))
if [ $exit_status -ne 0 ]; then
echo "FAILED"
else
echo "OK"
fi
fi
# Create percona_test db and checksum all the tables.

203
t/pt-slave-restart/gtid.t Normal file
View File

@@ -0,0 +1,203 @@
#!/usr/bin/env perl
BEGIN {
die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
};
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use Test::More;
use PerconaTest;
use Sandbox;
require "$trunk/bin/pt-slave-restart";
if ( $sandbox_version lt '5.6' ) {
plan skip_all => "Requires MySQL 5.6";
}
diag(`SAKILA=0 GTID=1 $trunk/sandbox/test-env restart`);
my $dp = new DSNParser(opts=>$dsn_opts);
my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
my $master_dbh = $sb->get_dbh_for('master');
my $slave1_dbh = $sb->get_dbh_for('slave1');
my $slave2_dbh = $sb->get_dbh_for('slave2');
if ( !$master_dbh ) {
plan skip_all => 'Cannot connect to sandbox master';
}
elsif ( !$slave1_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave1';
}
elsif ( !$slave2_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave2';
}
my $slave1_dsn = $sb->dsn_for("slave1");
my $slave2_dsn = $sb->dsn_for("slave2");
my $pid_file = "/tmp/pt-slave-restart-test-$PID.pid";
my $log_file = "/tmp/pt-slave-restart-test-$PID.log";
my $cmd = "$trunk/bin/pt-slave-restart --daemonize --run-time 5 --max-sleep .25 --pid $pid_file --log $log_file";
# Launch pt-slave-restart (the shared $cmd) with the given extra
# command-line arguments, after making sure no previous instance is running.
# Returns nothing if the previous instance could not be stopped; otherwise
# returns the result of waiting for the PID file.
sub start {
   my $extra_args = shift;
   return unless stop();
   system("$cmd $extra_args");
   return PerconaTest::wait_for_files($pid_file);
}
# Ask a running pt-slave-restart to stop and wait for its PID file to
# disappear.  Returns 1 if no instance is running (before or after the
# attempt), 0 if one is still running afterwards.
sub stop() {
   unless ( is_running() ) {
      return 1;
   }
   diag(`$trunk/bin/pt-slave-restart --stop -q >/dev/null 2>&1 &`);
   my $pid_file_gone = sub { !-f $pid_file };
   wait_until($pid_file_gone, 0.3, 2);
   # Remove the sentinel file so it does not affect later runs.
   diag(`rm -f /tmp/pt-slave-restart-sentinel`);
   return 0 if is_running();
   return 1;
}
# Report whether a pt-slave-restart started with $cmd shows up in the
# process list.  Returns 1 if it does, 0 otherwise; when no process is found
# but the PID file still exists, the PID file is removed.
sub is_running {
   my $ps_match = `ps -eaf | grep -v grep | grep '$cmd'`;
   chomp $ps_match;
   return 1 if $ps_match;

   # No matching process: clean up a leftover PID file, if any.
   if ( -f $pid_file ) {
      diag(`rm -f $pid_file`);
   }
   return 0;
}
# Wait until SHOW SLAVE STATUS on the given handle reports a true
# last_sql_errno.  Returns the result of wait_until().
sub wait_repl_broke {
   my ( $dbh ) = @_;
   my $has_sql_error = sub {
      my $status = $dbh->selectrow_hashref('show slave status');
      return $status->{last_sql_errno};
   };
   return wait_until($has_sql_error);
}
# Wait until SHOW SLAVE STATUS on the given handle reports
# last_sql_errno == 0.  Returns the result of wait_until().
# NOTE(review): 0.30 and 5 are presumably the poll interval and the maximum
# wait in seconds -- confirm against wait_until() in PerconaTest.
sub wait_repl_ok {
   my ( $dbh ) = @_;
   my $sql_error_cleared = sub {
      my $status = $dbh->selectrow_hashref('show slave status');
      return $status->{last_sql_errno} == 0;
   };
   return wait_until($sql_error_cleared, 0.30, 5);
}
# #############################################################################
# Basic test to see if restart works with GTID.
# #############################################################################
$master_dbh->do('DROP DATABASE IF EXISTS test');
$master_dbh->do('CREATE DATABASE test');
$master_dbh->do('CREATE TABLE test.t (a INT)');
$sb->wait_for_slaves;
# Bust replication
$slave1_dbh->do('DROP TABLE test.t');
$master_dbh->do('INSERT INTO test.t SELECT 1');
wait_repl_broke($slave1_dbh) or die "Failed to break replication";
my $r = $slave1_dbh->selectrow_hashref('show slave status');
like($r->{last_error}, qr/Table 'test.t' doesn't exist'/, 'slave: Replication broke');
# Start pt-slave-restart and wait up to 5s for it to fix replication
# (it should take < 1s but tests can be really slow sometimes).
start("$slave1_dsn") or die "Failed to start pt-slave-restart";
wait_repl_ok($slave1_dbh);
# Check if replication is fixed.
$r = $slave1_dbh->selectrow_hashref('show slave status');
like(
$r->{last_errno},
qr/^0$/,
'Event is skipped',
) or BAIL_OUT("Replication is broken");
# Stop pt-slave-restart.
stop() or die "Failed to stop pt-slave-restart";
# #############################################################################
# Test the slave of the master.
# #############################################################################
$master_dbh->do('DROP DATABASE IF EXISTS test');
$master_dbh->do('CREATE DATABASE test');
$master_dbh->do('CREATE TABLE test.t (a INT)');
$sb->wait_for_slaves;
# Bust replication
$slave2_dbh->do('DROP TABLE test.t');
$master_dbh->do('INSERT INTO test.t SELECT 1');
wait_repl_broke($slave2_dbh) or die "Failed to break replication";
# fetch the master uuid, which is the machine we need to skip an event from
$r = $master_dbh->selectrow_hashref('select @@GLOBAL.server_uuid as uuid');
my $uuid = $r->{uuid};
$r = $slave2_dbh->selectrow_hashref('show slave status');
like($r->{last_error}, qr/Table 'test.t' doesn't exist'/, 'slaveofslave: Replication broke');
# Start an instance
start("--master-uuid=$uuid $slave2_dsn") or die;
wait_repl_ok($slave2_dbh);
$r = $slave2_dbh->selectrow_hashref('show slave status');
like(
$r->{last_errno},
qr/^0$/,
'Skips event from master on slave2'
) or BAIL_OUT("Replication is broken");
stop() or die "Failed to stop pt-slave-restart";
# #############################################################################
# Test skipping 2 events in a row.
# #############################################################################
$master_dbh->do('DROP DATABASE IF EXISTS test');
$master_dbh->do('CREATE DATABASE test');
$master_dbh->do('CREATE TABLE test.t (a INT)');
$sb->wait_for_slaves;
# Bust replication
$slave2_dbh->do('DROP TABLE test.t');
$master_dbh->do('INSERT INTO test.t SELECT 1');
$master_dbh->do('INSERT INTO test.t SELECT 1');
wait_repl_broke($slave2_dbh) or die "Failed to break replication";
# fetch the master uuid, which is the machine we need to skip an event from
$r = $master_dbh->selectrow_hashref('select @@GLOBAL.server_uuid as uuid');
$uuid = $r->{uuid};
$r = $slave2_dbh->selectrow_hashref('show slave status');
like($r->{last_error}, qr/Table 'test.t' doesn't exist'/, 'slaveofslaveskip2: Replication broke');
# Start an instance
start("--skip-count=2 --master-uuid=$uuid $slave2_dsn") or die;
wait_repl_ok($slave2_dbh);
$r = $slave2_dbh->selectrow_hashref('show slave status');
like(
$r->{last_errno},
qr/^0$/,
'Skips multiple events'
) or BAIL_OUT("Replication is broken");
stop() or die "Failed to stop pt-slave-restart";
# #############################################################################
# Done.
# #############################################################################
diag(`rm -f $pid_file $log_file >/dev/null`);
diag(`$trunk/sandbox/test-env restart`);
ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
done_testing;

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env perl
BEGIN {
die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
};
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use Test::More;
use PerconaTest;
use Sandbox;
require "$trunk/bin/pt-slave-restart";
if ( $sandbox_version lt '5.6' ) {
plan skip_all => 'MySQL Version ' . $sandbox_version
. ' < 5.6, GTID is not available, skipping tests';
}
diag("Stopping/reconfiguring/restarting sandboxes 12345, 12346 and 12347");
diag(`$trunk/sandbox/test-env stop >/dev/null`);
diag(`REPLICATION_THREADS=2 GTID=1 $trunk/sandbox/test-env start >/dev/null`);
my $dp = new DSNParser(opts=>$dsn_opts);
my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
my $master_dbh = $sb->get_dbh_for('master');
my $slave_dbh = $sb->get_dbh_for('slave1');
my $slave2_dbh = $sb->get_dbh_for('slave2');
if ( !$master_dbh ) {
plan skip_all => 'Cannot connect to sandbox master';
}
elsif ( !$slave_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave1';
}
elsif ( !$slave2_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave2';
}
# #############################################################################
# pt-slave-restart should exit!
# #############################################################################
# Run the tool against the slave on port 12346 and capture STDOUT and STDERR.
# With GTID enabled and slave_parallel_workers > 0 it must die; the pattern
# has to match the tool's fatal message ("Cannot skip transactions properly
# because GTID is enabled and slave_parallel_workers > 0. ...").
my $output=`$trunk/bin/pt-slave-restart --run-time=1s -h 127.0.0.1 -P 12346 -u msandbox -p msandbox 2>&1`;
like(
$output,
qr/Cannot skip transactions properly because GTID is enabled/,
"pt-slave-restart exits with multiple replication threads"
);
# #############################################################################
# Done.
# #############################################################################
diag(`rm -f /tmp/pt-slave-re*`);
diag(`$trunk/sandbox/test-env stop >/dev/null`);
diag(`$trunk/sandbox/test-env start >/dev/null`);
ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
done_testing;

View File

@@ -61,6 +61,7 @@ my $sql = "CHECKSUM TABLES "
. join(", ", map { "sakila.$_" } @tables_in_sakila);
my @checksums = @{$dbh->selectall_arrayref($sql, {Slice => {} })};
foreach my $c ( @checksums ) {
next unless $c->{Checksum};
$dbh->do("INSERT INTO percona_test.checksums(db_tbl, checksum)
VALUES('$c->{Table}', $c->{Checksum})");
}