diff --git a/bin/pt-slave-restart b/bin/pt-slave-restart
index be4131cd..2bcc1eba 100755
--- a/bin/pt-slave-restart
+++ b/bin/pt-slave-restart
@@ -4766,6 +4766,12 @@ use sigtrap qw(handler finish untrapped normal-signals);
 use Percona::Toolkit;
 use constant PTDEBUG => $ENV{PTDEBUG} || 0;
 
+use Data::Dumper;
+
+local $Data::Dumper::Indent    = 1;
+local $Data::Dumper::Sortkeys  = 1;
+local $Data::Dumper::Quotekeys = 0;
+
 $OUTPUT_AUTOFLUSH = 1;
 
 my $o;
@@ -4976,10 +4982,36 @@ sub watch_server {
       $start_sql .= " UNTIL RELAY_LOG_FILE = '$file', RELAY_LOG_POS = $pos";
    }
 
-   my $set_skip = $dbh->prepare("SET GLOBAL SQL_SLAVE_SKIP_COUNTER = "
-      . $o->get('skip-count'));
-   my $start    = $dbh->prepare($start_sql);
-   my $stop     = $dbh->prepare('STOP SLAVE');
+   my $start = $dbh->prepare($start_sql);
+   my $stop  = $dbh->prepare('STOP SLAVE');
+
+   # ########################################################################
+   # Detect if GTID is enabled. Skipping an event is done differently.
+   # ########################################################################
+   # When MySQL 5.6.5 or higher is used and gtid is enabled, skipping a
+   # transaction is not possible with SQL_SLAVE_SKIP_COUNTER
+   my $skip_event;
+   my $have_gtid = 0;
+   if ( VersionParser->new($dbh) >= '5.6.5' ) {
+      my $row = $dbh->selectrow_arrayref('SELECT @@GLOBAL.gtid_mode');
+      PTDEBUG && _d('@@GLOBAL.gtid_mode:', $row->[0]);
+      if ( $row && $row->[0] eq 'ON' ) {
+         $have_gtid = 1;
+      }
+   }
+   PTDEBUG && _d('Have GTID:', $have_gtid);
+
+   # If GTID is enabled, slave_parallel_workers should be == 0.
+   # It's currently not possible to know what GTID event the failed trx is.
+   if ( $have_gtid ) {
+      my $threads = $dbh->selectrow_hashref(
+         'SELECT @@GLOBAL.slave_parallel_workers AS threads');
+      if ( $threads->{threads} > 0 ) {
+         die "Cannot skip transactions properly because GTID is enabled "
+            . "and slave_parallel_workers > 0. See 'GLOBAL TRANSACTION IDS' "
+            . "in the tool's documentation.\n";
+      }
+   }
 
    # ########################################################################
    # Lookup tables of things to do when a problem is detected.
@@ -4989,7 +5021,7 @@ sub watch_server {
       [ qr/Could not parse relay log event entry/ => 'refetch_relay_log' ],
       [ qr/Incorrect key file for table/ => 'repair_table' ],
       # This must be the last one. It's a catch-all rule: skip and restart.
-      [ qr/./ => 'skip' ],
+      [ qr/./ => ($have_gtid ? 'skip_gtid' : 'skip') ],
    );
 
    # ########################################################################
@@ -5012,9 +5044,61 @@ sub watch_server {
       },
       skip => sub {
          my ( $stat, $dbh ) = @_;
-         PTDEBUG && _d('Found non-relay-log error');
+         my $set_skip = $dbh->prepare("SET GLOBAL SQL_SLAVE_SKIP_COUNTER = "
+            . $o->get('skip-count'));
          $set_skip->execute();
       },
+      skip_gtid => sub {
+         my ( $stat, $dbh ) = @_;
+
+         # Get master_uuid from SHOW SLAVE STATUS if a UUID is not specified
+         # with --master-uuid.
+         my $gtid_uuid = $o->get('master-uuid');
+         if ( !$gtid_uuid ) {
+            $gtid_uuid = $stat->{master_uuid};
+            die "No master_uuid" unless $gtid_uuid; # shouldn't happen
+         }
+
+         # We need the highest transaction in the executed_gtid_set for this
+         # UUID, then we increase it by 1 (the one we want to skip).
+         # Notes:
+         # - does not work with parallel replication
+         # - it skips the next transaction from the master_uuid
+         #   (when a slaveB is replicating from slaveA,
+         #   the master_uuid is its own master, slaveA)
+         my ($gtid_exec_ids) = ($stat->{executed_gtid_set} || '') =~ m/$gtid_uuid([0-9-:]*)/;
+         die "No executed GTIDs" unless $gtid_exec_ids;
+         $gtid_exec_ids =~ s/:[0-9]+-/:/g;
+
+         my @gtid_exec_ranges = split(/:/, $gtid_exec_ids);
+         shift @gtid_exec_ranges; # drop the first value, it's always empty
+
+         # Get the highest id by sorting the remaining interval ends numerically.
+         my @gtid_exec_sorted = sort { $a <=> $b }
+                                grep { defined($_) } @gtid_exec_ranges;
+         my $gtid_exec_last = $gtid_exec_sorted[-1];
+
+         PTDEBUG && _d("\n",
+            "GTID: master_uuid:", $gtid_uuid, "\n",
+            "GTID: executed_gtid_set:", $gtid_exec_ids, "\n",
+            "GTID: max for master_uuid:", $gtid_exec_sorted[-1], "\n",
+            "GTID: last executed gtid:", $gtid_uuid, ":", $gtid_exec_last);
+
+         # Set the sessions next gtid, write an empty transaction
+         my $skipped = 0;
+         while ( $skipped++ < $o->get('skip-count') ) {
+            my $gtid_next = $gtid_exec_last + $skipped;
+            my $sql = "SET GTID_NEXT='$gtid_uuid:$gtid_next'";
+            PTDEBUG && _d($sql);
+            my $sth = $dbh->prepare($sql);
+            $sth->execute();
+            $dbh->begin_work();
+            $dbh->commit();
+         }
+
+         # Set the session back to the automatically generated GTID_NEXT.
+         $dbh->do("SET GTID_NEXT='AUTOMATIC'");
+      },
       repair_table => sub {
          my ( $stat, $dbh ) = @_;
          PTDEBUG && _d('Found corrupt table');
@@ -5301,6 +5385,28 @@ sleep time, whichever is less.
 
 =back
 
+=head1 GLOBAL TRANSACTION IDS
+
+As of Percona Toolkit 2.2.8, pt-slave-restart supports Global Transaction IDs
+introduced in MySQL 5.6.5. It's important to keep in mind that:
+
+=over
+
+=item *
+
+pt-slave-restart will not skip transactions when multiple replication threads
+are being used (slave_parallel_workers > 0). pt-slave-restart does not know
+what the GTID event is of the failed transaction of a specific slave thread.
+
+=item *
+
+The default behavior is to skip the next transaction from the slave's master.
+Writes can originate on different servers, each with their own UUID.
+
+See L<"--master-uuid">.
+
+=back
+
 =head1 EXIT STATUS
 
 An exit status of 0 (sometimes also called a return value or return code)
@@ -5555,6 +5661,26 @@ type: int; default: 1
 
 Number of statements to skip when restarting the slave.
 
+=item --master-uuid
+
+type: string
+
+When using GTID, an empty transaction should be created in order to skip it.
+If writes are coming from different nodes in the replication tree above, it is
+not possible to know which event from which UUID to skip.
+
+By default, transactions from the slave's master (C<'Master_UUID'> from
+C<SHOW SLAVE STATUS>) are skipped.
+
+For example, with
+
+  master1 -> slave1 -> slave2
+
+When skipping events on slave2 that were written to master1, you must specify
+the UUID of master1, else the tool will use the UUID of slave1 by default.
+
+See L<"GLOBAL TRANSACTION IDS">.
+
 =item --sleep
 
 type: int; default: 1
diff --git a/lib/Sandbox.pm b/lib/Sandbox.pm
index 8c3055ee..3ee79ff4 100644
--- a/lib/Sandbox.pm
+++ b/lib/Sandbox.pm
@@ -379,6 +379,7 @@ sub verify_test_data {
    # Diff the two sets of checksums: host to master (ref).
    my @diffs;
    foreach my $c ( @checksums ) {
+      next unless $c->{checksum};
       if ( $c->{checksum} ne $ref->{$c->{table}}->{checksum} ) {
          push @diffs, $c->{table};
       }
diff --git a/sandbox/start-sandbox b/sandbox/start-sandbox
index 79ab4c7d..ea9dd833 100755
--- a/sandbox/start-sandbox
+++ b/sandbox/start-sandbox
@@ -113,6 +113,14 @@ make_sandbox() {
       echo "query_cache_size=$QUERY_CACHE_SIZE" >> /tmp/$port/my.sandbox.cnf
    fi
 
+   if [ -n "$GTID" ]; then
+      echo "gtid_mode=on" >> /tmp/$port/my.sandbox.cnf
+      echo "enforce_gtid_consistency" >> /tmp/$port/my.sandbox.cnf
+   fi
+   if [ -n "$REPLICATION_THREADS" ]; then
+      echo "slave_parallel_workers=$REPLICATION_THREADS" >> /tmp/$port/my.sandbox.cnf
+   fi
+
    if [ -n "$EXTRA_DEFAULTS_FILE" ]; then
       cat "$EXTRA_DEFAULTS_FILE" >> /tmp/$port/my.sandbox.cnf
    fi
diff --git a/sandbox/test-env b/sandbox/test-env
index df687d14..ff851068 100755
--- a/sandbox/test-env
+++ b/sandbox/test-env
@@ -315,13 +315,16 @@ case $opt in
          fi
 
          if [ $? -eq 0 -a "$MYSQL_VERSION" '>' "4.1" ]; then
-            echo -n "Loading sakila database... "
-            ./load-sakila-db 12345 "${2:-""}"
-            exit_status=$((exit_status | $?))
-            if [ $exit_status -ne 0 ]; then
-               echo "FAILED"
-            else
-               echo "OK"
+            SAKILA=${SAKILA:-1}
+            if [ $SAKILA -eq 1 ]; then
+               echo -n "Loading sakila database... "
+               ./load-sakila-db 12345 "${2:-""}"
+               exit_status=$((exit_status | $?))
+               if [ $exit_status -ne 0 ]; then
+                  echo "FAILED"
+               else
+                  echo "OK"
+               fi
             fi
 
             # Create percona_test db and checksum all the tables.
diff --git a/t/pt-slave-restart/gtid.t b/t/pt-slave-restart/gtid.t
new file mode 100644
index 00000000..d467af21
--- /dev/null
+++ b/t/pt-slave-restart/gtid.t
@@ -0,0 +1,203 @@
+#!/usr/bin/env perl
+
+BEGIN {
+   die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
+      unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
+   unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
+};
+
+use strict;
+use warnings FATAL => 'all';
+use English qw(-no_match_vars);
+use Test::More;
+
+use PerconaTest;
+use Sandbox;
+require "$trunk/bin/pt-slave-restart";
+
+if ( $sandbox_version lt '5.6' ) {
+   plan skip_all => "Requires MySQL 5.6";
+}
+
+diag(`SAKILA=0 GTID=1 $trunk/sandbox/test-env restart`);
+
+my $dp = new DSNParser(opts=>$dsn_opts);
+my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
+my $master_dbh = $sb->get_dbh_for('master');
+my $slave1_dbh = $sb->get_dbh_for('slave1');
+my $slave2_dbh = $sb->get_dbh_for('slave2');
+
+if ( !$master_dbh ) {
+   plan skip_all => 'Cannot connect to sandbox master';
+}
+elsif ( !$slave1_dbh ) {
+   plan skip_all => 'Cannot connect to sandbox slave1';
+}
+elsif ( !$slave2_dbh ) {
+   plan skip_all => 'Cannot connect to sandbox slave2';
+}
+
+my $slave1_dsn = $sb->dsn_for("slave1");
+my $slave2_dsn = $sb->dsn_for("slave2");
+
+my $pid_file = "/tmp/pt-slave-restart-test-$PID.pid";
+my $log_file = "/tmp/pt-slave-restart-test-$PID.log";
+my $cmd      = "$trunk/bin/pt-slave-restart --daemonize --run-time 5 --max-sleep .25 --pid $pid_file --log $log_file";
+
+sub start {
+   my ( $extra ) = @_;
+   stop() or return;
+   system "$cmd $extra";
+   PerconaTest::wait_for_files($pid_file);
+}
+
+sub stop() {
+   return 1 if !is_running();
+   diag(`$trunk/bin/pt-slave-restart --stop -q >/dev/null 2>&1 &`);
+   wait_until(sub { !-f $pid_file }, 0.3, 2);
+   diag(`rm -f /tmp/pt-slave-restart-sentinel`);
+   return is_running() ? 0 : 1;
+}
+
+sub is_running {
+   chomp(my $running = `ps -eaf | grep -v grep | grep '$cmd'`);
+   if (!-f $pid_file && !$running) {
+      return 0;
+   } elsif (-f $pid_file && !$running) {
+      diag(`rm -f $pid_file`);
+      return 0;
+   }
+   return 1;
+}
+
+sub wait_repl_broke {
+   my $dbh = shift;
+   return wait_until(
+      sub {
+         my $row = $dbh->selectrow_hashref('show slave status');
+         return $row->{last_sql_errno};
+      }
+   );
+}
+
+sub wait_repl_ok {
+   my $dbh = shift;
+   wait_until(
+      sub {
+         my $row = $dbh->selectrow_hashref('show slave status');
+         return $row->{last_sql_errno} == 0;
+      },
+      0.30,
+      5,
+   );
+}
+
+# #############################################################################
+# Basic test to see if restart works with GTID.
+# #############################################################################
+
+$master_dbh->do('DROP DATABASE IF EXISTS test');
+$master_dbh->do('CREATE DATABASE test');
+$master_dbh->do('CREATE TABLE test.t (a INT)');
+$sb->wait_for_slaves;
+
+# Bust replication
+$slave1_dbh->do('DROP TABLE test.t');
+$master_dbh->do('INSERT INTO test.t SELECT 1');
+wait_repl_broke($slave1_dbh) or die "Failed to break replication";
+
+my $r = $slave1_dbh->selectrow_hashref('show slave status');
+like($r->{last_error}, qr/Table 'test.t' doesn't exist'/, 'slave: Replication broke');
+
+# Start pt-slave-restart and wait up to 5s for it to fix replication
+# (it should take < 1s but tests can be really slow sometimes).
+start("$slave1_dsn") or die "Failed to start pt-slave-restart";
+wait_repl_ok($slave1_dbh);
+
+# Check if replication is fixed.
+$r = $slave1_dbh->selectrow_hashref('show slave status');
+like(
+   $r->{last_errno},
+   qr/^0$/,
+   'Event is skipped',
+) or BAIL_OUT("Replication is broken");
+
+# Stop pt-slave-restart.
+stop() or die "Failed to stop pt-slave-restart";
+
+# #############################################################################
+# Test the slave of the master.
+# #############################################################################
+
+$master_dbh->do('DROP DATABASE IF EXISTS test');
+$master_dbh->do('CREATE DATABASE test');
+$master_dbh->do('CREATE TABLE test.t (a INT)');
+$sb->wait_for_slaves;
+
+# Bust replication
+$slave2_dbh->do('DROP TABLE test.t');
+$master_dbh->do('INSERT INTO test.t SELECT 1');
+wait_repl_broke($slave2_dbh) or die "Failed to break replication";
+
+# fetch the master uuid, which is the machine we need to skip an event from
+$r = $master_dbh->selectrow_hashref('select @@GLOBAL.server_uuid as uuid');
+my $uuid = $r->{uuid};
+
+$r = $slave2_dbh->selectrow_hashref('show slave status');
+like($r->{last_error}, qr/Table 'test.t' doesn't exist'/, 'slaveofslave: Replication broke');
+
+# Start an instance
+start("--master-uuid=$uuid $slave2_dsn") or die;
+wait_repl_ok($slave2_dbh);
+
+$r = $slave2_dbh->selectrow_hashref('show slave status');
+like(
+   $r->{last_errno},
+   qr/^0$/,
+   'Skips event from master on slave2'
+) or BAIL_OUT("Replication is broken");
+
+stop() or die "Failed to stop pt-slave-restart";
+
+# #############################################################################
+# Test skipping 2 events in a row.
+# #############################################################################
+
+$master_dbh->do('DROP DATABASE IF EXISTS test');
+$master_dbh->do('CREATE DATABASE test');
+$master_dbh->do('CREATE TABLE test.t (a INT)');
+$sb->wait_for_slaves;
+
+# Bust replication
+$slave2_dbh->do('DROP TABLE test.t');
+$master_dbh->do('INSERT INTO test.t SELECT 1');
+$master_dbh->do('INSERT INTO test.t SELECT 1');
+wait_repl_broke($slave2_dbh) or die "Failed to break replication";
+
+# fetch the master uuid, which is the machine we need to skip an event from
+$r = $master_dbh->selectrow_hashref('select @@GLOBAL.server_uuid as uuid');
+$uuid = $r->{uuid};
+
+$r = $slave2_dbh->selectrow_hashref('show slave status');
+like($r->{last_error}, qr/Table 'test.t' doesn't exist'/, 'slaveofslaveskip2: Replication broke');
+
+# Start an instance
+start("--skip-count=2 --master-uuid=$uuid $slave2_dsn") or die;
+wait_repl_ok($slave2_dbh);
+
+$r = $slave2_dbh->selectrow_hashref('show slave status');
+like(
+   $r->{last_errno},
+   qr/^0$/,
+   'Skips multiple events'
+) or BAIL_OUT("Replication is broken");
+
+stop() or die "Failed to stop pt-slave-restart";
+
+# #############################################################################
+# Done.
+# #############################################################################
+diag(`rm -f $pid_file $log_file >/dev/null`);
+diag(`$trunk/sandbox/test-env restart`);
+ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
+done_testing;
diff --git a/t/pt-slave-restart/gtid_parallelreplication.t b/t/pt-slave-restart/gtid_parallelreplication.t
new file mode 100644
index 00000000..439870f4
--- /dev/null
+++ b/t/pt-slave-restart/gtid_parallelreplication.t
@@ -0,0 +1,64 @@
+#!/usr/bin/env perl
+
+BEGIN {
+   die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
+      unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
+   unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
+};
+
+use strict;
+use warnings FATAL => 'all';
+use English qw(-no_match_vars);
+use Test::More;
+
+use PerconaTest;
+use Sandbox;
+require "$trunk/bin/pt-slave-restart";
+
+if ( $sandbox_version lt '5.6' ) {
+   plan skip_all => 'MySQL Version ' . $sandbox_version
+                     . ' < 5.6, GTID is not available, skipping tests';
+}
+
+diag("Stopping/reconfiguring/restarting sandboxes 12345, 12346 and 12347");
+
+diag(`$trunk/sandbox/test-env stop >/dev/null`);
+diag(`REPLICATION_THREADS=2 GTID=1 $trunk/sandbox/test-env start >/dev/null`);
+
+my $dp = new DSNParser(opts=>$dsn_opts);
+my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
+my $master_dbh = $sb->get_dbh_for('master');
+my $slave_dbh  = $sb->get_dbh_for('slave1');
+my $slave2_dbh = $sb->get_dbh_for('slave2');
+
+if ( !$master_dbh ) {
+   plan skip_all => 'Cannot connect to sandbox master';
+}
+elsif ( !$slave_dbh ) {
+   plan skip_all => 'Cannot connect to sandbox slave1';
+}
+elsif ( !$slave2_dbh ) {
+   plan skip_all => 'Cannot connect to sandbox slave2';
+}
+
+# #############################################################################
+# pt-slave-restart should exit!
+# #############################################################################
+# Start an instance
+my $output=`$trunk/bin/pt-slave-restart --run-time=1s -h 127.0.0.1 -P 12346 -u msandbox -p msandbox 2>&1`;
+
+like(
+   $output,
+   qr/Cannot skip transactions properly.*slave_parallel_workers/,
+   "pt-slave-restart exits with multiple replication threads"
+);
+
+# #############################################################################
+# Done.
+# #############################################################################
+diag(`rm -f /tmp/pt-slave-re*`);
+diag(`$trunk/sandbox/test-env stop >/dev/null`);
+diag(`$trunk/sandbox/test-env start >/dev/null`);
+
+ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
+done_testing;
diff --git a/util/checksum-test-dataset b/util/checksum-test-dataset
index b6ef25ae..ea738727 100755
--- a/util/checksum-test-dataset
+++ b/util/checksum-test-dataset
@@ -61,6 +61,7 @@ my $sql = "CHECKSUM TABLES "
         . join(", ", map { "sakila.$_" } @tables_in_sakila);
 my @checksums = @{$dbh->selectall_arrayref($sql, {Slice => {} })};
 foreach my $c ( @checksums ) {
+   next unless $c->{Checksum};
    $dbh->do("INSERT INTO percona_test.checksums(db_tbl, checksum)
              VALUES('$c->{Table}', $c->{Checksum})");
 }