From c5ec28fcd631935f7adc4659f313cf807f0c1e3e Mon Sep 17 00:00:00 2001 From: Carlos Salguero Date: Thu, 8 Nov 2018 14:18:18 -0300 Subject: [PATCH] PT-1637 Added --fail-on stopped-replication param to table checksum --- bin/pt-online-schema-change | 8 ++- bin/pt-table-checksum | 27 ++++++-- lib/ReplicaLagWaiter.pm | 6 ++ lib/Sandbox.pm | 1 + t/pt-table-checksum/pt-1637.t | 92 +++++++++++++++++++++++++ t/pt-table-checksum/samples/pt-1637.sql | 37 ++++++++++ 6 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 t/pt-table-checksum/pt-1637.t create mode 100644 t/pt-table-checksum/samples/pt-1637.sql diff --git a/bin/pt-online-schema-change b/bin/pt-online-schema-change index 02e225cc..d98f9151 100755 --- a/bin/pt-online-schema-change +++ b/bin/pt-online-schema-change @@ -4980,6 +4980,9 @@ sub wait { . " seconds on $dsn_name. Waiting.\n"; } else { + if ($self->{fail_on_stopped_replication}) { + die 'replication is stopped'; + } print STDERR "Replica $dsn_name is stopped. Waiting.\n"; } return; @@ -4989,6 +4992,9 @@ sub wait { $pr_first_report = sub { my $dsn_name = $worst->{cxn}->name(); if ( !defined $worst->{lag} ) { + if ($self->{fail_on_stopped_replication}) { + die 'replication is stopped'; + } print STDERR "Replica $dsn_name is stopped. Waiting.\n"; } return; @@ -5002,7 +5008,7 @@ sub wait { my $lag = $get_lag->($lagged_slaves[$i]->{cxn}); PTDEBUG && _d($lagged_slaves[$i]->{cxn}->name(), 'slave lag:', $lag); - if ( defined $lag && $lag > $max_lag ) { + if ( !defined $lag || $lag > $max_lag ) { $lagged_slaves[$i]->{lag} = $lag; } else { diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index c33268c4..96099608 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -8596,6 +8596,9 @@ sub wait { . " seconds on $dsn_name. Waiting.\n"; } else { + if ($self->{fail_on_stopped_replication}) { + die 'replication is stopped'; + } print STDERR "Replica $dsn_name is stopped. Waiting.\n"; } return; @@ -8605,6 +8608,9 @@ sub wait { $pr_first_report = sub { my $dsn_name = $worst->{cxn}->name(); if ( !defined $worst->{lag} ) { + if ($self->{fail_on_stopped_replication}) { + die 'replication is stopped'; + } print STDERR "Replica $dsn_name is stopped. Waiting.\n"; } return; @@ -9878,14 +9884,15 @@ my $original_qrt_plugin_master_status = undef; # http://www.tldp.org/LDP/abs/html/exitcodes.html our %PTC_EXIT_STATUS = ( # General flags: - ERROR => 1, - ALREADY_RUNNING => 2, - CAUGHT_SIGNAL => 4, - NO_SLAVES_FOUND => 8, + ERROR => 1, + ALREADY_RUNNING => 2, + CAUGHT_SIGNAL => 4, + NO_SLAVES_FOUND => 8, # Tool-specific flags: - TABLE_DIFF => 16, - SKIP_CHUNK => 32, - SKIP_TABLE => 64, + TABLE_DIFF => 16, + SKIP_CHUNK => 32, + SKIP_TABLE => 64, + REPLICATION_STOPPED => 128, ); # The following two hashes are used in exec_nibble(). @@ -10719,6 +10726,7 @@ sub main { oktorun => sub { return $oktorun && $have_time->(); }, get_lag => $get_lag, sleep => $sleep, + fail_on_stopped_replication => $o->get('fail-on-stopped-replication'), ); my $get_status; @@ -13274,6 +13282,11 @@ L<"--[no]empty-replicate-table">). If specified twice, the tool actually iterates through the chunking algorithm, printing the upper and lower boundary values for each chunk, but not executing the checksum queries. +=item --fail-on-stopped-replication + +If replication is stopped, fail with an error (exit status 128) instead of waiting +until replication is restarted. + =item --float-precision type: int diff --git a/lib/ReplicaLagWaiter.pm b/lib/ReplicaLagWaiter.pm index 194406e0..9ab4789f 100644 --- a/lib/ReplicaLagWaiter.pm +++ b/lib/ReplicaLagWaiter.pm @@ -91,6 +91,9 @@ sub wait { . " seconds on $dsn_name. Waiting.\n"; } else { + if ($self->{fail_on_stopped_replication}) { + die 'replication is stopped'; + } print STDERR "Replica $dsn_name is stopped. Waiting.\n"; } return; @@ -103,6 +106,9 @@ sub wait { $pr_first_report = sub { my $dsn_name = $worst->{cxn}->name(); if ( !defined $worst->{lag} ) { + if ($self->{fail_on_stopped_replication}) { + die 'replication is stopped'; + } print STDERR "Replica $dsn_name is stopped. Waiting.\n"; } return; diff --git a/lib/Sandbox.pm b/lib/Sandbox.pm index cec2b70d..264c3b30 100644 --- a/lib/Sandbox.pm +++ b/lib/Sandbox.pm @@ -67,6 +67,7 @@ my %port_for = ( chan_master1 => 2900, chan_master2 => 2901, chan_slave1 => 2902, + chan_slave2 => 2903, ); my %server_type = ( diff --git a/t/pt-table-checksum/pt-1637.t b/t/pt-table-checksum/pt-1637.t new file mode 100644 index 00000000..b2f27cfe --- /dev/null +++ b/t/pt-table-checksum/pt-1637.t @@ -0,0 +1,92 @@ +#!/usr/bin/env perl + +BEGIN { + die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n" + unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH}; + unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib"; +}; + +use strict; +use warnings FATAL => 'all'; +use English qw(-no_match_vars); +use Test::More; + +use PerconaTest; +use Sandbox; +use SqlModes; +require "$trunk/bin/pt-table-checksum"; + +my $dp = new DSNParser(opts=>$dsn_opts); +my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp); + +diag ('Starting second sandbox master'); +my ($master1_dbh, $master1_dsn) = $sb->start_sandbox( + server => 'chan_master1', + type => 'master', +); + +diag ('Starting second sandbox slave 1'); +my ($slave1_dbh, $slave1_dsn) = $sb->start_sandbox( + server => 'chan_slave1', + type => 'slave', + master => 'chan_master1', +); + +diag ('Starting second sandbox slave 2'); +my ($slave2_dbh, $slave2_dsn) = $sb->start_sandbox( + server => 'chan_slave2', + type => 'slave', + master => 'chan_master1', +); + +my $dbh = $sb->get_dbh_for('chan_master1'); + +if ( !$dbh ) { + plan skip_all => 'Cannot connect to sandbox master'; +} +else { + plan tests => 2; +} + +diag("loading samples"); +$sb->load_file('chan_master1', 't/pt-table-checksum/samples/pt-1637.sql'); + + +my @args = ($master1_dsn, + "--set-vars", "innodb_lock_wait_timeout=50", + "--ignore-databases", "mysql", "--no-check-binlog-format", + "--recursion-method", "dsn=h=127.0.0.1,D=test,t=dsns", + "--run-time", "5", "--fail-on-stopped-replication", +); + +diag(join(" ", @args)); + +# The sandbox servers run with lock_wait_timeout=3 and it's not dynamic +# so we need to specify --set-vars innodb_lock_wait_timeout=3 else the tool will die. +my $master_dsn = $sb->dsn_for('master'); +$sb->do_as_root("chan_slave1", 'stop slave IO_thread;'); + +my $output; +my $exit_status; + +$output = output( + sub { $exit_status = pt_table_checksum::main(@args) }, + stderr => 1, +); +diag($output); + +is( + $exit_status, + 0, + "PT-1616 pt-table-cheksum before --resume with binary fields exit status", +); + +$sb->stop_sandbox('chan_master1'); +$sb->stop_sandbox('chan_slave1'); +$sb->stop_sandbox('chan_slave2'); +# ############################################################################# +# Done. +# ############################################################################# +$sb->wipe_clean($dbh); +ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox"); +exit; diff --git a/t/pt-table-checksum/samples/pt-1637.sql b/t/pt-table-checksum/samples/pt-1637.sql new file mode 100644 index 00000000..63fafa41 --- /dev/null +++ b/t/pt-table-checksum/samples/pt-1637.sql @@ -0,0 +1,37 @@ +CREATE DATABASE IF NOT EXISTS `percona`; + +CREATE TABLE `percona`.`checksums` ( + db CHAR(64) NOT NULL, + tbl CHAR(64) NOT NULL, + chunk INT NOT NULL, + chunk_time FLOAT NULL, + chunk_index VARCHAR(200) NULL, + lower_boundary TEXT NULL, + upper_boundary TEXT NULL, + this_crc CHAR(40) NOT NULL, + this_cnt INT NOT NULL, + master_crc CHAR(40) NULL, + master_cnt INT NULL, + ts TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (db, tbl, chunk), + INDEX ts_db_tbl (ts, db, tbl) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +CREATE DATABASE IF NOT EXISTS test; + +CREATE TABLE `test`.`dsns` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `parent_id` int(11) DEFAULT NULL, + `dsn` varchar(255) NOT NULL, + PRIMARY KEY (`id`) +); + +-- From Sandbox.pm +-- chan_master1 => 2900, +-- chan_master2 => 2901, +-- chan_slave1 => 2902, +-- chan_slave2 => 2903, + +INSERT INTO `test`.`dsns` VALUES +(1, NULL, "h=127.0.0.1,P=2902,u=msandbox,p=msandbox"), +(2, NULL, "h=127.0.0.1,P=2903,u=msandbox,p=msandbox");