From 840ba6926b33d5366de1174b63f04048af05be91 Mon Sep 17 00:00:00 2001 From: Sveta Smirnova Date: Sat, 22 Mar 2025 00:01:56 +0300 Subject: [PATCH] PT-2355 - pt-online-schema-change should not resume a job with empty boundaries - The tool should still fail, so I improved error message and added test case --- bin/pt-online-schema-change | 10 +- t/pt-online-schema-change/pt-2355.t | 239 ++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 t/pt-online-schema-change/pt-2355.t diff --git a/bin/pt-online-schema-change b/bin/pt-online-schema-change index b8fb2662..5b942d33 100755 --- a/bin/pt-online-schema-change +++ b/bin/pt-online-schema-change @@ -8943,12 +8943,20 @@ sub main { $sth->finish(); PTDEBUG && _d('Last chunk:', Dumper($last_chunk)); - if ( !$last_chunk || !$last_chunk->{new_table_name} ) { + if ( !$last_chunk ) { $oktorun = undef; _die("Option --resume refers non-existing job ID: ${old_job_id}. Exiting." , UNSUPPORTED_OPERATION); } + if ( !$last_chunk->{new_table_name} + || !$last_chunk->{lower_boundary} + || !$last_chunk->{upper_boundary} ) { + $oktorun = undef; + _die("Option --resume refers job ${old_job_id} with empty boundaries. Exiting." + , UNSUPPORTED_OPERATION); + } + if ( $last_chunk->{db} ne $db || $last_chunk->{tbl} ne $tbl || $last_chunk->{altr} ne $o->get('alter') ){ diff --git a/t/pt-online-schema-change/pt-2355.t b/t/pt-online-schema-change/pt-2355.t new file mode 100644 index 00000000..bf916528 --- /dev/null +++ b/t/pt-online-schema-change/pt-2355.t @@ -0,0 +1,239 @@ +#!/usr/bin/env perl + +BEGIN { + die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n" + unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH}; + unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib"; +}; + +use strict; +use warnings FATAL => 'all'; +use threads; + +use English qw(-no_match_vars); +use Test::More; + +use Data::Dumper; +use PerconaTest; +use Sandbox; +use SqlModes; +use File::Temp qw/ tempdir tempfile /; + +our $delay = 10; +my $max_lag = $delay / 2; +my $output; +my $exit; + +my $tmp_file = File::Temp->new(); +my $tmp_file_name = $tmp_file->filename; +unlink $tmp_file_name; + +require "$trunk/bin/pt-online-schema-change"; + +my $dp = new DSNParser(opts=>$dsn_opts); +my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp); +if ($sb->is_cluster_mode) { + plan skip_all => 'Not for PXC'; +} + +my $source_dbh = $sb->get_dbh_for('source'); +my $replica_dbh1 = $sb->get_dbh_for('replica1'); +my $replica_dbh2 = $sb->get_dbh_for('replica2'); +my $source_dsn = 'h=127.0.0.1,P=12345,u=msandbox,p=msandbox'; +my $replica_dsn1 = 'h=127.0.0.1,P=12346,u=msandbox,p=msandbox'; +my $replica_dsn2 = 'h=127.0.0.1,P=12347,u=msandbox,p=msandbox'; +my $sample = "t/pt-online-schema-change/samples"; +my $plugin = "$trunk/$sample/plugins"; + +# We need sync_relay_log=1 to keep changes after replica restart +my $cnf = '/tmp/12347/my.sandbox.cnf'; +diag(`cp $cnf $cnf.bak`); +diag(`echo "[mysqld]" > /tmp/12347/my.sandbox.2.cnf`); +diag(`echo "sync_relay_log=1" >> /tmp/12347/my.sandbox.2.cnf`); +diag(`echo "sync_relay_log_info=1" >> /tmp/12347/my.sandbox.2.cnf`); +diag(`echo "relay_log_recovery=1" >> /tmp/12347/my.sandbox.2.cnf`); +diag(`echo "!include /tmp/12347/my.sandbox.2.cnf" >> $cnf`); +diag(`/tmp/12347/stop >/dev/null`); +sleep 1; +diag(`/tmp/12347/start >/dev/null`); + +sub reset_query_cache { + my @dbhs = @_; + return if ($sandbox_version ge '8.0'); + foreach my $dbh (@dbhs) { + $dbh->do('RESET QUERY CACHE'); + } +} + +sub run_broken_job { + my ($args) = @_; + my ($fh, $filename) = tempfile(); + my $pid = fork(); + + if (!$pid) { + open(STDERR, '>', $filename); + open(STDOUT, '>', $filename); + exec("$trunk/bin/pt-online-schema-change $args"); + } + + sleep($max_lag + $max_lag/2); + # stop replica 12347 + diag(`/tmp/12347/stop >/dev/null`); + sleep 1; + + waitpid($pid, 0); + my $output = do { + local $/ = undef; + <$fh>; + }; + + return $output; +} + +sub set_delay { + $sb->wait_for_replicas(); + + diag("Setting replica delay to $delay seconds"); + diag(`/tmp/12345/use -N test -e "DROP TABLE IF EXISTS pt1717_back"`); + + $replica_dbh1->do("STOP ${replica_name}"); + $replica_dbh1->do("CHANGE ${source_change} TO ${source_name}_DELAY=$delay"); + $replica_dbh1->do("START ${replica_name}"); + + # Run a full table scan query to ensure the replica is behind the source + # There is no query cache in MySQL 8.0+ + reset_query_cache($source_dbh, $source_dbh); + # Update one row so replica is delayed + $source_dbh->do('UPDATE `test`.`pt1717` SET f2 = f2 + 1 LIMIT 1'); + $source_dbh->do('UPDATE `test`.`pt1717` SET f2 = f2 + 1 WHERE f1 = ""'); + + # Creating copy of table pt1717, so we can compare data later + diag(`/tmp/12345/use -N test -e "CREATE TABLE pt1717_back like pt1717"`); + diag(`/tmp/12345/use -N test -e "INSERT INTO pt1717_back SELECT * FROM pt1717"`); +} + +# 1) Set the replica delay to 0 just in case we are re-running the tests without restarting the sandbox. +# 2) Load sample data +# 3) Set the replica delay to 30 seconds to be able to see the 'waiting' message. +diag("Setting replica delay to 0 seconds"); +$replica_dbh1->do("STOP ${replica_name}"); +$source_dbh->do("RESET ${source_reset}"); +$replica_dbh1->do("RESET ${replica_name}"); +$replica_dbh1->do("START ${replica_name}"); + +diag('Loading test data'); +$sb->load_file('source', "t/pt-online-schema-change/samples/pt-1717.sql"); + +# Should be greater than chunk-size and big enough, so pt-osc will wait for delay +my $num_rows = 5000; +my $chunk_size = 10; +diag("Loading $num_rows into the table. This might take some time."); +diag(`util/mysql_random_data_load --host=127.0.0.1 --port=12345 --user=msandbox --password=msandbox test pt1717 $num_rows`); + +diag("Starting tests..."); + +set_delay(); + +# We need to sleep, otherwise pt-osc can finish before replica is delayed +sleep($max_lag); + +my $args = "$source_dsn,D=test,t=pt1717 --execute --chunk-size ${chunk_size} --max-lag $max_lag --alter 'ADD INDEX idx1(f1)' --pid $tmp_file_name --progress time,5 --no-drop-new-table --no-drop-triggers --history"; + +$output = run_broken_job($args); + +like( + $output, + qr/`test`.`pt1717` was not altered/s, + "pt-osc stopped with error as expected", +) or diag($output); + +diag(`/tmp/12347/start >/dev/null`); +$sb->wait_for_replicas(); + +$output = `/tmp/12345/use -N -e "select job_id, upper_boundary from percona.pt_osc_history"`; +my ($job_id, $upper_boundary) = split(/\s+/, $output); + +my $copied_rows = `/tmp/12345/use -N -e "select count(*) from test._pt1717_new"`; +chomp($copied_rows); + +ok( + $copied_rows eq $upper_boundary, + 'Upper chunk boundary stored correctly' +) or diag("Copied_rows: ${copied_rows}, upper boundary: ${upper_boundary}");; + +($output, $exit) = full_output( + sub { pt_online_schema_change::main("$source_dsn,D=test,t=pt1717", + "--execute", "--chunk-size=${chunk_size}", "--max-lag=${max_lag}", + "--alter=ADD INDEX idx1(f1)", + "--resume=${job_id}", + ) } +); + +is( + $exit, + 0, + 'pt-osc works correctly with --resume' +) or diag($exit); + +like( + $output, + qr/Successfully altered/, + 'Success message printed' +) or diag($output); + +# Corrupting job record, so we can test error message +diag(`/tmp/12345/use -N -e "update percona.pt_osc_history set new_table_name=NULL where job_id=${job_id}"`); + +($output, $exit) = full_output( + sub { pt_online_schema_change::main("$source_dsn,D=test,t=pt1717", + "--execute", "--chunk-size=${chunk_size}", "--max-lag=${max_lag}", + "--alter=ADD INDEX idx1(f1)", + "--resume=${job_id}", + ) } +); + +is( + $exit, + 17, + 'pt-osc works correctly fails with empty boundaries' +) or diag($exit); + +like( + $output, + qr/Option --resume refers job \d+ with empty boundaries. Exiting./, + 'Correct error message printed' +) or diag($output); + +unlike( + $output, + qr/Option --resume refers non-existing job ID: \d+. Exiting./, + 'Misleading error message not printed' +) or diag($output); + +# ############################################################################# +# Done. +# ############################################################################# +diag("Cleaning"); +$replica_dbh2 = $sb->get_dbh_for('replica2'); +diag("Setting replica delay to 0 seconds"); +$replica_dbh1->do("STOP ${replica_name}"); +$replica_dbh2->do("STOP ${replica_name}"); +$source_dbh->do("RESET ${source_reset}"); +$replica_dbh1->do("RESET ${source_reset}"); +$replica_dbh1->do("RESET ${replica_name}"); +$replica_dbh2->do("RESET ${replica_name}"); +$replica_dbh1->do("START ${replica_name}"); +$replica_dbh2->do("START ${replica_name}"); + +diag(`mv $cnf.bak $cnf`); + +diag(`/tmp/12347/stop >/dev/null`); +diag(`/tmp/12347/start >/dev/null`); + +diag("Dropping test database"); +$source_dbh->do("DROP DATABASE IF EXISTS test"); +$sb->wait_for_replicas(); + +$sb->wipe_clean($source_dbh); +ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox"); +done_testing;