diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index 3f84a63d..9c4af576 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -3691,16 +3691,20 @@ $Data::Dumper::Indent = 1; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Quotekeys = 0; +local $EVAL_ERROR; +eval { + require Quoter; +}; + sub new { my ( $class, %args ) = @_; - my @required_args = qw(Quoter); - foreach my $arg ( @required_args ) { - die "I need a $arg argument" unless $args{$arg}; - } my $self = { %args }; + $self->{Quoter} ||= Quoter->new(); return bless $self, $class; } +sub Quoter { shift->{Quoter} } + sub get_create_table { my ( $self, $dbh, $db, $tbl ) = @_; die "I need a dbh parameter" unless $dbh; @@ -8544,7 +8548,7 @@ sub main { Quoter => $q, ); - my $slaves; # all slaves (that we can find) + my $slaves = []; # all slaves (that we can find) my $slave_lag_cxns; # slaves whose lag we'll check my $replica_lag; # ReplicaLagWaiter object @@ -8932,6 +8936,29 @@ sub main { $last_chunk = undef; } + if ( $o->get('check-slave-tables') ) { + eval { + check_slave_tables( + slaves => $slaves, + db => $tbl->{db}, + tbl => $tbl->{tbl}, + checksum_cols => $tbl->{checksum_cols}, + TableParser => $tp, + ); + }; + if ( $EVAL_ERROR ) { + my $msg + = "Skipping table $tbl->{db}.$tbl->{tbl} because it has " + . "problems on these replicas:\n" + . $EVAL_ERROR + . "This can break replication. If you understand the risks, " + . "specify --no-check-slave-tables to disable this check.\n"; + warn ts($msg); + $tbl->{checksum_results}->{errors}++; + $oktonibble = 0; + } + } + if ( $o->get('explain') ) { # --explain level 1: print the checksum and next boundary # statements. @@ -9422,6 +9449,18 @@ sub main { ); } + # Make a list of the columns being checksummed. As the option's + # docs note, this really only makes sense when checksumming one + # table, unless the tables have a common set of columns. + # TODO: this now happens in 3 places, search for 'columns'. + my $tbl_struct = $tbl->{tbl_struct}; + my $ignore_col = $o->get('ignore-columns') || {}; + my $all_cols = $o->get('columns') || $tbl_struct->{cols}; + my @cols = map { lc $_ } + grep { !$ignore_col->{$_} } + @$all_cols; + $tbl->{checksum_cols} = \@cols; + # Finally, checksum the table. # The "1 while" loop is necessary because we're executing REPLACE # statements which don't return rows and NibbleIterator only @@ -9745,18 +9784,24 @@ sub print_checksum_diffs { sub check_repl_table { my ( %args ) = @_; - my @required_args = qw(dbh repl_table slaves OptionParser TableParser Quoter); + my @required_args = qw(dbh repl_table slaves + OptionParser TableParser Quoter); foreach my $arg ( @required_args ) { die "I need a $arg argument" unless $args{$arg}; } my ($dbh, $repl_table, $slaves, $o, $tp, $q) = @args{@required_args}; + PTDEBUG && _d('Checking --replicate table', $repl_table); + # ######################################################################## + # Create the --replicate database. + # ######################################################################## + # If the repl db doesn't exit, auto-create it, maybe. my ($db, $tbl) = $q->split_unquote($repl_table); - my $sql = "SHOW DATABASES LIKE '$db'"; - PTDEBUG && _d($sql); - my @db_exists = $dbh->selectrow_array($sql); + my $show_db_sql = "SHOW DATABASES LIKE '$db'"; + PTDEBUG && _d($show_db_sql); + my @db_exists = $dbh->selectrow_array($show_db_sql); if ( !@db_exists && !$o->get('create-replicate-table') ) { die "--replicate database $db does not exist and " . "--no-create-replicate-table was specified. You need " @@ -9764,22 +9809,46 @@ sub check_repl_table { } if ( $o->get('create-replicate-table') ) { - $sql = "CREATE DATABASE IF NOT EXISTS " - . $q->quote($db) - . " /* pt-table-checksum */"; -# local $@; - PTDEBUG && _d($sql); + # Even if the db already exists, do this in case it does not exist + # on a slave. + my $create_db_sql + = "CREATE DATABASE IF NOT EXISTS " + . $q->quote($db) + . " /* pt-table-checksum */"; + PTDEBUG && _d($create_db_sql); eval { - $dbh->do($sql); + $dbh->do($create_db_sql); }; if ( $EVAL_ERROR ) { + # CREATE DATABASE IF NOT EXISTS failed but the db could already + # exist and the error could be due, for example, to the user not + # having privs to create it, but they still have privs to use it. + if ( @db_exists ) { - die "Error executing $sql: $EVAL_ERROR\n" - . "The database exists on the master, but replication will " - . "break if it does not also exist on all replicas. If the " - . "--replicate database and table already exist on the master " - . "and all replicas, or if you created them manually, then " - . "specify --no-create-replicate-table to avoid this error." + # Repl db already exists on the master, so check if it's also + # on all slaves. If not, and given that creating it failed, + # we'll die because we can't be sure if it's ok on all slaves. + # The user can verify and disable this check if it's ok. + my $e = $EVAL_ERROR; # CREATE DATABASE error + my @slaves_missing_db; + foreach my $slave ( @$slaves ) { + PTDEBUG && _d($show_db_sql, 'on', $slave->name()); + my @db_exists_in_slave + = $slave->dbh->selectrow_array($show_db_sql); + if ( !@db_exists_in_slave ) { + push @slaves_missing_db, $slave; + } + } + if ( @slaves_missing_db ) { + warn $e; # CREATE DATABASE error + die "The --replicate database $db exists on the master but " + . "$create_db_sql on the master failed (see the error above) " + . "and the database does not exist on these replicas:\n" + . join("\n", map { " " . $_->name() } @slaves_missing_db) + . "\nThis can break replication. If you understand " + . "the risks, specify --no-create-replicate-table to disable " + . "this check.\n"; + } } else { die "--replicate database $db does not exist and it cannot be " @@ -9792,41 +9861,64 @@ sub check_repl_table { # USE the correct db (probably the repl db, but maybe --replicate-database). use_repl_db(%args); + # ######################################################################## + # Create the --replicate table. + # ######################################################################## + # Check if the repl table exists; if not, create it, maybe. my $tbl_exists = $tp->check_table( dbh => $dbh, db => $db, tbl => $tbl, ); + PTDEBUG && _d('--replicate table exists:', $tbl_exists ? 'yes' : 'no'); + + if ( !$tbl_exists && !$o->get('create-replicate-table') ) { + die "--replicate table $repl_table does not exist and " + . "--no-create-replicate-table was specified. " + . "You need to create the table.\n"; + } + + # We used to check the table privs here, but: + # https://bugs.launchpad.net/percona-toolkit/+bug/916168 # Always create the table, unless --no-create-replicate-table # was passed in; see https://bugs.launchpad.net/percona-toolkit/+bug/950294 - if ( !$tbl_exists ) { - if ( !$o->get('create-replicate-table') ) { - die "--replicate table $repl_table does not exist and " - . "--no-create-replicate-table was specified. " - . "You need to create the table.\n"; - } - } - else { - PTDEBUG && _d('--replicate table', $repl_table, 'already exists'); - # We used to check the table privs here, but: - # https://bugs.launchpad.net/percona-toolkit/+bug/916168 - } - if ( $o->get('create-replicate-table') ) { - local $@; eval { create_repl_table(%args); }; if ( $EVAL_ERROR ) { + # CREATE TABLE IF NOT EXISTS failed but the table could already + # exist and the error could be due, for example, to the user not + # having privs to create it, but they still have privs to use it. + if ( $tbl_exists ) { - die "Error executing $sql: $EVAL_ERROR\n" - . "The table exists on the master, but replication will " - . "break if it does not also exist on all replicas. If the " - . "--replicate database and table already exist on the master " - . "and all replicas, or if you created them manually, then " - . "specify --no-create-replicate-table to avoid this error." + # Repl table already exists on the master, so check if it's also + # on all slaves. If not, and given that creating it failed, + # we'll die because we can't be sure if it's ok on all slaves. + # The user can verify and disable this check if it's ok. + my $e = $EVAL_ERROR; # CREATE TABLE error + my $ddl = $tp->get_create_table($dbh, $db, $tbl); + my $tbl_struct = $tp->parse($ddl); + eval { + check_slave_tables( + slaves => $slaves, + db => $db, + tbl => $tbl, + checksum_cols => $tbl_struct->{cols}, + TableParser => $tp, + ); + }; + if ( $EVAL_ERROR ) { + warn $e; # CREATE TABLE error + die "The --replicate table $repl_table exists on the master but " + . "but it has problems on these replicas:\n" + . $EVAL_ERROR + . "\nThis can break replication. If you understand " + . "the risks, specify --no-create-replicate-table to disable " + . "this check.\n"; + } } else { die "--replicate table $tbl does not exist and it cannot be " @@ -9876,6 +9968,60 @@ sub check_repl_table { return; # success, repl table is ready to go } +# Check that db.tbl exists on all slaves and has the checksum cols, +# else when we check for diffs we'll break replication by selecting +# a nonexistent column. +sub check_slave_tables { + my (%args) = @_; + my @required_args = qw(slaves db tbl checksum_cols TableParser); + foreach my $arg ( @required_args ) { + die "I need a $arg argument" unless $args{$arg}; + } + my ($slaves, $db, $tbl, $checksum_cols, $tp) = @args{@required_args}; + + my @problems; + SLAVE: + foreach my $slave ( @$slaves ) { + my $slave_has_table = $tp->check_table( + dbh => $slave->dbh, + db => $db, + tbl => $tbl, + ); + if ( !$slave_has_table ) { + push @problems, "Table $db.$tbl does not exist on replica " + . $slave->name; + next SLAVE; + } + + my $slave_tbl_struct = eval { + $tp->parse( + $tp->get_create_table($slave->dbh, $db, $tbl) + ); + }; + if ( $EVAL_ERROR ) { + push @problems, "Error parsing table $db.$tbl on replica " + . $slave->name . ": $EVAL_ERROR"; + next SLAVE; + } + + my @slave_missing_cols; + foreach my $col ( @$checksum_cols ) { + if ( !$slave_tbl_struct->{is_col}->{$col} ) { + push @slave_missing_cols, $col; + } + } + if ( @slave_missing_cols ) { + push @problems, "Table $db.$tbl on replica " . $slave->name + . " is missing these columns: " + . join(", ", @slave_missing_cols); + } + } + + die join("\n", @problems) . "\n" if @problems; + + return; +} + # Sub: use_repl_db # USE the correct database for the --replicate table. # This sub must be called before any work is done with the --replicatte @@ -10596,6 +10742,16 @@ a normal replica to monitor. See also L<"REPLICA CHECKS">. +=item --[no]check-slave-tables + +default: yes; group: Safety + +Checks that tables on slaves exist and have all the checksum L<"--columns">. +Tables missing on slaves or not having all the checksum L<"--columns"> can +cause the tool to break replication when it tries to check for differences. +Only disable this check if you are aware of the risks and are sure that all +tables on all slaves exist and are identical to the master. + =item --chunk-index type: string @@ -10697,9 +10853,12 @@ of leaving it at the default. short form: -c; type: array; group: Filter -Checksum only this comma-separated list of columns. If a table doesn't have +Checksum only this comma-separated list of columns. If a table doesn't have any of the specified columns it will be skipped. +This option applies to all tables, so it really only makes sense when +checksumming one table unless the tables have a common set of columns. + =item --config type: Array; group: Config diff --git a/lib/PerconaTest.pm b/lib/PerconaTest.pm index e06b7ee3..46c6cd15 100644 --- a/lib/PerconaTest.pm +++ b/lib/PerconaTest.pm @@ -740,8 +740,9 @@ sub full_output { or die "Cannot open file $file: $OS_ERROR"; *STDOUT->autoflush(1); - open *STDERR, '>', $file - or die "Cannot open file $file: $OS_ERROR"; + my (undef, $file2) = tempfile(); + open *STDERR, '>', $file2 + or die "Cannot open file $file2: $OS_ERROR"; *STDERR->autoflush(1); my $status; @@ -766,8 +767,11 @@ sub full_output { exit $code->(); } close $_ or die "Cannot close $_: $OS_ERROR" for qw(STDOUT STDERR); - my $output = do { local $/; open my $fh, "<", $file or die $!; <$fh> }; + my $output = slurp_file($file) . slurp_file($file2); + unlink $file; + unlink $file2; + return ($output, $status); } diff --git a/lib/TableParser.pm b/lib/TableParser.pm index 3d2428e0..d38c1bd0 100644 --- a/lib/TableParser.pm +++ b/lib/TableParser.pm @@ -38,16 +38,20 @@ $Data::Dumper::Indent = 1; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Quotekeys = 0; +local $EVAL_ERROR; +eval { + require Quoter; +}; + sub new { my ( $class, %args ) = @_; - my @required_args = qw(Quoter); - foreach my $arg ( @required_args ) { - die "I need a $arg argument" unless $args{$arg}; - } my $self = { %args }; + $self->{Quoter} ||= Quoter->new(); return bless $self, $class; } +sub Quoter { shift->{Quoter} } + sub get_create_table { my ( $self, $dbh, $db, $tbl ) = @_; die "I need a dbh parameter" unless $dbh; diff --git a/t/lib/TableParser.t b/t/lib/TableParser.t index 9dcb0af5..61693283 100644 --- a/t/lib/TableParser.t +++ b/t/lib/TableParser.t @@ -9,7 +9,7 @@ BEGIN { use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); -use Test::More tests => 40; +use Test::More; use TableParser; use Quoter; @@ -897,4 +897,4 @@ is_deeply( # ############################################################################# $sb->wipe_clean($dbh) if $dbh; ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox"); -exit; +done_testing; diff --git a/t/pt-table-checksum/error_handling.t b/t/pt-table-checksum/error_handling.t index 5334d0a2..1a14bbbb 100644 --- a/t/pt-table-checksum/error_handling.t +++ b/t/pt-table-checksum/error_handling.t @@ -11,6 +11,8 @@ use warnings FATAL => 'all'; use English qw(-no_match_vars); use Test::More; +$ENV{PERCONA_TOOLKIT_TEST_USE_DSN_NAMES} = 1; + use PerconaTest; use Sandbox; shift @INC; # our unshift (above) @@ -20,12 +22,13 @@ require "$trunk/bin/pt-table-checksum"; my $dp = new DSNParser(opts=>$dsn_opts); my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp); my $master_dbh = $sb->get_dbh_for('master'); +my $slave1_dbh = $sb->get_dbh_for('slave1'); if ( !$master_dbh ) { plan skip_all => 'Cannot connect to sandbox master'; } -else { - plan tests => 7; +elsif ( !$slave1_dbh ) { + plan skip_all => 'Cannot connect to sandbox slave'; } # The sandbox servers run with lock_wait_timeout=3 and it's not dynamic @@ -34,6 +37,7 @@ else { my $master_dsn = 'h=127.1,P=12345,u=msandbox,p=msandbox'; my @args = ($master_dsn, qw(--lock-wait-timeout 3), '--max-load', ''); my $output; +my $exit_status; $sb->create_dbs($master_dbh, [qw(test)]); @@ -113,9 +117,95 @@ like( # Reconnect to master since we just killed ourself. $master_dbh = $sb->get_dbh_for('master'); +# ############################################################################# +# pt-table-checksum breaks replication if a slave table is missing or different +# https://bugs.launchpad.net/percona-toolkit/+bug/1009510 +# ############################################################################# + +# Just re-using this simple table. +$sb->load_file('master', "t/pt-table-checksum/samples/600cities.sql"); + +$master_dbh->do("SET SQL_LOG_BIN=0"); +$master_dbh->do("ALTER TABLE test.t ADD COLUMN col3 int"); +$master_dbh->do("SET SQL_LOG_BIN=1"); + +$output = output( + sub { $exit_status = pt_table_checksum::main(@args, + qw(-t test.t)) }, + stderr => 1, +); + +like( + $output, + qr/Skipping table test.t/, + "Skip table missing column on slave (bug 1009510)" +); + +like( + $output, + qr/replica h=127.0.0.1,P=12346 is missing these columns: col3/, + "Checked slave1 (bug 1009510)" +); + +like( + $output, + qr/replica h=127.0.0.1,P=12347 is missing these columns: col3/, + "Checked slave2 (bug 1009510)" +); + +is( + $exit_status, + 1, + "Non-zero exit status (bug 1009510)" +); + +$output = output( + sub { $exit_status = pt_table_checksum::main(@args, + qw(-t test.t), '--columns', 'id,city') }, + stderr => 1, +); + +unlike( + $output, + qr/Skipping table test.t/, + "Doesn't skip table missing column on slave with --columns (bug 1009510)" +); + +is( + $exit_status, + 0, + "Zero exit status with --columns (bug 1009510)" +); + +# Use the --replicate table created by the previous ^ tests. + +# Create a user that can't create the --replicate table. +diag(`/tmp/12345/use -uroot -pmsandbox < $trunk/t/lib/samples/ro-checksum-user.sql`); +diag(`/tmp/12345/use -uroot -pmsandbox -e "GRANT REPLICATION CLIENT, REPLICATION SLAVE ON *.* TO ro_checksum_user\@'%'"`); + +# Remove the --replicate table from slave1 and slave2, +# so it's only on the master... +$slave1_dbh->do("DROP DATABASE percona"); +$sb->wait_for_slaves; + +$output = output( + sub { $exit_status = pt_table_checksum::main( + "h=127.1,u=ro_checksum_user,p=msandbox,P=12345", + qw(--lock-wait-timeout 3 -t mysql.user)) }, + stderr => 1, +); + +like( + $output, + qr/database percona exists on the master/, + "CREATE DATABASE error and db is missing on slaves (bug 1039569)" +); + +diag(`/tmp/12345/use -uroot -pmsandbox -e "DROP USER ro_checksum_user\@'%'"`); + # ############################################################################# # Done. # ############################################################################# $sb->wipe_clean($master_dbh); ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox"); -exit; +done_testing; diff --git a/t/pt-table-checksum/privs.t b/t/pt-table-checksum/privs.t index 36dd23b7..270ac575 100644 --- a/t/pt-table-checksum/privs.t +++ b/t/pt-table-checksum/privs.t @@ -132,16 +132,17 @@ like( "Read-only user (bug 987694): checksummed rows" ); -($output) = full_output( - sub { $exit_status = pt_table_checksum::main(@args, +($output, $exit_status) = full_output( + sub { pt_table_checksum::main(@args, "$master_dsn,u=ro_checksum_user,p=msandbox", qw(--recursion-method none) ) } ); -like($output, - qr/\QThe database exists on the master, but replication will break/, - "Error if db exists on the master, can't CREATE DATABASE, and --no-create-replicate-table was not specified", +is( + $exit_status, + 0, + "No error if db exists on the master, can't CREATE DATABASE, --no-create-replicate-table was not specified, but the database does exist in all slaves" ); diag(qx{/tmp/12345/use -u root -e 'DROP TABLE `percona`.`checksums`'});