diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index 135f9888..8a2fc127 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -3580,6 +3580,9 @@ sub new { else { my $index = $nibble_params->{index}; # brevity my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols}; + my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); + my $limit = $chunk_size - 1; + PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); my $asc = $args{TableNibbler}->generate_asc_stmt( %args, @@ -3590,18 +3593,52 @@ sub new { ); PTDEBUG && _d('Ascend params:', Dumper($asc)); - my $from = "$tbl->{name} FORCE INDEX(`$index`)"; - my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); - my $first_lb_sql = "SELECT /*!40001 SQL_NO_CACHE */ " . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $from" + . " FROM $tbl->{name}" . ($where ? " WHERE $where" : '') . " ORDER BY $order_by" . " LIMIT 1" . " /*first lower boundary*/"; - PTDEBUG && _d('First lower boundary statement:', $first_lb_sql); + PTDEBUG && _d($first_lb_sql); + my $first_lower = $cxn->dbh()->selectrow_arrayref($first_lb_sql); + PTDEBUG && _d('First lower boundary:', Dumper($first_lower)); + + if ( !$args{chunk_index} || (lc($args{chunk_index}) ne lc($index)) ) { + + my $sql + = "EXPLAIN SELECT /*!40001 SQL_NO_CACHE */ " + . join(', ', map { $q->quote($_) } @{$asc->{scols}}) + . " FROM $tbl->{name}" + . " WHERE " . $asc->{boundaries}->{'>='} + . ($where ? " AND ($where)" : '') + . " ORDER BY $order_by" + . " LIMIT ?, 2" + . " /*get MySQL index*/"; + my $sth = $cxn->dbh()->prepare($sql); + my $mysql_index = _get_mysql_index( + Cxn => $cxn, + sth => $sth, + params => [@$first_lower, $limit], + ); + PTDEBUG && _d('MySQL index:', $mysql_index); + + if ( lc($index) ne lc($mysql_index) ) { + my $chosen_index_struct = $tbl->{tbl_struct}->{keys}->{$index}; + my $mysql_index_struct = $tbl->{tbl_struct}->{keys}->{$mysql_index}; + warn "The best index for chunking $tbl->{name} is $index (" + . 
($chosen_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$chosen_index_struct->{cols}} + . " columns), but index $mysql_index (" + . ($mysql_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$mysql_index_struct->{cols}} + . " columns) that MySQL chose will be used instead.\n"; + $index = $mysql_index; + } + } + + my $from = "$tbl->{name} FORCE INDEX(`$index`)"; my $resume_lb_sql; if ( $args{resume} ) { @@ -3663,14 +3700,11 @@ sub new { . " /*explain $comments{nibble}*/"; PTDEBUG && _d('Explain nibble statement:', $explain_nibble_sql); - my $limit = $chunk_size - 1; - PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - $self = { %args, index => $index, limit => $limit, - first_lb_sql => $first_lb_sql, + first_lower => $first_lower, last_ub_sql => $last_ub_sql, ub_sql => $ub_sql, nibble_sql => $nibble_sql, @@ -3858,7 +3892,7 @@ sub can_nibble { } my ($cxn, $tbl, $chunk_size, $o) = @args{@required_args}; - my ($row_est, $mysql_index) = get_row_estimate( + my $row_est = get_row_estimate( Cxn => $cxn, tbl => $tbl, where => $o->has('where') ? 
$o->get('where') : '', @@ -3876,7 +3910,7 @@ sub can_nibble { $one_nibble = 1; } - my $index = _find_best_index(%args, mysql_index => $mysql_index); + my $index = _find_best_index(%args); if ( !$index && !$one_nibble ) { die "There is no good index and the table is oversized."; } @@ -3980,6 +4014,18 @@ sub _get_index_cardinality { return $cardinality; } +sub _get_mysql_index { + my (%args) = @_; + my @required_args = qw(Cxn sth params); + my ($cxn, $sth, $params) = @args{@required_args}; + PTDEBUG && _d($sth->{Statement}, 'params:', @$params); + $sth->execute(@$params); + my $row = $sth->fetchrow_hashref(); + $sth->finish(); + PTDEBUG && _d(Dumper($row)); + return $row->{key}; +} + sub get_row_estimate { my (%args) = @_; my @required_args = qw(Cxn tbl); @@ -3989,11 +4035,11 @@ sub get_row_estimate { my ($cxn, $tbl) = @args{@required_args}; my $sql = "EXPLAIN SELECT * FROM $tbl->{name} " - . "WHERE " . ($args{where} || '1=1'); + . "WHERE " . ($args{where} || '1=1 /*get row estimate*/'); PTDEBUG && _d($sql); my $expl = $cxn->dbh()->selectrow_hashref($sql); PTDEBUG && _d(Dumper($expl)); - return ($expl->{rows} || 0), $expl->{key}; + return $expl->{rows} || 0; } sub _prepare_sths { @@ -4025,9 +4071,6 @@ sub _get_bounds { my $dbh = $self->{Cxn}->dbh(); - $self->{first_lower} = $dbh->selectrow_arrayref($self->{first_lb_sql}); - PTDEBUG && _d('First lower boundary:', Dumper($self->{first_lower})); - if ( my $nibble = $self->{resume} ) { if ( defined $nibble->{lower_boundary} && defined $nibble->{upper_boundary} ) { @@ -6387,10 +6430,7 @@ sub main { my $chunk_size_limit = $o->get('chunk-size-limit'); my @too_large; foreach my $slave ( @$slaves ) { - # get_row_estimate() returns (row_est, index), but - # we only need the row_est. Maybe in the future we'll - # care what index MySQL will use on a slave. 
- my ($n_rows) = NibbleIterator::get_row_estimate( + my $n_rows = NibbleIterator::get_row_estimate( Cxn => $slave, tbl => $tbl, where => $o->get('where'), diff --git a/lib/NibbleIterator.pm b/lib/NibbleIterator.pm index 2730fe6a..0be676df 100644 --- a/lib/NibbleIterator.pm +++ b/lib/NibbleIterator.pm @@ -120,8 +120,11 @@ sub new { else { my $index = $nibble_params->{index}; # brevity my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols}; + my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); + my $limit = $chunk_size - 1; + PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - # Figure out how to nibble the table with the index. + # Figure out how to nibble the table with the chosen index. my $asc = $args{TableNibbler}->generate_asc_stmt( %args, tbl_struct => $tbl->{tbl_struct}, @@ -131,23 +134,71 @@ sub new { ); PTDEBUG && _d('Ascend params:', Dumper($asc)); - # Make SQL statements, prepared on first call to next(). FROM and - # ORDER BY are the same for all statements. FORCE IDNEX and ORDER BY - # are needed to ensure deterministic nibbling. - my $from = "$tbl->{name} FORCE INDEX(`$index`)"; - my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); - - # The real first row in the table. Usually we start nibbling from - # this row. Called once in _get_bounds(). + # Get the real first lower boundary. Using this plus the chosen index, + # we'll see what index MySQL wants to use to ascend the table. This + # is only executed once, and the first lower boundary is saved so we + # can start nibbling from it later. my $first_lb_sql = "SELECT /*!40001 SQL_NO_CACHE */ " . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $from" + . " FROM $tbl->{name}" . ($where ? " WHERE $where" : '') . " ORDER BY $order_by" . " LIMIT 1" . 
" /*first lower boundary*/"; - PTDEBUG && _d('First lower boundary statement:', $first_lb_sql); + PTDEBUG && _d($first_lb_sql); + my $first_lower = $cxn->dbh()->selectrow_arrayref($first_lb_sql); + PTDEBUG && _d('First lower boundary:', Dumper($first_lower)); + + # If the user didn't request a --chunk-index or they did but + # it wasn't chosen, then check which index MySQL wants to use + # to ascend the table. + if ( !$args{chunk_index} || (lc($args{chunk_index}) ne lc($index)) ) { + + # This statement must be identical to the (poorly named) ub_sql below + # (aka "next chunk boundary") because ub_sql is what ascends the table + # and therefore might cause a table scan. The difference between this + # statement and the real ub_sql below is that here we do not add + # FORCE INDEX but let MySQL choose the index. + my $sql + = "EXPLAIN SELECT /*!40001 SQL_NO_CACHE */ " + . join(', ', map { $q->quote($_) } @{$asc->{scols}}) + . " FROM $tbl->{name}" + . " WHERE " . $asc->{boundaries}->{'>='} + . ($where ? " AND ($where)" : '') + . " ORDER BY $order_by" + . " LIMIT ?, 2" + . " /*get MySQL index*/"; + my $sth = $cxn->dbh()->prepare($sql); + my $mysql_index = _get_mysql_index( + Cxn => $cxn, + sth => $sth, + params => [@$first_lower, $limit], + ); + PTDEBUG && _d('MySQL index:', $mysql_index); + + if ( lc($index) ne lc($mysql_index) ) { + # Our chosen index and MySQL's chosen index are different. + # This probably happens due to a --where clause that we don't + # know anything about but MySQL can optimize for by using + # another index. We use the MySQL index instead of our chosen index + # because the MySQL optimizer should know best. + my $chosen_index_struct = $tbl->{tbl_struct}->{keys}->{$index}; + my $mysql_index_struct = $tbl->{tbl_struct}->{keys}->{$mysql_index}; + warn "The best index for chunking $tbl->{name} is $index (" + . ($chosen_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$chosen_index_struct->{cols}} + . 
" columns), but index $mysql_index (" + . ($mysql_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$mysql_index_struct->{cols}} + . " columns) that MySQL chose will be used instead.\n"; + $index = $mysql_index; + } + } + + # All statements from here on will use FORCE INDEX now that we know + # which index is best. + my $from = "$tbl->{name} FORCE INDEX(`$index`)"; # If we're resuming, this fetches the effective first row, which # should differ from the real first row. Called once in _get_bounds(). @@ -224,14 +275,11 @@ sub new { . " /*explain $comments{nibble}*/"; PTDEBUG && _d('Explain nibble statement:', $explain_nibble_sql); - my $limit = $chunk_size - 1; - PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - $self = { %args, index => $index, limit => $limit, - first_lb_sql => $first_lb_sql, + first_lower => $first_lower, last_ub_sql => $last_ub_sql, ub_sql => $ub_sql, nibble_sql => $nibble_sql, @@ -429,7 +477,7 @@ sub can_nibble { my ($cxn, $tbl, $chunk_size, $o) = @args{@required_args}; # About how many rows are there? - my ($row_est, $mysql_index) = get_row_estimate( + my $row_est = get_row_estimate( Cxn => $cxn, tbl => $tbl, where => $o->has('where') ? $o->get('where') : '', @@ -452,7 +500,7 @@ sub can_nibble { } # Get an index to nibble by. We'll order rows by the index's columns. 
- my $index = _find_best_index(%args, mysql_index => $mysql_index); + my $index = _find_best_index(%args); if ( !$index && !$one_nibble ) { die "There is no good index and the table is oversized."; } @@ -561,6 +609,18 @@ sub _get_index_cardinality { return $cardinality; } +sub _get_mysql_index { + my (%args) = @_; + my @required_args = qw(Cxn sth params); + my ($cxn, $sth, $params) = @args{@required_args}; + PTDEBUG && _d($sth->{Statement}, 'params:', @$params); + $sth->execute(@$params); + my $row = $sth->fetchrow_hashref(); + $sth->finish(); + PTDEBUG && _d(Dumper($row)); + return $row->{key}; +} + sub get_row_estimate { my (%args) = @_; my @required_args = qw(Cxn tbl); @@ -570,11 +630,11 @@ sub get_row_estimate { my ($cxn, $tbl) = @args{@required_args}; my $sql = "EXPLAIN SELECT * FROM $tbl->{name} " - . "WHERE " . ($args{where} || '1=1'); + . "WHERE " . ($args{where} || '1=1 /*get row estimate*/'); PTDEBUG && _d($sql); my $expl = $cxn->dbh()->selectrow_hashref($sql); PTDEBUG && _d(Dumper($expl)); - return ($expl->{rows} || 0), $expl->{key}; + return $expl->{rows} || 0; } sub _prepare_sths { @@ -606,10 +666,6 @@ sub _get_bounds { my $dbh = $self->{Cxn}->dbh(); - # Get the real first lower boundary. - $self->{first_lower} = $dbh->selectrow_arrayref($self->{first_lb_sql}); - PTDEBUG && _d('First lower boundary:', Dumper($self->{first_lower})); - # The next boundary is the first lower boundary. If resuming, # this should be something > the real first lower boundary and # bounded (else it's not one of our chunks). 
diff --git a/t/pt-table-checksum/chunk_index.t b/t/pt-table-checksum/chunk_index.t index 3aae20f7..48bab4e5 100644 --- a/t/pt-table-checksum/chunk_index.t +++ b/t/pt-table-checksum/chunk_index.t @@ -25,7 +25,7 @@ if ( !$dbh ) { plan skip_all => 'Cannot connect to sandbox master'; } else { - plan tests => 10; + plan tests => 11; } # The sandbox servers run with lock_wait_timeout=3 and it's not dynamic @@ -141,6 +141,22 @@ is( "14 rows checksummed (bug 925855)" ); +# ############################################################################# +# Bug 978432: PK is ignored +# ############################################################################# +$sb->load_file('master', "t/pt-table-checksum/samples/not-using-pk-bug.sql"); +PerconaTest::wait_for_table($dbh, "test.multi_resource_apt", "apt_id=4 AND res_id=4"); + +ok( + no_diff( + sub { pt_table_checksum::main(@args, + qw(-t test.multi_resource_apt --chunk-size 2 --explain --explain)) + }, + "t/pt-table-checksum/samples/not-using-pk-bug.out", + ), + "Smarter chunk index selection (bug 978432)" +); + # ############################################################################# # Done. # ############################################################################# diff --git a/t/pt-table-checksum/samples/not-using-pk-bug.out b/t/pt-table-checksum/samples/not-using-pk-bug.out new file mode 100644 index 00000000..f9640d33 --- /dev/null +++ b/t/pt-table-checksum/samples/not-using-pk-bug.out @@ -0,0 +1,20 @@ +-- +-- test.multi_resource_apt +-- + +REPLACE INTO `percona`.`checksums` (db, tbl, chunk, chunk_index, lower_boundary, upper_boundary, this_cnt, this_crc) SELECT ?, ?, ?, ?, ?, ?, COUNT(*) AS cnt, COALESCE(LOWER(CONV(BIT_XOR(CAST(CRC32(CONCAT_WS('#', `apt_id`, `res_id`)) AS UNSIGNED)), 10, 16)), 0) AS crc FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` > ?) OR (`apt_id` = ? AND `res_id` >= ?)) AND ((`apt_id` < ?) OR (`apt_id` = ? 
AND `res_id` <= ?)) /*checksum chunk*/ + +REPLACE INTO `percona`.`checksums` (db, tbl, chunk, chunk_index, lower_boundary, upper_boundary, this_cnt, this_crc) SELECT ?, ?, ?, ?, ?, ?, COUNT(*), '0' FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` < ?) OR (`apt_id` = ? AND `res_id` < ?)) ORDER BY `apt_id`, `res_id` /*past lower chunk*/ + +REPLACE INTO `percona`.`checksums` (db, tbl, chunk, chunk_index, lower_boundary, upper_boundary, this_cnt, this_crc) SELECT ?, ?, ?, ?, ?, ?, COUNT(*), '0' FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` > ?) OR (`apt_id` = ? AND `res_id` > ?)) ORDER BY `apt_id`, `res_id` /*past upper chunk*/ + +SELECT /*!40001 SQL_NO_CACHE */ `apt_id`, `apt_id`, `res_id` FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` > ?) OR (`apt_id` = ? AND `res_id` >= ?)) ORDER BY `apt_id`, `res_id` LIMIT ?, 2 /*next chunk boundary*/ + +1 1,1,1 2,2,1 +2 2,2,2 3,3,1 +3 3,3,2 3,3,3 +4 4,4,1 4,4,2 +5 4,4,3 4,4,4 +6 1,1,1 +7 4,4,4 + diff --git a/t/pt-table-checksum/samples/not-using-pk-bug.sql b/t/pt-table-checksum/samples/not-using-pk-bug.sql new file mode 100644 index 00000000..b299513e --- /dev/null +++ b/t/pt-table-checksum/samples/not-using-pk-bug.sql @@ -0,0 +1,20 @@ +DROP DATABASE IF EXISTS test; +CREATE DATABASE test; +USE test; +CREATE TABLE `multi_resource_apt` ( + `apt_id` int(10) unsigned NOT NULL DEFAULT '0', + `res_id` int(10) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`apt_id`,`res_id`), + KEY `resid` (`res_id`) +) ENGINE=InnoDB; +INSERT INTO multi_resource_apt VALUES + (1, 1), + (2, 1), + (2, 2), + (3, 1), + (3, 2), + (3, 3), + (4, 1), + (4, 2), + (4, 3), + (4, 4);