From e2073065b1a5a17acd3516dbb7c00622420af416 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Fri, 4 May 2012 17:46:34 -0600 Subject: [PATCH 1/2] EXPLAIN actual ascending nibble query to get MySQL's chosen index, and use that instead of the tool's chosen index. TODO: fix OobNibbleIterator.t and chunk_index.t. --- bin/pt-table-checksum | 80 ++++++++++---- lib/NibbleIterator.pm | 102 ++++++++++++++---- t/pt-table-checksum/chunk_index.t | 18 +++- .../samples/not-using-pk-bug.out | 20 ++++ .../samples/not-using-pk-bug.sql | 20 ++++ 5 files changed, 196 insertions(+), 44 deletions(-) create mode 100644 t/pt-table-checksum/samples/not-using-pk-bug.out create mode 100644 t/pt-table-checksum/samples/not-using-pk-bug.sql diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index 135f9888..8a2fc127 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -3580,6 +3580,9 @@ sub new { else { my $index = $nibble_params->{index}; # brevity my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols}; + my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); + my $limit = $chunk_size - 1; + PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); my $asc = $args{TableNibbler}->generate_asc_stmt( %args, @@ -3590,18 +3593,52 @@ sub new { ); PTDEBUG && _d('Ascend params:', Dumper($asc)); - my $from = "$tbl->{name} FORCE INDEX(`$index`)"; - my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); - my $first_lb_sql = "SELECT /*!40001 SQL_NO_CACHE */ " . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $from" + . " FROM $tbl->{name}" . ($where ? " WHERE $where" : '') . " ORDER BY $order_by" . " LIMIT 1" . 
" /*first lower boundary*/"; - PTDEBUG && _d('First lower boundary statement:', $first_lb_sql); + PTDEBUG && _d($first_lb_sql); + my $first_lower = $cxn->dbh()->selectrow_arrayref($first_lb_sql); + PTDEBUG && _d('First lower boundary:', Dumper($first_lower)); + + if ( !$args{chunk_index} || (lc($args{chunk_index}) ne lc($index)) ) { + + my $sql + = "EXPLAIN SELECT /*!40001 SQL_NO_CACHE */ " + . join(', ', map { $q->quote($_) } @{$asc->{scols}}) + . " FROM $tbl->{name}" + . " WHERE " . $asc->{boundaries}->{'>='} + . ($where ? " AND ($where)" : '') + . " ORDER BY $order_by" + . " LIMIT ?, 2" + . " /*get MySQL index*/"; + my $sth = $cxn->dbh()->prepare($sql); + my $mysql_index = _get_mysql_index( + Cxn => $cxn, + sth => $sth, + params => [@$first_lower, $limit], + ); + PTDEBUG && _d('MySQL index:', $mysql_index); + + if ( lc($index) ne lc($mysql_index) ) { + my $chosen_index_struct = $tbl->{tbl_struct}->{keys}->{$index}; + my $mysql_index_struct = $tbl->{tbl_struct}->{keys}->{$mysql_index}; + warn "The best index for chunking $tbl->{name} is $index (" + . ($chosen_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$chosen_index_struct->{cols}} + . " columns), but index $mysql_index (" + . ($mysql_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$mysql_index_struct->{cols}} + . " columns) that MySQL chose will be used instead.\n"; + $index = $mysql_index; + } + } + + my $from = "$tbl->{name} FORCE INDEX(`$index`)"; my $resume_lb_sql; if ( $args{resume} ) { @@ -3663,14 +3700,11 @@ sub new { . 
" /*explain $comments{nibble}*/"; PTDEBUG && _d('Explain nibble statement:', $explain_nibble_sql); - my $limit = $chunk_size - 1; - PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - $self = { %args, index => $index, limit => $limit, - first_lb_sql => $first_lb_sql, + first_lower => $first_lower, last_ub_sql => $last_ub_sql, ub_sql => $ub_sql, nibble_sql => $nibble_sql, @@ -3858,7 +3892,7 @@ sub can_nibble { } my ($cxn, $tbl, $chunk_size, $o) = @args{@required_args}; - my ($row_est, $mysql_index) = get_row_estimate( + my $row_est = get_row_estimate( Cxn => $cxn, tbl => $tbl, where => $o->has('where') ? $o->get('where') : '', @@ -3876,7 +3910,7 @@ sub can_nibble { $one_nibble = 1; } - my $index = _find_best_index(%args, mysql_index => $mysql_index); + my $index = _find_best_index(%args); if ( !$index && !$one_nibble ) { die "There is no good index and the table is oversized."; } @@ -3980,6 +4014,18 @@ sub _get_index_cardinality { return $cardinality; } +sub _get_mysql_index { + my (%args) = @_; + my @required_args = qw(Cxn sth params); + my ($cxn, $sth, $params) = @args{@required_args}; + PTDEBUG && _d($sth->{Statement}, 'params:', @$params); + $sth->execute(@$params); + my $row = $sth->fetchrow_hashref(); + $sth->finish(); + PTDEBUG && _d(Dumper($row)); + return $row->{key}; +} + sub get_row_estimate { my (%args) = @_; my @required_args = qw(Cxn tbl); @@ -3989,11 +4035,11 @@ sub get_row_estimate { my ($cxn, $tbl) = @args{@required_args}; my $sql = "EXPLAIN SELECT * FROM $tbl->{name} " - . "WHERE " . ($args{where} || '1=1'); + . "WHERE " . 
($args{where} || '1=1 /*get row estimate*/'); PTDEBUG && _d($sql); my $expl = $cxn->dbh()->selectrow_hashref($sql); PTDEBUG && _d(Dumper($expl)); - return ($expl->{rows} || 0), $expl->{key}; + return $expl->{rows} || 0; } sub _prepare_sths { @@ -4025,9 +4071,6 @@ sub _get_bounds { my $dbh = $self->{Cxn}->dbh(); - $self->{first_lower} = $dbh->selectrow_arrayref($self->{first_lb_sql}); - PTDEBUG && _d('First lower boundary:', Dumper($self->{first_lower})); - if ( my $nibble = $self->{resume} ) { if ( defined $nibble->{lower_boundary} && defined $nibble->{upper_boundary} ) { @@ -6387,10 +6430,7 @@ sub main { my $chunk_size_limit = $o->get('chunk-size-limit'); my @too_large; foreach my $slave ( @$slaves ) { - # get_row_estimate() returns (row_est, index), but - # we only need the row_est. Maybe in the future we'll - # care what index MySQL will use on a slave. - my ($n_rows) = NibbleIterator::get_row_estimate( + my $n_rows = NibbleIterator::get_row_estimate( Cxn => $slave, tbl => $tbl, where => $o->get('where'), diff --git a/lib/NibbleIterator.pm b/lib/NibbleIterator.pm index 2730fe6a..0be676df 100644 --- a/lib/NibbleIterator.pm +++ b/lib/NibbleIterator.pm @@ -120,8 +120,11 @@ sub new { else { my $index = $nibble_params->{index}; # brevity my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols}; + my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); + my $limit = $chunk_size - 1; + PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - # Figure out how to nibble the table with the index. + # Figure out how to nibble the table with the chosen index. my $asc = $args{TableNibbler}->generate_asc_stmt( %args, tbl_struct => $tbl->{tbl_struct}, @@ -131,23 +134,71 @@ sub new { ); PTDEBUG && _d('Ascend params:', Dumper($asc)); - # Make SQL statements, prepared on first call to next(). FROM and - # ORDER BY are the same for all statements. FORCE IDNEX and ORDER BY - # are needed to ensure deterministic nibbling. 
- my $from = "$tbl->{name} FORCE INDEX(`$index`)"; - my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); - - # The real first row in the table. Usually we start nibbling from - # this row. Called once in _get_bounds(). + # Get the real first lower boundary. Using this plus the chosen index, + # we'll see what index MySQL wants to use to ascend the table. This + # is only executed once, and the first lower boundary is saved so we + # can start nibbling from it later. my $first_lb_sql = "SELECT /*!40001 SQL_NO_CACHE */ " . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $from" + . " FROM $tbl->{name}" . ($where ? " WHERE $where" : '') . " ORDER BY $order_by" . " LIMIT 1" . " /*first lower boundary*/"; - PTDEBUG && _d('First lower boundary statement:', $first_lb_sql); + PTDEBUG && _d($first_lb_sql); + my $first_lower = $cxn->dbh()->selectrow_arrayref($first_lb_sql); + PTDEBUG && _d('First lower boundary:', Dumper($first_lower)); + + # If the user didn't request a --chunk-index or they did but + # it wasn't chosen, then check which index MySQL wants to use + # to ascend the table. + if ( !$args{chunk_index} || (lc($args{chunk_index}) ne lc($index)) ) { + + # This statment must be identical to the (poorly named) ub_sql below + # (aka "next chunk boundary") because ub_sql is what ascends the table + # and therefore might cause a table scan. The difference between this + # statement and the real ub_sql below is that here we do not add + # FORCE INDEX but let MySQL chose the index. + my $sql + = "EXPLAIN SELECT /*!40001 SQL_NO_CACHE */ " + . join(', ', map { $q->quote($_) } @{$asc->{scols}}) + . " FROM $tbl->{name}" + . " WHERE " . $asc->{boundaries}->{'>='} + . ($where ? " AND ($where)" : '') + . " ORDER BY $order_by" + . " LIMIT ?, 2" + . 
" /*get MySQL index*/"; + my $sth = $cxn->dbh()->prepare($sql); + my $mysql_index = _get_mysql_index( + Cxn => $cxn, + sth => $sth, + params => [@$first_lower, $limit], + ); + PTDEBUG && _d('MySQL index:', $mysql_index); + + if ( lc($index) ne lc($mysql_index) ) { + # Our chosen index and MySQL's chosen index are different. + # This probably happens due to a --where clause that we don't + # know anything about but MySQL can optimize for by using + # another index. We use the MySQL instead of our chosen index + # because the MySQL optimizer should know best. + my $chosen_index_struct = $tbl->{tbl_struct}->{keys}->{$index}; + my $mysql_index_struct = $tbl->{tbl_struct}->{keys}->{$mysql_index}; + warn "The best index for chunking $tbl->{name} is $index (" + . ($chosen_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$chosen_index_struct->{cols}} + . " columns), but index $mysql_index (" + . ($mysql_index_struct->{is_unique} ? "unique" : "not unique") + . ", covers " . scalar @{$mysql_index_struct->{cols}} + . " columns) that MySQL chose will be used instead.\n"; + $index = $mysql_index; + } + } + + # All statements from here on will use FORCE INDEX now that we know + # which index is best. + my $from = "$tbl->{name} FORCE INDEX(`$index`)"; # If we're resuming, this fetches the effective first row, which # should differ from the real first row. Called once in _get_bounds(). @@ -224,14 +275,11 @@ sub new { . " /*explain $comments{nibble}*/"; PTDEBUG && _d('Explain nibble statement:', $explain_nibble_sql); - my $limit = $chunk_size - 1; - PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - $self = { %args, index => $index, limit => $limit, - first_lb_sql => $first_lb_sql, + first_lower => $first_lower, last_ub_sql => $last_ub_sql, ub_sql => $ub_sql, nibble_sql => $nibble_sql, @@ -429,7 +477,7 @@ sub can_nibble { my ($cxn, $tbl, $chunk_size, $o) = @args{@required_args}; # About how many rows are there? 
- my ($row_est, $mysql_index) = get_row_estimate( + my $row_est = get_row_estimate( Cxn => $cxn, tbl => $tbl, where => $o->has('where') ? $o->get('where') : '', @@ -452,7 +500,7 @@ sub can_nibble { } # Get an index to nibble by. We'll order rows by the index's columns. - my $index = _find_best_index(%args, mysql_index => $mysql_index); + my $index = _find_best_index(%args); if ( !$index && !$one_nibble ) { die "There is no good index and the table is oversized."; } @@ -561,6 +609,18 @@ sub _get_index_cardinality { return $cardinality; } +sub _get_mysql_index { + my (%args) = @_; + my @required_args = qw(Cxn sth params); + my ($cxn, $sth, $params) = @args{@required_args}; + PTDEBUG && _d($sth->{Statement}, 'params:', @$params); + $sth->execute(@$params); + my $row = $sth->fetchrow_hashref(); + $sth->finish(); + PTDEBUG && _d(Dumper($row)); + return $row->{key}; +} + sub get_row_estimate { my (%args) = @_; my @required_args = qw(Cxn tbl); @@ -570,11 +630,11 @@ sub get_row_estimate { my ($cxn, $tbl) = @args{@required_args}; my $sql = "EXPLAIN SELECT * FROM $tbl->{name} " - . "WHERE " . ($args{where} || '1=1'); + . "WHERE " . ($args{where} || '1=1 /*get row estimate*/'); PTDEBUG && _d($sql); my $expl = $cxn->dbh()->selectrow_hashref($sql); PTDEBUG && _d(Dumper($expl)); - return ($expl->{rows} || 0), $expl->{key}; + return $expl->{rows} || 0; } sub _prepare_sths { @@ -606,10 +666,6 @@ sub _get_bounds { my $dbh = $self->{Cxn}->dbh(); - # Get the real first lower boundary. - $self->{first_lower} = $dbh->selectrow_arrayref($self->{first_lb_sql}); - PTDEBUG && _d('First lower boundary:', Dumper($self->{first_lower})); - # The next boundary is the first lower boundary. If resuming, # this should be something > the real first lower boundary and # bounded (else it's not one of our chunks). 
diff --git a/t/pt-table-checksum/chunk_index.t b/t/pt-table-checksum/chunk_index.t index 3aae20f7..48bab4e5 100644 --- a/t/pt-table-checksum/chunk_index.t +++ b/t/pt-table-checksum/chunk_index.t @@ -25,7 +25,7 @@ if ( !$dbh ) { plan skip_all => 'Cannot connect to sandbox master'; } else { - plan tests => 10; + plan tests => 11; } # The sandbox servers run with lock_wait_timeout=3 and it's not dynamic @@ -141,6 +141,22 @@ is( "14 rows checksummed (bug 925855)" ); +# ############################################################################# +# Bug 978432: PK is ignored +# ############################################################################# +$sb->load_file('master', "t/pt-table-checksum/samples/not-using-pk-bug.sql"); +PerconaTest::wait_for_table($dbh, "test.multi_resource_apt", "apt_id=4 AND res_id=4"); + +ok( + no_diff( + sub { pt_table_checksum::main(@args, + qw(-t test.multi_resource_apt --chunk-size 2 --explain --explain)) + }, + "t/pt-table-checksum/samples/not-using-pk-bug.out", + ), + "Smarter chunk index selection (bug 978432)" +); + # ############################################################################# # Done. # ############################################################################# diff --git a/t/pt-table-checksum/samples/not-using-pk-bug.out b/t/pt-table-checksum/samples/not-using-pk-bug.out new file mode 100644 index 00000000..f9640d33 --- /dev/null +++ b/t/pt-table-checksum/samples/not-using-pk-bug.out @@ -0,0 +1,20 @@ +-- +-- test.multi_resource_apt +-- + +REPLACE INTO `percona`.`checksums` (db, tbl, chunk, chunk_index, lower_boundary, upper_boundary, this_cnt, this_crc) SELECT ?, ?, ?, ?, ?, ?, COUNT(*) AS cnt, COALESCE(LOWER(CONV(BIT_XOR(CAST(CRC32(CONCAT_WS('#', `apt_id`, `res_id`)) AS UNSIGNED)), 10, 16)), 0) AS crc FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` > ?) OR (`apt_id` = ? AND `res_id` >= ?)) AND ((`apt_id` < ?) OR (`apt_id` = ? 
AND `res_id` <= ?)) /*checksum chunk*/ + +REPLACE INTO `percona`.`checksums` (db, tbl, chunk, chunk_index, lower_boundary, upper_boundary, this_cnt, this_crc) SELECT ?, ?, ?, ?, ?, ?, COUNT(*), '0' FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` < ?) OR (`apt_id` = ? AND `res_id` < ?)) ORDER BY `apt_id`, `res_id` /*past lower chunk*/ + +REPLACE INTO `percona`.`checksums` (db, tbl, chunk, chunk_index, lower_boundary, upper_boundary, this_cnt, this_crc) SELECT ?, ?, ?, ?, ?, ?, COUNT(*), '0' FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` > ?) OR (`apt_id` = ? AND `res_id` > ?)) ORDER BY `apt_id`, `res_id` /*past upper chunk*/ + +SELECT /*!40001 SQL_NO_CACHE */ `apt_id`, `apt_id`, `res_id` FROM `test`.`multi_resource_apt` FORCE INDEX(`PRIMARY`) WHERE ((`apt_id` > ?) OR (`apt_id` = ? AND `res_id` >= ?)) ORDER BY `apt_id`, `res_id` LIMIT ?, 2 /*next chunk boundary*/ + +1 1,1,1 2,2,1 +2 2,2,2 3,3,1 +3 3,3,2 3,3,3 +4 4,4,1 4,4,2 +5 4,4,3 4,4,4 +6 1,1,1 +7 4,4,4 + diff --git a/t/pt-table-checksum/samples/not-using-pk-bug.sql b/t/pt-table-checksum/samples/not-using-pk-bug.sql new file mode 100644 index 00000000..b299513e --- /dev/null +++ b/t/pt-table-checksum/samples/not-using-pk-bug.sql @@ -0,0 +1,20 @@ +DROP DATABASE IF EXISTS test; +CREATE DATABASE test; +USE test; +CREATE TABLE `multi_resource_apt` ( + `apt_id` int(10) unsigned NOT NULL DEFAULT '0', + `res_id` int(10) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`apt_id`,`res_id`), + KEY `resid` (`res_id`) +) ENGINE=InnoDB; +INSERT INTO multi_resource_apt VALUES + (1, 1), + (2, 1), + (2, 2), + (3, 1), + (3, 2), + (3, 3), + (4, 1), + (4, 2), + (4, 3), + (4, 4); From 30b6b887666c4c07b40a85fd5e0c005a4edab171 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 8 May 2012 12:43:47 -0600 Subject: [PATCH 2/2] Restore original NibbleIterator and implement simpler solution: only use MySQL's chosen index if --where. 
--- bin/pt-table-checksum | 85 +++++++++--------------------- lib/NibbleIterator.pm | 117 +++++++++++++----------------------------- 2 files changed, 61 insertions(+), 141 deletions(-) diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index 8a2fc127..472f29f9 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -3580,9 +3580,6 @@ sub new { else { my $index = $nibble_params->{index}; # brevity my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols}; - my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); - my $limit = $chunk_size - 1; - PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); my $asc = $args{TableNibbler}->generate_asc_stmt( %args, @@ -3593,52 +3590,18 @@ sub new { ); PTDEBUG && _d('Ascend params:', Dumper($asc)); + my $from = "$tbl->{name} FORCE INDEX(`$index`)"; + my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); + my $first_lb_sql = "SELECT /*!40001 SQL_NO_CACHE */ " . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $tbl->{name}" + . " FROM $from" . ($where ? " WHERE $where" : '') . " ORDER BY $order_by" . " LIMIT 1" . " /*first lower boundary*/"; - PTDEBUG && _d($first_lb_sql); - my $first_lower = $cxn->dbh()->selectrow_arrayref($first_lb_sql); - PTDEBUG && _d('First lower boundary:', Dumper($first_lower)); - - if ( !$args{chunk_index} || (lc($args{chunk_index}) ne lc($index)) ) { - - my $sql - = "EXPLAIN SELECT /*!40001 SQL_NO_CACHE */ " - . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $tbl->{name}" - . " WHERE " . $asc->{boundaries}->{'>='} - . ($where ? " AND ($where)" : '') - . " ORDER BY $order_by" - . " LIMIT ?, 2" - . 
" /*get MySQL index*/"; - my $sth = $cxn->dbh()->prepare($sql); - my $mysql_index = _get_mysql_index( - Cxn => $cxn, - sth => $sth, - params => [@$first_lower, $limit], - ); - PTDEBUG && _d('MySQL index:', $mysql_index); - - if ( lc($index) ne lc($mysql_index) ) { - my $chosen_index_struct = $tbl->{tbl_struct}->{keys}->{$index}; - my $mysql_index_struct = $tbl->{tbl_struct}->{keys}->{$mysql_index}; - warn "The best index for chunking $tbl->{name} is $index (" - . ($chosen_index_struct->{is_unique} ? "unique" : "not unique") - . ", covers " . scalar @{$chosen_index_struct->{cols}} - . " columns), but index $mysql_index (" - . ($mysql_index_struct->{is_unique} ? "unique" : "not unique") - . ", covers " . scalar @{$mysql_index_struct->{cols}} - . " columns) that MySQL chose will be used instead.\n"; - $index = $mysql_index; - } - } - - my $from = "$tbl->{name} FORCE INDEX(`$index`)"; + PTDEBUG && _d('First lower boundary statement:', $first_lb_sql); my $resume_lb_sql; if ( $args{resume} ) { @@ -3700,11 +3663,14 @@ sub new { . " /*explain $comments{nibble}*/"; PTDEBUG && _d('Explain nibble statement:', $explain_nibble_sql); + my $limit = $chunk_size - 1; + PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); + $self = { %args, index => $index, limit => $limit, - first_lower => $first_lower, + first_lb_sql => $first_lb_sql, last_ub_sql => $last_ub_sql, ub_sql => $ub_sql, nibble_sql => $nibble_sql, @@ -3892,12 +3858,18 @@ sub can_nibble { } my ($cxn, $tbl, $chunk_size, $o) = @args{@required_args}; - my $row_est = get_row_estimate( + my $where = $o->has('where') ? $o->get('where') : ''; + + my ($row_est, $mysql_index) = get_row_estimate( Cxn => $cxn, tbl => $tbl, - where => $o->has('where') ? $o->get('where') : '', + where => $where, ); + if ( !$where ) { + $mysql_index = undef; + } + my $one_nibble = !defined $args{one_nibble} || $args{one_nibble} ? 
$row_est <= $chunk_size * $o->get('chunk-size-limit') : 0; @@ -3910,7 +3882,7 @@ sub can_nibble { $one_nibble = 1; } - my $index = _find_best_index(%args); + my $index = _find_best_index(%args, mysql_index => $mysql_index); if ( !$index && !$one_nibble ) { die "There is no good index and the table is oversized."; } @@ -4014,18 +3986,6 @@ sub _get_index_cardinality { return $cardinality; } -sub _get_mysql_index { - my (%args) = @_; - my @required_args = qw(Cxn sth params); - my ($cxn, $sth, $params) = @args{@required_args}; - PTDEBUG && _d($sth->{Statement}, 'params:', @$params); - $sth->execute(@$params); - my $row = $sth->fetchrow_hashref(); - $sth->finish(); - PTDEBUG && _d(Dumper($row)); - return $row->{key}; -} - sub get_row_estimate { my (%args) = @_; my @required_args = qw(Cxn tbl); @@ -4035,11 +3995,11 @@ sub get_row_estimate { my ($cxn, $tbl) = @args{@required_args}; my $sql = "EXPLAIN SELECT * FROM $tbl->{name} " - . "WHERE " . ($args{where} || '1=1 /*get row estimate*/'); + . "WHERE " . 
($args{where} || '1=1'); PTDEBUG && _d($sql); my $expl = $cxn->dbh()->selectrow_hashref($sql); PTDEBUG && _d(Dumper($expl)); - return $expl->{rows} || 0; + return ($expl->{rows} || 0), $expl->{key}; } sub _prepare_sths { @@ -4071,6 +4031,9 @@ sub _get_bounds { my $dbh = $self->{Cxn}->dbh(); + $self->{first_lower} = $dbh->selectrow_arrayref($self->{first_lb_sql}); + PTDEBUG && _d('First lower boundary:', Dumper($self->{first_lower})); + if ( my $nibble = $self->{resume} ) { if ( defined $nibble->{lower_boundary} && defined $nibble->{upper_boundary} ) { @@ -6430,7 +6393,7 @@ sub main { my $chunk_size_limit = $o->get('chunk-size-limit'); my @too_large; foreach my $slave ( @$slaves ) { - my $n_rows = NibbleIterator::get_row_estimate( + my ($n_rows) = NibbleIterator::get_row_estimate( Cxn => $slave, tbl => $tbl, where => $o->get('where'), diff --git a/lib/NibbleIterator.pm b/lib/NibbleIterator.pm index 0be676df..6e446f4c 100644 --- a/lib/NibbleIterator.pm +++ b/lib/NibbleIterator.pm @@ -120,11 +120,8 @@ sub new { else { my $index = $nibble_params->{index}; # brevity my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols}; - my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); - my $limit = $chunk_size - 1; - PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); - # Figure out how to nibble the table with the chosen index. + # Figure out how to nibble the table with the index. my $asc = $args{TableNibbler}->generate_asc_stmt( %args, tbl_struct => $tbl->{tbl_struct}, @@ -134,71 +131,23 @@ sub new { ); PTDEBUG && _d('Ascend params:', Dumper($asc)); - # Get the real first lower boundary. Using this plus the chosen index, - # we'll see what index MySQL wants to use to ascend the table. This - # is only executed once, and the first lower boundary is saved so we - # can start nibbling from it later. + # Make SQL statements, prepared on first call to next(). FROM and + # ORDER BY are the same for all statements. 
FORCE IDNEX and ORDER BY + # are needed to ensure deterministic nibbling. + my $from = "$tbl->{name} FORCE INDEX(`$index`)"; + my $order_by = join(', ', map {$q->quote($_)} @{$index_cols}); + + # The real first row in the table. Usually we start nibbling from + # this row. Called once in _get_bounds(). my $first_lb_sql = "SELECT /*!40001 SQL_NO_CACHE */ " . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $tbl->{name}" + . " FROM $from" . ($where ? " WHERE $where" : '') . " ORDER BY $order_by" . " LIMIT 1" . " /*first lower boundary*/"; - PTDEBUG && _d($first_lb_sql); - my $first_lower = $cxn->dbh()->selectrow_arrayref($first_lb_sql); - PTDEBUG && _d('First lower boundary:', Dumper($first_lower)); - - # If the user didn't request a --chunk-index or they did but - # it wasn't chosen, then check which index MySQL wants to use - # to ascend the table. - if ( !$args{chunk_index} || (lc($args{chunk_index}) ne lc($index)) ) { - - # This statment must be identical to the (poorly named) ub_sql below - # (aka "next chunk boundary") because ub_sql is what ascends the table - # and therefore might cause a table scan. The difference between this - # statement and the real ub_sql below is that here we do not add - # FORCE INDEX but let MySQL chose the index. - my $sql - = "EXPLAIN SELECT /*!40001 SQL_NO_CACHE */ " - . join(', ', map { $q->quote($_) } @{$asc->{scols}}) - . " FROM $tbl->{name}" - . " WHERE " . $asc->{boundaries}->{'>='} - . ($where ? " AND ($where)" : '') - . " ORDER BY $order_by" - . " LIMIT ?, 2" - . " /*get MySQL index*/"; - my $sth = $cxn->dbh()->prepare($sql); - my $mysql_index = _get_mysql_index( - Cxn => $cxn, - sth => $sth, - params => [@$first_lower, $limit], - ); - PTDEBUG && _d('MySQL index:', $mysql_index); - - if ( lc($index) ne lc($mysql_index) ) { - # Our chosen index and MySQL's chosen index are different. 
- # This probably happens due to a --where clause that we don't - # know anything about but MySQL can optimize for by using - # another index. We use the MySQL instead of our chosen index - # because the MySQL optimizer should know best. - my $chosen_index_struct = $tbl->{tbl_struct}->{keys}->{$index}; - my $mysql_index_struct = $tbl->{tbl_struct}->{keys}->{$mysql_index}; - warn "The best index for chunking $tbl->{name} is $index (" - . ($chosen_index_struct->{is_unique} ? "unique" : "not unique") - . ", covers " . scalar @{$chosen_index_struct->{cols}} - . " columns), but index $mysql_index (" - . ($mysql_index_struct->{is_unique} ? "unique" : "not unique") - . ", covers " . scalar @{$mysql_index_struct->{cols}} - . " columns) that MySQL chose will be used instead.\n"; - $index = $mysql_index; - } - } - - # All statements from here on will use FORCE INDEX now that we know - # which index is best. - my $from = "$tbl->{name} FORCE INDEX(`$index`)"; + PTDEBUG && _d('First lower boundary statement:', $first_lb_sql); # If we're resuming, this fetches the effective first row, which # should differ from the real first row. Called once in _get_bounds(). @@ -275,11 +224,14 @@ sub new { . " /*explain $comments{nibble}*/"; PTDEBUG && _d('Explain nibble statement:', $explain_nibble_sql); + my $limit = $chunk_size - 1; + PTDEBUG && _d('Initial chunk size (LIMIT):', $limit); + $self = { %args, index => $index, limit => $limit, - first_lower => $first_lower, + first_lb_sql => $first_lb_sql, last_ub_sql => $last_ub_sql, ub_sql => $ub_sql, nibble_sql => $nibble_sql, @@ -476,13 +428,26 @@ sub can_nibble { } my ($cxn, $tbl, $chunk_size, $o) = @args{@required_args}; + my $where = $o->has('where') ? $o->get('where') : ''; + # About how many rows are there? - my $row_est = get_row_estimate( + my ($row_est, $mysql_index) = get_row_estimate( Cxn => $cxn, tbl => $tbl, - where => $o->has('where') ? 
$o->get('where') : '', + where => $where, ); + # MySQL's chosen index is only something we should prefer + # if --where is used. Else, we can choose our own index + # and disregard the MySQL index from the row estimate. + # If there's a --where, however, then MySQL's chosen index + # is used because it tells us how MySQL plans to optimize + # for the --where. + # https://bugs.launchpad.net/percona-toolkit/+bug/978432 + if ( !$where ) { + $mysql_index = undef; + } + # Can all those rows be nibbled in one chunk? If one_nibble is defined, # then do as it says; else, look at the chunk size limit. my $one_nibble = !defined $args{one_nibble} || $args{one_nibble} @@ -500,7 +465,7 @@ sub can_nibble { } # Get an index to nibble by. We'll order rows by the index's columns. - my $index = _find_best_index(%args); + my $index = _find_best_index(%args, mysql_index => $mysql_index); if ( !$index && !$one_nibble ) { die "There is no good index and the table is oversized."; } @@ -609,18 +574,6 @@ sub _get_index_cardinality { return $cardinality; } -sub _get_mysql_index { - my (%args) = @_; - my @required_args = qw(Cxn sth params); - my ($cxn, $sth, $params) = @args{@required_args}; - PTDEBUG && _d($sth->{Statement}, 'params:', @$params); - $sth->execute(@$params); - my $row = $sth->fetchrow_hashref(); - $sth->finish(); - PTDEBUG && _d(Dumper($row)); - return $row->{key}; -} - sub get_row_estimate { my (%args) = @_; my @required_args = qw(Cxn tbl); @@ -630,11 +583,11 @@ sub get_row_estimate { my ($cxn, $tbl) = @args{@required_args}; my $sql = "EXPLAIN SELECT * FROM $tbl->{name} " - . "WHERE " . ($args{where} || '1=1 /*get row estimate*/'); + . "WHERE " . ($args{where} || '1=1'); PTDEBUG && _d($sql); my $expl = $cxn->dbh()->selectrow_hashref($sql); PTDEBUG && _d(Dumper($expl)); - return $expl->{rows} || 0; + return ($expl->{rows} || 0), $expl->{key}; } sub _prepare_sths { @@ -666,6 +619,10 @@ sub _get_bounds { my $dbh = $self->{Cxn}->dbh(); + # Get the real first lower boundary.
+ $self->{first_lower} = $dbh->selectrow_arrayref($self->{first_lb_sql}); + PTDEBUG && _d('First lower boundary:', Dumper($self->{first_lower})); + # The next boundary is the first lower boundary. If resuming, # this should be something > the real first lower boundary and # bounded (else it's not one of our chunks).