Detect infinite loops. Use best non-unique index. Disable chunk size limit if chunk index is unique.

This commit is contained in:
Daniel Nichter
2011-09-23 17:33:23 -06:00
parent 07cb6010a2
commit 9f3e05691c
4 changed files with 220 additions and 30 deletions

View File

@@ -5249,17 +5249,19 @@ sub main {
# Check if the chunk is too large. If yes, then return 0 to
# skip this chunk and fetch the next boundary.
my $is_oversize = is_oversize_chunk(
%args,
chunk_size => $tbl->{chunk_size},
chunk_size_limit => $o->get('chunk-size-limit'),
);
if ( $is_oversize ) {
MKDEBUG && _d('Chunk', $args{nibbleno}, 'of table',
"$tbl->{db}.$tbl->{tbl}", 'is too large');
$tbl->{checksum_results}->{skipped}++;
$tbl->{nibble_time} = 0;
return 0; # next boundary
if ( $tbl->{chunk_size_limit} ) {
my $is_oversize = is_oversize_chunk(
%args,
chunk_size => $tbl->{chunk_size},
limit => $tbl->{chunk_size_limit},
);
if ( $is_oversize ) {
MKDEBUG && _d('Chunk', $args{nibbleno}, 'of table',
"$tbl->{db}.$tbl->{tbl}", 'is too large');
$tbl->{checksum_results}->{skipped}++;
$tbl->{nibble_time} = 0;
return 0; # next boundary
}
}
# Exec and time the chunk checksum query. If it fails, retry.
@@ -5426,6 +5428,16 @@ sub main {
TableNibbler => $tn,
TableParser => $tp,
);
my $chunk_index = $nibble_iter->nibble_index();
if ( $tbl->{tbl_struct}->{keys}->{$chunk_index}->{is_unique} ) {
MKDEBUG && _d('Disabling chunk size limit for table because',
'chunk index', $chunk_index, 'is unique');
$tbl->{chunk_size_limit} = 0;
}
else {
$tbl->{chunk_size_limit} = $o->get('chunk-size-limit');
}
# Finally, checksum the table.
# The "1 while" loop is necessary because we're executing REPLACE
@@ -5487,21 +5499,22 @@ sub exec_nibble {
my $lb_quoted = join(',', map { $q->quote_val($_) } @$lb);
my $ub_quoted = join(',', map { $q->quote_val($_) } @$ub);
my $chunk_idx = $$nibble_iter->nibble_index();
# Execute the REPLACE...SELECT checksum query.
# MKDEBUG && _d($sth->{Statement}, 'params:',
# );
$sth->execute(
# REPLACE INTO repl_table SELECT
$tbl->{db}, # db
$tbl->{tbl}, # tbl
$args{nibbleno}, # chunk
$nibble_iter->nibble_index(), # chunk_index
$lb_quoted, # lower_boundary
$ub_quoted, # upper_boundary
$tbl->{db}, # db
$tbl->{tbl}, # tbl
$args{nibbleno}, # chunk
$chunk_idx, # chunk_index
$lb_quoted, # lower_boundary
$ub_quoted, # upper_boundary
# this_cnt, this_crc WHERE
@$lb, # upper boundary values
@$ub, # lower boundary values
@$lb, # lower boundary values
@$ub, # upper boundary values
);
# Check if checksum query caused any warnings.
@@ -5749,12 +5762,12 @@ sub create_repl_table {
# Determine if the chunk is oversize.
#
# Required Arguments:
# * tbl - Standard tbl hashref
# * explain_sth - Sth to EXPLAIN the chunking query
# * lb - Arrayref with lower boundary values for explain_sth
# * ub - Arrayref with upper boundary values for explain_sth
# * chunk_size - Chunk size
# * chunk_size_limit - Chunk size limit
# * tbl - Standard tbl hashref
# * explain_sth - Sth to EXPLAIN the chunking query
# * lb - Arrayref with lower boundary values for explain_sth
# * ub - Arrayref with upper boundary values for explain_sth
# * chunk_size - Chunk size
# * limit - Chunk size limit
#
# Returns:
# True if EXPLAIN rows is >= chunk-size * chunk-size-limit, else false

View File

@@ -57,10 +57,7 @@ sub new {
my ($dbh, $tbl, $chunk_size, $o, $q) = @args{@required_args};
# Get an index to nibble by. We'll order rows by the index's columns.
my $index = $args{TableParser}->find_best_index(
$tbl->{tbl_struct},
$args{chunk_index},
);
my $index = _find_best_index(%args);
die "No index to nibble table $tbl->{db}.$tbl->{tbl}" unless $index;
my $index_cols = $tbl->{tbl_struct}->{keys}->{$index}->{cols};
@@ -290,6 +287,86 @@ sub set_chunk_size {
return;
}
# Find the best index to nibble the table by.  Preference order:
#   1. The caller-requested chunk_index, if it is PRIMARY or unique.
#   2. PRIMARY or the first unique index (in TableParser sort order).
#   3. The non-unique index with the highest cardinality; ties are
#      broken in favor of the index covering more columns.
#
# Required Arguments:
#   * tbl         - Standard tbl hashref with tbl_struct
#   * TableParser - TableParser object
#   * dbh         - dbh, used by _get_index_cardinality()
#   * Quoter      - Quoter object, used by _get_index_cardinality()
#
# Optional Arguments:
#   * chunk_index - Name of the index the caller wants to nibble by
#
# Returns:
#   Index name, or undef if the table has no usable index.
sub _find_best_index {
   my (%args) = @_;
   my @required_args = qw(tbl TableParser dbh Quoter);
   my ($tbl, $tp) = @args{@required_args};
   my $tbl_struct = $tbl->{tbl_struct};
   my $indexes    = $tbl_struct->{keys};

   my $best_index;
   my @possible_indexes;
   if ( my $want_index = $args{chunk_index} ) {
      MKDEBUG && _d('Want to use nibble index', $want_index);
      # A requested unique index wins outright; a non-unique one still
      # has to compete on cardinality below.
      if ( $want_index eq 'PRIMARY' || $indexes->{$want_index}->{is_unique} ) {
         $best_index = $want_index;
      }
      else {
         push @possible_indexes, $want_index;
      }
   }
   else {
      foreach my $index ( $tp->sort_indexes($tbl_struct) ) {
         if ( $index eq 'PRIMARY' || $indexes->{$index}->{is_unique} ) {
            $best_index = $index;
            last;
         }
         else {
            push @possible_indexes, $index;
         }
      }
   }

   if ( !$best_index && @possible_indexes ) {
      MKDEBUG && _d('No PRIMARY or unique indexes;',
         'will use index with highest cardinality');
      foreach my $index ( @possible_indexes ) {
         $indexes->{$index}->{cardinality} = _get_index_cardinality(
            %args,
            index => $index,
         );
      }
      @possible_indexes = sort {
         # Prefer the index with the highest cardinality (descending).
         # BUG FIX: this previously compared $b's cardinality to itself
         # ($b <=> $b), which is always 0, so cardinality never
         # affected the ordering and only the column-count tiebreak ran.
         my $cmp
            = $indexes->{$b}->{cardinality} <=> $indexes->{$a}->{cardinality};
         if ( $cmp == 0 ) {
            # Indexes have the same cardinality; prefer the one with
            # more columns.
            $cmp = scalar @{$indexes->{$b}->{cols}}
               <=> scalar @{$indexes->{$a}->{cols}};
         }
         $cmp;
      } @possible_indexes;
      $best_index = $possible_indexes[0];
   }

   MKDEBUG && _d('Best index:', $best_index);
   return $best_index;
}
# Estimate the cardinality of the given index from SHOW INDEXES,
# multiplying together the per-row cardinality values that MySQL
# reports for the index.
#
# Required Arguments:
#   * dbh    - dbh
#   * tbl    - Standard tbl hashref (db and tbl keys are used)
#   * index  - Name of the index to examine
#   * Quoter - Quoter object
#
# Returns:
#   Cardinality estimate (>= 1).
sub _get_index_cardinality {
   my (%args) = @_;
   my @required_args = qw(dbh tbl index Quoter);
   my ($dbh, $tbl, $index, $q) = @args{@required_args};
   my $sql = "SHOW INDEXES FROM " . $q->quote(@{$tbl}{qw(db tbl)})
           . " WHERE Key_name = '$index'";
   MKDEBUG && _d($sql);
   # NOTE(review): keying selectall_hashref on key_name collapses the
   # multiple rows of a multi-column index into one (every row shares
   # the same Key_name), so only one row's cardinality survives the
   # multiplication.  Also presumes the dbh lowercases column names
   # (FetchHashKeyName => 'NAME_lc') -- confirm both against callers.
   my $rows_by_key = $dbh->selectall_hashref($sql, 'key_name');
   my $cardinality = 1;
   for my $idx_row ( values %$rows_by_key ) {
      next unless $idx_row->{cardinality};  # skip undef/zero estimates
      $cardinality *= $idx_row->{cardinality};
   }
   MKDEBUG && _d('Index', $index, 'cardinality:', $cardinality);
   return $cardinality;
}
# Stub: presumably meant to report whether the given index is usable
# for nibbling, but the body is unimplemented -- it only unpacks its
# argument, so its return value is not meaningful.  Callers must not
# rely on it yet.  TODO: implement or remove.
sub _can_nibble_index {
my ($index) = @_;
}
sub _can_nibble_once {
my ($self) = @_;
my ($dbh, $tbl, $tp) = @{$self}{qw(dbh tbl TableParser)};
@@ -380,6 +457,22 @@ sub _next_boundaries {
if ( $boundary && @$boundary ) {
$self->{ub} = $boundary->[0]; # this nibble
if ( $boundary->[1] ) {
if ( $self->_identical_boundaries($boundary) ) {
my $tbl = $self->{tbl};
my $index = $tbl->{tbl_struct}->{keys}->{$self->{index}};
my $n_cols = scalar @{$index->{cols}};
my $chunkno = $self->{nibbleno} + 1;
die "Possible infinite loop detected! "
. "The upper boundary for chunk $chunkno is "
. "<" . join(', ', @{$boundary->[0]}) . "> and the lower "
. "boundary for chunk " . ($chunkno + 1) . " is also "
. "<" . join(', ', @{$boundary->[1]}) . ">. "
. "This usually happens when using a non-unique single "
. "column index. The current chunk index for table "
. "$tbl->{db}.$tbl->{tbl} is $self->{index} which is"
. ($index->{is_unique} ? '' : ' not') . " unique and covers "
. ($n_cols > 1 ? "$n_cols columns" : "1 column") . ".\n";
}
$self->{next_lb} = $boundary->[1]; # next nibble
}
else {
@@ -397,6 +490,20 @@ sub _next_boundaries {
return 1; # have boundary
}
# Return true if the two boundaries are value-for-value identical.
# An identical upper boundary (this nibble) and lower boundary (next
# nibble) means the nibbler would fetch the same rows forever.
#
# Arguments:
#   $boundaries - Arrayref: [0] upper boundary values, [1] lower
#                 boundary values for the next nibble (arrayrefs).
#
# Returns:
#   1 if every value matches, else 0.
sub _identical_boundaries {
   my ($self, $boundaries) = @_;
   my ($upper, $lower) = @{$boundaries}[0, 1];
   # Can't be identical unless both boundaries exist.
   return 0 unless $upper && $lower;
   foreach my $i ( 0 .. $#$upper ) {
      # One diff means the bounds aren't identical.
      return 0 if $lower->[$i] ne $upper->[$i];
   }
   MKDEBUG && _d('Infinite loop detected');
   return 1;
}
sub DESTROY {
my ( $self ) = @_;
foreach my $key ( keys %$self ) {

View File

@@ -38,7 +38,7 @@ if ( !$dbh ) {
plan skip_all => 'Cannot connect to sandbox master';
}
else {
plan tests => 21;
plan tests => 25;
}
my $q = new Quoter();
@@ -469,6 +469,55 @@ is_deeply(
"Nibble by 1 row"
);
# ############################################################################
# Avoid infinite loops.
# ############################################################################
$sb->load_file('master', "$in/bad_tables.sql");
$dbh->do('analyze table bad_tables.inv');
$ni = make_nibble_iter(
db => 'bad_tables',
tbl => 'inv',
argv => [qw(--databases bad_tables --chunk-size 3)],
);
$all_rows = $dbh->selectall_arrayref('select * from bad_tables.inv order by tee_id, on_id');
is(
$ni->nibble_index(),
'index_inv_on_tee_id_and_on_id',
'Use index with higest cardinality'
);
@rows = ();
while (my $row = $ni->next()) {
push @rows, $row;
}
is_deeply(
\@rows,
$all_rows,
'Selected all rows from non-unique index'
);
$dbh->do('alter table bad_tables.inv drop index index_inv_on_tee_id_and_on_id');
$ni = make_nibble_iter(
db => 'bad_tables',
tbl => 'inv',
argv => [qw(--databases bad_tables --chunk-size 7)],
);
is(
$ni->nibble_index(),
'index_inv_on_on_id',
'Using bad index'
);
throws_ok(
sub { for (1..50) { $ni->next() } },
qr/infinite loop/,
'Detects infinite loop'
);
# #############################################################################
# Done.
# #############################################################################

View File

@@ -0,0 +1,21 @@
-- Test fixture: tables that are difficult to nibble safely.
DROP DATABASE IF EXISTS bad_tables;
CREATE DATABASE bad_tables;
USE bad_tables;
-- This table can cause an infinite nibbling loop.
-- It has no PRIMARY or unique key; both indexes are non-unique, so
-- chunk boundaries chosen on index_inv_on_on_id alone can repeat
-- (many rows share the same on_id value).
CREATE TABLE `inv` (
`tee_id` int(11) NOT NULL,
`on_id` int(11) NOT NULL,
`updated_at` datetime DEFAULT NULL,
KEY `index_inv_on_on_id` (`on_id`),
KEY `index_inv_on_tee_id_and_on_id` (`tee_id`,`on_id`)
);
-- Rows deliberately duplicate on_id values across tee_id groups so
-- that single-column chunking by on_id yields identical boundaries.
INSERT INTO inv (tee_id, on_id) VALUES
(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 7), (1, 8), (1, 9),
(2, 1), (2, 2), (2, 3), (2, 5), (2, 6), (2, 7), (2, 8),
(3, 1), (3, 2), (3, 3), (3, 4),
(4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
(5,1),
(6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
(7, 1), (7, 2), (7, 3), (7, 4), (7, 5), (7, 6), (7, 7), (7, 8), (7, 9);