From c989bd29ddcfb45e3f7103d080d7c50bb76784b7 Mon Sep 17 00:00:00 2001
From: Daniel Nichter
Date: Fri, 9 Sep 2011 17:29:28 -0600
Subject: [PATCH] First working NibbleIterator.

---
  Reviewer notes.  Text between the three-dash separator and the diff is
  commentary only.  The line structure of this patch was restored from a
  whitespace-collapsed copy, so indentation inside lines is a best guess.
  Five added lines differ from the original submission.  Each was changed
  in place (no line added or removed), so every hunk header and the
  diffstat below still hold; only the post-image blob id on the
  lib/NibbleIterator.pm "index" line is stale.

  * new(), $last_ub_sql: "$order_by DESC" reversed only the last index
    column, so a multi-column index produced the wrong last upper
    boundary.  Every index column is now ordered DESC.
  * new(), $one_nibble_sql: the optional filter was appended as
    " AND (...)" with no preceding WHERE, which is invalid SQL.  It is
    now emitted as " WHERE ...", matching $first_lb_sql.
  * row_number(): returned the nibble counter; it now returns
    $self->{rowno}.
  * _check_index_usage(): this patch stops storing "asc" in $self, so
    the die message now reads $self->{index}; the comparison also
    lower-cases both sides so that an index such as PRIMARY can match.
  * new(): comment typo "IDNEX" corrected in the added comment line.

 lib/NibbleIterator.pm  | 215 ++++++++++++++++++++++++++++-------------
 t/lib/NibbleIterator.t | 160 +++++++++++++++++++++++++++++-
 2 files changed, 306 insertions(+), 69 deletions(-)

diff --git a/lib/NibbleIterator.pm b/lib/NibbleIterator.pm
index 87610130..777fa2ad 100644
--- a/lib/NibbleIterator.pm
+++ b/lib/NibbleIterator.pm
@@ -56,25 +56,34 @@ sub new {
       asc_only => 1,
    );
 
-   # Make SQL statements, prepared on first call to next(). The preamble
-   # and ORDER BY are the same for all statements. FORCE IDNEX and ORDER BY
+   # Make SQL statements, prepared on first call to next(). FROM and
+   # ORDER BY columns are shared by all statements. FORCE INDEX and ORDER BY
    # are needed to ensure deterministic nibbling.
-   my $nibble_sql_preamble
-      = "SELECT /*!40001 SQL_NO_CACHE */ "
-      . join(', ', map { $q->quote($_) } @{$asc->{cols}})
-      . " FROM " . $q->quote(@{$tbl}{qw(db tbl)})
-      . " FORCE INDEX(`$index`)";
+   my $from = " FROM " . $q->quote(@{$tbl}{qw(db tbl)})
+            . " FORCE INDEX(`$index`)";
    my $order_by = "ORDER BY " . join(', ', map {$q->quote($_)} @{$index_cols});
 
-   # This statement is only executed once, so it doesn't use a sth.
+   # These statements are only executed once, so they don't use sths.
    my $first_lb_sql
-      = $nibble_sql_preamble
+      = "SELECT /*!40001 SQL_NO_CACHE */ "
+      . join(', ', map { $q->quote($_) } @{$index_cols})
+      . " $from "
       . ($args{where} ? " WHERE $args{where}" : '')
       . " $order_by "
       . " LIMIT 1"
       . " /*first lower boundary*/";
    MKDEBUG && _d('First lower boundary statement:', $first_lb_sql);
 
+   my $last_ub_sql
+      = "SELECT /*!40001 SQL_NO_CACHE */ "
+      . join(', ', map { $q->quote($_) } @{$index_cols})
+      . " $from "
+      . ($args{where} ? " WHERE $args{where}" : '')
+      . " ORDER BY " . join(' DESC, ', map {$q->quote($_)} @{$index_cols}) . " DESC "
+      . " LIMIT 1"
+      . " /*last upper boundary*/";
+   MKDEBUG && _d('Last upper boundary statement:', $last_ub_sql);
+
    # Nibbles are inclusive, so for a..z, the nibbles are: a-e, f-j, k-o, p-t,
    # u-y, and z. This complicates getting the next upper boundary because
    # if we use either (col >= lb AND col < ub) or (col > lb AND col <= ub)
@@ -84,8 +93,10 @@ sub new {
    # the upper boundary for the current nibble *and* the lower boundary
    # for the next nibble. See _next_boundaries().
    my $ub_sql
-      = $nibble_sql_preamble
-      . " WHERE (" . $asc->{boundaries}->{'>='} . ")" # lower boundary
+      = "SELECT /*!40001 SQL_NO_CACHE */ "
+      . join(', ', map { $q->quote($_) } @{$index_cols})
+      . " $from "
+      . " WHERE " . $asc->{boundaries}->{'>='} # lower boundary
       . ($args{where} ? " AND ($args{where})" : '')
       . " $order_by "
       . " LIMIT 2 OFFSET " . (($o->get('chunk-size') || 1) - 1)
@@ -93,23 +104,38 @@ sub new {
    MKDEBUG && _d('Next upper boundary statement:', $ub_sql);
 
    my $nibble_sql
-      = $nibble_sql_preamble
-      . " WHERE (" . $asc->{boundaries}->{'>='} . ")" # lower boundary
-      . " AND (" . $asc->{boundaries}->{'<='} . ")" # upper boundary
+      = "SELECT /*!40001 SQL_NO_CACHE */ "
+      . join(', ', map { $q->quote($_) } @{$asc->{cols}})
+      . " $from "
+      . " WHERE " . $asc->{boundaries}->{'>='} # lower boundary
+      . " AND " . $asc->{boundaries}->{'<='} # upper boundary
       . ($args{where} ? " AND ($args{where})" : '')
       . " $order_by"
       . " /*nibble*/";
    MKDEBUG && _d('Nibble statement:', $nibble_sql);
 
+   # If the chunk size is >= number of rows in table, then we don't
+   # need to chunk; we can just select all rows, in order, at once.
+   my $one_nibble_sql
+      = "SELECT /*!40001 SQL_NO_CACHE */ "
+      . join(', ', map { $q->quote($_) } @{$asc->{cols}})
+      . " $from "
+      . ($args{where} ? " WHERE $args{where}" : '')
+      . " $order_by"
+      . " /*one nibble*/";
+   MKDEBUG && _d('One nibble statement:', $one_nibble_sql);
+
    my $self = {
       %args,
-      asc          => $asc,
-      first_lb_sql => $first_lb_sql,
-      ub_sql       => $ub_sql,
-      nibble_sql   => $nibble_sql,
-      nibbleno     => 0,
-      have_rows    => 0,
-      rowno        => 0,
+      index          => $index,
+      first_lb_sql   => $first_lb_sql,
+      last_ub_sql    => $last_ub_sql,
+      ub_sql         => $ub_sql,
+      nibble_sql     => $nibble_sql,
+      one_nibble_sql => $one_nibble_sql,
+      nibbleno       => 0,
+      have_rows      => 0,
+      rowno          => 0,
    };
 
    return bless $self, $class;
@@ -121,9 +147,13 @@ sub next {
    # First call, init everything. This could be done in new(), but
    # all work is delayed until actually needed.
    if ($self->{nibbleno} == 0) {
+      $self->_can_nibble_once();
       $self->_prepare_sths();
-      $self->_get_first_lb();
+      $self->_get_bounds();
       # $self->_check_index_usage();
+      if ( my $callback = $self->{callbacks}->{init} ) {
+         $callback->();
+      }
    }
 
    # Return rows in nibble. sth->{Active} is always true with DBD::mysql v3,
@@ -135,42 +165,105 @@ sub next {
       if ( $row ) {
          $self->{rowno}++;
          MKDEBUG && _d('Row', $self->{rowno}, 'in nibble', $self->{nibbleno});
+         if ( my $callback = $self->{callbacks}->{before_row} ) {
+            $callback->();
+         }
          # fetchrow_arraryref re-uses its internal arrayref, so we must copy.
          return [ @$row ];
       }
-      MKDEBUG && _d('No more rowso in nibble', $self->{nibbleno});
+      MKDEBUG && _d('No more rows in nibble', $self->{nibbleno});
+      if ( my $callback = $self->{callbacks}->{after_nibble} ) {
+         $callback->();
+      }
       $self->{rowno} = 0;
       $self->{have_rows} = 0;
    }
 
    # If there's another boundary, fetch the rows within it.
    if ( $self->_next_boundaries() ) {
+      $self->{nibbleno}++;
       MKDEBUG && _d($self->{nibble_sth}->{Statement}, 'params:',
          join(', ', (@{$self->{lb}}, @{$self->{ub}})));
       $self->{nibble_sth}->execute(@{$self->{lb}}, @{$self->{ub}});
       $self->{have_rows} = $self->{nibble_sth}->rows();
       if ( $self->{have_rows} ) {
-         $self->{nibbleno}++;
          MKDEBUG && _d($self->{have_rows}, 'rows in nibble', $self->{nibbleno});
+         if ( my $callback = $self->{callbacks}->{before_nibble} ) {
+            $callback->();
+         }
          return $self->next();
       }
    }
 
    MKDEBUG && _d('Done nibbling');
+   if ( my $callback = $self->{callbacks}->{done} ) {
+      $callback->();
+   }
    return;
 }
 
+sub nibble_number {
+   my ($self) = @_;
+   return $self->{nibbleno};
+}
+
+sub number_of_rows {
+   my ($self) = @_;
+   return $self->{have_rows};
+}
+
+sub row_number {
+   my ($self) = @_;
+   return $self->{rowno};
+}
+
+
+sub _can_nibble_once {
+   my ($self) = @_;
+   my ($dbh, $tbl, $q) = @{$self}{qw(dbh tbl Quoter)};
+   my $table_status;
+   eval {
+      my $sql = "SHOW TABLE STATUS FROM " . $q->quote($tbl->{db})
+              . " LIKE " . $q->literal_like($tbl->{tbl});
+      MKDEBUG && _d($sql);
+      $table_status = $dbh->selectrow_hashref($sql);
+      MKDEBUG && _d('Table status:', Dumper($table_status));
+   };
+   if ( $EVAL_ERROR ) {
+      warn $EVAL_ERROR;
+      return 0;
+   }
+   my $n_rows = defined $table_status->{Rows} ? $table_status->{Rows}
+              : defined $table_status->{rows} ? $table_status->{rows}
+              :                                 0;
+   my $chunk_size = $self->{OptionParser}->get('chunk-size') || 1;
+   $self->{one_nibble} = $n_rows <= $chunk_size ? 1 : 0;
+   MKDEBUG && _d('One nibble:', $self->{one_nibble} ? 'yes' : 'no');
+   return $self->{one_nibble};
+}
+
 sub _prepare_sths {
    my ($self) = @_;
    MKDEBUG && _d('Preparing statement handles');
-   $self->{ub_sth} = $self->{dbh}->prepare($self->{ub_sql});
-   $self->{nibble_sth} = $self->{dbh}->prepare($self->{nibble_sql});
+   if ( $self->{one_nibble} ) {
+      $self->{nibble_sth} = $self->{dbh}->prepare($self->{one_nibble_sql});
+   }
+   else {
+      $self->{ub_sth} = $self->{dbh}->prepare($self->{ub_sql});
+      $self->{nibble_sth} = $self->{dbh}->prepare($self->{nibble_sql});
+   }
 }
 
-sub _get_first_lb {
+sub _get_bounds {
    my ($self) = @_;
+   return if $self->{one_nibble};
+
    $self->{next_lb} = $self->{dbh}->selectrow_arrayref($self->{first_lb_sql});
-   MKDEBUG && _d('First lower boundary:', Dumper($self->{lb}));
+   MKDEBUG && _d('First lower boundary:', Dumper($self->{next_lb}));
+
+   $self->{last_ub} = $self->{dbh}->selectrow_arrayref($self->{last_ub_sql});
+   MKDEBUG && _d('Last upper boundary:', Dumper($self->{last_ub}));
+
    return;
 }
 
@@ -178,40 +271,20 @@ sub _check_index_usage {
    my ($self) = @_;
    my ($dbh, $tbl, $q) = @{$self}{qw(dbh tbl Quoter)};
 
-   my $table_status;
+   my $explain;
    eval {
-      my $sql = "SHOW TABLE STATUS FROM " . $q->quote($tbl->{db})
-              . " LIKE " . $q->literal_like($tbl->{tbl});
-      MKDEBUG && _d($sql);
-      $table_status = $dbh->selectrow_hashref($sql);
+      $explain = $dbh->selectall_arrayref("", {Slice => {}});
    };
-   MKDEBUG && $EVAL_ERROR && _d($EVAL_ERROR);
-
-   my $small_table;
-   if ( $table_status ) {
-      my $n_rows = defined $table_status->{Rows} ? $table_status->{Rows}
-                 : defined $table_status->{rows} ? $table_status->{rows}
-                 :                                 undef;
-      $small_table = 1 if defined $n_rows && $n_rows <= 100;
+   if ( $EVAL_ERROR ) {
+      warn "Cannot check if MySQL is using the chunk index: $EVAL_ERROR";
+      return;
    }
-   MKDEBUG && _d('Small table:', $small_table);
-
-   if ( !$small_table ) {
-      my $explain;
-      eval {
-         $explain = $dbh->selectall_arrayref("", {Slice => {}});
-      };
-      if ( $EVAL_ERROR ) {
-         MKDEBUG && _d($EVAL_ERROR);
-         return;
-      }
-      MKDEBUG && _d('EXPLAIN key:', $explain->[0]->{key});
-      my $explain_index = lc($explain->[0]->{key} || '');
-      if ( $explain_index ne lc($self->{asc}->{index}) ) {
-         die "Cannot nibble table $tbl->{db}.$tbl->{tbl} because MySQL chose "
-            . ($explain_index ? "the `$explain_index`" : 'no') . ' index'
-            . " instead of the `$self->{asc}->{index}` index";
-      }
+   my $explain_index = lc($explain->[0]->{key} || '');
+   MKDEBUG && _d('EXPLAIN index:', $explain_index);
+   if ( $explain_index ne lc($self->{index} || '') ) {
+      die "Cannot nibble table $tbl->{db}.$tbl->{tbl} because MySQL chose "
+         . ($explain_index ? "the `$explain_index`" : 'no') . ' index'
+         . " instead of the chunk index `$self->{index}`";
    }
 
    return;
@@ -225,25 +298,37 @@ sub _next_boundaries {
       return;
    }
 
+   if ( $self->{one_nibble} ) {
+      $self->{lb} = $self->{ub} = [];
+      $self->{no_more_boundaries} = 1;  # for next call
+      return 1;
+   }
+
    $self->{lb} = $self->{next_lb};
 
    MKDEBUG && _d($self->{ub_sth}->{Statement}, 'params:',
       join(', ', @{$self->{lb}}));
    $self->{ub_sth}->execute(@{$self->{lb}});
    my $boundary = $self->{ub_sth}->fetchall_arrayref();
+   MKDEBUG && _d('Next boundary:', Dumper($boundary));
    if ( $boundary && @$boundary ) {
-      $self->{ub}      = $boundary->[0];  # this nibble
-      $self->{next_lb} = $boundary->[1];  # next nibble
-      $self->{ub_sth}->finish();
-      MKDEBUG && _d('Next upper boundary:', Dumper($self->{ub}));
+      $self->{ub} = $boundary->[0];  # this nibble
+      if ( $boundary->[1] ) {
+         $self->{next_lb} = $boundary->[1];  # next nibble
+      }
+      else {
+         $self->{no_more_boundaries} = 1;  # for next call
+         MKDEBUG && _d('Last upper boundary:', Dumper($boundary->[0]));
+      }
    }
    else {
       $self->{no_more_boundaries} = 1;  # for next call
-      $self->{ub} = $self->{lb};
+      $self->{ub} = $self->{last_ub};
       MKDEBUG && _d('Last upper boundary:', Dumper($self->{ub}));
    }
+   $self->{ub_sth}->finish();
 
-   return 1; # have boundaries
+   return 1; # have boundary
 }
 
 sub _d {
diff --git a/t/lib/NibbleIterator.t b/t/lib/NibbleIterator.t
index 025f21f9..2ea04141 100644
--- a/t/lib/NibbleIterator.t
+++ b/t/lib/NibbleIterator.t
@@ -38,10 +38,9 @@ if ( !$dbh ) {
    plan skip_all => 'Cannot connect to sandbox master';
 }
 else {
-   plan tests => 6;
+   plan tests => 12;
 }
 
-
 my $q = new Quoter();
 my $tp = new TableParser(Quoter=>$q);
 my $du = new MySQLDump();
@@ -79,14 +78,18 @@ sub make_nibble_iter {
    1 while $si->next_schema_object();
 
    my $ni = new NibbleIterator(
-      dbh => $dbh,
-      tbl => $schema->get_table($args{db}, $args{tbl}),
+      dbh       => $dbh,
+      tbl       => $schema->get_table($args{db}, $args{tbl}),
+      callbacks => $args{callbacks},
       %common_modules,
    );
 
    return $ni;
 }
 
+# ############################################################################
+# a-z w/ chunk-size 5, z is final boundary and single value
+# ############################################################################
 my $ni = make_nibble_iter(
    sql_file => "a-z.sql",
    db       => 'test',
@@ -155,6 +158,155 @@ is_deeply(
    'a-z nibble 6'
 ) or print Dumper(\@rows);
 
+# ############################################################################
+# a-y w/ chunk-size 5, even nibbles
+# ############################################################################
+$dbh->do('delete from test.t where c="z"');
+my $all_rows = $dbh->selectall_arrayref('select * from test.t order by c');
+$ni = make_nibble_iter(
+   db   => 'test',
+   tbl  => 't',
+   argv => [qw(--databases test --chunk-size 5)],
+);
+
+@rows = ();
+for (1..26) {
+   push @rows, $ni->next();
+}
+is_deeply(
+   \@rows,
+   $all_rows,
+   'a-y even nibble'
+) or print Dumper(\@rows);
+
+# ############################################################################
+# chunk-size exceeds number of rows, 1 nibble
+# ############################################################################
+$ni = make_nibble_iter(
+   db   => 'test',
+   tbl  => 't',
+   argv => [qw(--databases test --chunk-size 100)],
+);
+
+@rows = ();
+for (1..27) {
+   push @rows, $ni->next();
+}
+is_deeply(
+   \@rows,
+   $all_rows,
+   '1 nibble'
+) or print Dumper(\@rows);
+
+# ############################################################################
+# single row table
+# ############################################################################
+$dbh->do('delete from test.t where c != "d"');
+$ni = make_nibble_iter(
+   db   => 'test',
+   tbl  => 't',
+   argv => [qw(--databases test --chunk-size 100)],
+);
+
+@rows = ();
+for (1..3) {
+   push @rows, $ni->next();
+}
+is_deeply(
+   \@rows,
+   [['d']],
+   'single row table'
+) or print Dumper(\@rows);
+
+# ############################################################################
+# empty table
+# ############################################################################
+$dbh->do('truncate table test.t');
+$ni = make_nibble_iter(
+   db   => 'test',
+   tbl  => 't',
+   argv => [qw(--databases test --chunk-size 100)],
+);
+
+@rows = ();
+for (1..3) {
+   push @rows, $ni->next();
+}
+is_deeply(
+   \@rows,
+   [],
+   'empty table'
+) or print Dumper(\@rows);
+
+# ############################################################################
+# Callbacks
+# ############################################################################
+$ni = make_nibble_iter(
+   sql_file  => "a-z.sql",
+   db        => 'test',
+   tbl       => 't',
+   argv      => [qw(--databases test --chunk-size 2)],
+   callbacks => {
+      init          => sub { print "init\n" },
+      before_nibble => sub { print "before nibble ".$ni->nibble_number()."\n" },
+      before_row    => sub { print "before row\n" },
+      after_nibble  => sub { print "after nibble ".$ni->nibble_number()."\n" },
+      done          => sub { print "done\n" },
+   }
+);
+
+$dbh->do('delete from test.t limit 20');  # 6 rows left
+
+my $output = output(
+   sub {
+      for (1..8) { $ni->next() }
+   },
+);
+
+is(
+   $output,
+"init
+before nibble 1
+before row
+before row
+after nibble 1
+before nibble 2
+before row
+before row
+after nibble 2
+before nibble 3
+before row
+before row
+after nibble 3
+done
+done
+",
+   "callbacks"
+);
+
+
+# ############################################################################
+# Nibble a larger table by numeric pk id
+# ############################################################################
+SKIP: {
+   skip "Sakila database is not loaded", 1
+      unless @{ $dbh->selectall_arrayref('show databases like "sakila"') };
+
+   $ni = make_nibble_iter(
+      db   => 'sakila',
+      tbl  => 'payment',
+      argv => [qw(--databases sakila --tables payment --chunk-size 100)],
+   );
+
+   my $n_nibbles = 0;
+   $n_nibbles++ while $ni->next();
+   is(
+      $n_nibbles,
+      16049,
+      "Nibble sakila.payment (16049 rows)"
+   );
+}
+
 # #############################################################################
 # Done.
 # #############################################################################