Files
percona-toolkit/lib/TableChunker.pm
2025-01-10 23:07:03 +03:00

1426 lines
52 KiB
Perl

# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# TableChunker package
# ###########################################################################
{
# Package: TableChunker
# TableChunker helps determine how to "chunk" a table. Chunk are
# pre-determined ranges of rows defined by boundary values (sometimes also
# called endpoints) on numeric or numeric-like columns, including date/time
# types. Any numeric column type that MySQL can do positional comparisons
# (<, <=, >, >=) on works. Chunking on character data is not supported yet
# (but see <issue 568 at http://code.google.com/p/maatkit/issues/detail?id=568>).
#
# Usually chunks range over all rows in a table but sometimes they only
# range over a subset of rows if an optional where arg is passed to various
# subs. In either case a chunk is like "`col` >= 5 AND `col` < 10". If
# col is of type int and is unique, then that chunk ranges over up to 5 rows.
#
# Chunks are included in WHERE clauses by various tools to do work on discrete
# chunks of the table instead of trying to work on the entire table at once.
# Chunks do not overlap and their size is configurable via the chunk_size arg
# passed to several subs. The chunk_size can be a number of rows or a size
# like 1M, in which case it's in estimated bytes of data. Real chunk sizes
# are usually close to the requested chunk_size but unless the optional exact
# arg is assed the real chunk sizes are approximate. Sometimes the
# distribution of values on the chunk colun can skew chunking. If, for
# example, col has values 0, 100, 101, ... then the zero value skews chunking.
# The zero_chunk arg handles this.
package TableChunker;
use strict;
use utf8;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
use POSIX qw(floor ceil);
use List::Util qw(min max);
use Data::Dumper;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Quotekeys = 0;
# Sub: new
#
# Parameters:
# $class - TableChunker (automatic)
# %args - Arguments
#
# Required Arguments:
# Quoter - <Quoter> object
# TableParser - <TableParser> object
sub new {
my ( $class, %args ) = @_;
foreach my $arg ( qw(Quoter TableParser) ) {
die "I need a $arg argument" unless $args{$arg};
}
my %int_types = map { $_ => 1 } qw(bigint date datetime int mediumint smallint time timestamp tinyint year);
my %real_types = map { $_ => 1 } qw(decimal double float);
my $self = {
%args,
int_types => \%int_types,
real_types => \%real_types,
EPOCH => '1970-01-01',
};
return bless $self, $class;
}
# Sub: find_chunk_columns
# Find chunkable columns.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# table_struct - Hashref returned from <TableParser::parse()>
#
# Optional Arguments:
# exact - bool: Try to support exact chunk sizes (may still chunk fuzzily)
#
# Returns:
# Array: whether the table can be chunked exactly if requested (zero
# otherwise), arrayref of columns that support chunking. Example:
# (start code)
# 1,
# [
# { column => 'id', index => 'PRIMARY' },
# { column => 'i', index => 'i_idx' },
# ]
# (end code)
sub find_chunk_columns {
my ( $self, %args ) = @_;
foreach my $arg ( qw(tbl_struct) ) {
die "I need a $arg argument" unless $args{$arg};
}
my $tbl_struct = $args{tbl_struct};
# See if there's an index that will support chunking.
my @possible_indexes;
foreach my $index ( values %{ $tbl_struct->{keys} } ) {
# Accept only BTREE indexes.
next unless $index->{type} eq 'BTREE';
# Reject indexes with prefixed columns.
next if grep { defined } @{$index->{col_prefixes}};
# If exact, accept only unique, single-column indexes.
if ( $args{exact} ) {
next unless $index->{is_unique} && @{$index->{cols}} == 1;
}
push @possible_indexes, $index;
}
PTDEBUG && _d('Possible chunk indexes in order:',
join(', ', map { $_->{name} } @possible_indexes));
# Build list of candidate chunk columns.
my $can_chunk_exact = 0;
my @candidate_cols;
foreach my $index ( @possible_indexes ) {
my $col = $index->{cols}->[0];
# Accept only integer or real number type columns or character columns.
my $col_type = $tbl_struct->{type_for}->{$col};
next unless $self->{int_types}->{$col_type}
|| $self->{real_types}->{$col_type}
|| $col_type =~ m/char/;
# Save the candidate column and its index.
push @candidate_cols, { column => $col, index => $index->{name} };
}
$can_chunk_exact = 1 if $args{exact} && scalar @candidate_cols;
if ( PTDEBUG ) {
my $chunk_type = $args{exact} ? 'Exact' : 'Inexact';
_d($chunk_type, 'chunkable:',
join(', ', map { "$_->{column} on $_->{index}" } @candidate_cols));
}
# Order the candidates by their original column order.
# Put the PK's first column first, if it's a candidate.
my @result;
PTDEBUG && _d('Ordering columns by order in tbl, PK first');
if ( $tbl_struct->{keys}->{PRIMARY} ) {
my $pk_first_col = $tbl_struct->{keys}->{PRIMARY}->{cols}->[0];
@result = grep { $_->{column} eq $pk_first_col } @candidate_cols;
@candidate_cols = grep { $_->{column} ne $pk_first_col } @candidate_cols;
}
my $i = 0;
my %col_pos = map { $_ => $i++ } @{$tbl_struct->{cols}};
push @result, sort { $col_pos{$a->{column}} <=> $col_pos{$b->{column}} }
@candidate_cols;
if ( PTDEBUG ) {
_d('Chunkable columns:',
join(', ', map { "$_->{column} on $_->{index}" } @result));
_d('Can chunk exactly:', $can_chunk_exact);
}
return ($can_chunk_exact, @result);
}
# Sub: calculate_chunks
# Calculate chunks for the given range statistics. Args min, max and
# rows_in_range are returned from get_range_statistics() which is usually
# called before this sub. Min and max are expected to be valid values
# (NULL is valid).
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# dbh - dbh
# db - database name
# tbl - table name
# tbl_struct - retval of <TableParser::parse()>
# chunk_col - column name to chunk on
# min - min col value, from <TableChunker::get_range_statistics()>
# max - max col value, from <TableChunker::get_range_statistics()>
# rows_in_range - number of rows to chunk, from
# <TableChunker::get_range_statistics()>
# chunk_size - requested size of each chunk
#
# Optional Arguments:
# exact - Use exact chunk_size? Use approximates is not.
# tries - Fetch up to this many rows to find a non-zero value
# chunk_range - Make chunk range open (default) or openclosed
# where - WHERE clause.
#
# Returns:
# Array of WHERE predicates like "`col` >= '10' AND `col` < '20'",
# one for each chunk. All values are single-quoted due to <issue 1002 at
# http://code.google.com/p/maatkit/issues/detail?id=1002>. Example:
# (start code)
# `film_id` < '30',
# `film_id` >= '30' AND `film_id` < '60',
# `film_id` >= '60' AND `film_id` < '90',
# `film_id` >= '90',
# (end code)
sub calculate_chunks {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db tbl tbl_struct chunk_col rows_in_range chunk_size);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless defined $args{$arg};
}
PTDEBUG && _d('Calculate chunks for',
join(", ", map {"$_=".(defined $args{$_} ? $args{$_} : "undef")}
qw(db tbl chunk_col min max rows_in_range chunk_size zero_chunk exact)
));
if ( !$args{rows_in_range} ) {
PTDEBUG && _d("Empty table");
return '1=1';
}
# http://code.google.com/p/maatkit/issues/detail?id=1084
if ( $args{rows_in_range} < $args{chunk_size} ) {
PTDEBUG && _d("Chunk size larger than rows in range");
return '1=1';
}
my $q = $self->{Quoter};
my $dbh = $args{dbh};
my $chunk_col = $args{chunk_col};
my $tbl_struct = $args{tbl_struct};
my $col_type = $tbl_struct->{type_for}->{$chunk_col};
PTDEBUG && _d('chunk col type:', $col_type);
# Get chunker info for the column type. Numeric cols are chunked
# differently than char cols.
my %chunker;
if ( $tbl_struct->{is_numeric}->{$chunk_col} || $col_type =~ /date|time/ ) {
%chunker = $self->_chunk_numeric(%args);
}
elsif ( $col_type =~ m/char/ ) {
%chunker = $self->_chunk_char(%args);
}
else {
die "Cannot chunk $col_type columns";
}
PTDEBUG && _d("Chunker:", Dumper(\%chunker));
my ($col, $start_point, $end_point, $interval, $range_func)
= @chunker{qw(col start_point end_point interval range_func)};
# Generate a list of chunk boundaries. The first and last chunks are
# inclusive, and will catch any rows before or after the end of the
# supposed range. So 1-100 divided into chunks of 30 should actually end
# up with chunks like this:
# < 30
# >= 30 AND < 60
# >= 60 AND < 90
# >= 90
# If zero_chunk was specified and zero chunking was possible, the first
# chunk will be = 0 to catch any zero or zero-equivalent (e.g. 00:00:00)
# rows.
my @chunks;
if ( $start_point < $end_point ) {
# The zero chunk, if there is one. It doesn't have to be the first
# chunk. The 0 cannot be quoted because if d='0000-00-00' then
# d=0 will work but d='0' will cause warning 1292: Incorrect date
# value: '0' for column 'd'. This might have to column-specific in
# future when we chunk on more exotic column types.
push @chunks, "$col = 0" if $chunker{have_zero_chunk};
my ($beg, $end);
my $iter = 0;
for ( my $i = $start_point; $i < $end_point; $i += $interval ) {
($beg, $end) = $self->$range_func($dbh, $i, $interval, $end_point);
# The first chunk.
if ( $iter++ == 0 ) {
push @chunks,
($chunker{have_zero_chunk} ? "$col > 0 AND " : "")
."$col < " . $q->quote_val($end);
}
else {
# The normal case is a chunk in the middle of the range somewhere.
push @chunks, "$col >= " . $q->quote_val($beg) . " AND $col < " . $q->quote_val($end);
}
}
# Remove the last chunk and replace it with one that matches everything
# from the beginning of the last chunk to infinity, or to the max col
# value if chunk_range is openclosed. If the chunk column is nullable,
# do NULL separately.
my $chunk_range = lc($args{chunk_range} || 'open');
my $nullable = $args{tbl_struct}->{is_nullable}->{$args{chunk_col}};
pop @chunks;
if ( @chunks ) {
push @chunks, "$col >= " . $q->quote_val($beg)
. ($chunk_range eq 'openclosed'
? " AND $col <= " . $q->quote_val($args{max}) : "");
}
else {
push @chunks, $nullable ? "$col IS NOT NULL" : '1=1';
}
if ( $nullable ) {
push @chunks, "$col IS NULL";
}
}
else {
# There are no chunks; just do the whole table in one chunk.
PTDEBUG && _d('No chunks; using single chunk 1=1');
push @chunks, '1=1';
}
return @chunks;
}
# Sub: _chunk_numeric
# Determine how to chunk a numeric column.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# dbh - dbh
# db - database name
# tbl - table name
# tbl_struct - retval of <TableParser::parse()>
# chunk_col - column name to chunk on
# min - min col value, from <TableChunker::get_range_statistics()>
# max - max col value, from <TableChunker::get_range_statistics()>
# rows_in_range - number of rows to chunk, from
# <TableChunker::get_range_statistics()>
# chunk_size - requested size of each chunk
#
# Optional Arguments:
# exact - Use exact chunk_size? Use approximates is not.
# tries - Fetch up to this many rows to find a non-zero value
# zero_chunk - Add an extra chunk for zero values? (0, 00:00, etc.)
#
# Returns:
# Array of chunker info that <calculate_chunks()> uses to create
# chunks, like:
# (start code)
# col => quoted chunk column name
# start_point => start value (a Perl number)
# end_point => end value (a Perl number)
# interval => interval to walk from start_ to end_point (a Perl number)
# range_func => coderef to return a value while walking that ^ range
# have_zero_chunk => whether to include a zero chunk (col=0)
# (end code)
sub _chunk_numeric {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db tbl tbl_struct chunk_col rows_in_range chunk_size);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless defined $args{$arg};
}
my $q = $self->{Quoter};
my $db_tbl = $q->quote($args{db}, $args{tbl});
my $col_type = $args{tbl_struct}->{type_for}->{$args{chunk_col}};
# Convert the given MySQL values to (Perl) numbers using some MySQL function.
# E.g.: SELECT TIME_TO_SEC('12:34') == 45240.
my $range_func;
if ( $col_type =~ m/(?:int|year|float|double|decimal)$/ ) {
$range_func = 'range_num';
}
elsif ( $col_type =~ m/^(?:timestamp|date|time)$/ ) {
$range_func = "range_$col_type";
}
elsif ( $col_type eq 'datetime' ) {
$range_func = 'range_datetime';
}
my ($start_point, $end_point);
eval {
$start_point = $self->value_to_number(
value => $args{min},
column_type => $col_type,
dbh => $args{dbh},
);
$end_point = $self->value_to_number(
value => $args{max},
column_type => $col_type,
dbh => $args{dbh},
);
};
if ( $EVAL_ERROR ) {
if ( $EVAL_ERROR =~ m/don't know how to chunk/ ) {
# Special kind of error doesn't make sense with the more verbose
# description below.
die $EVAL_ERROR;
}
else {
die "Error calculating chunk start and end points for table "
. "`$args{tbl_struct}->{name}` on column `$args{chunk_col}` "
. "with min/max values "
. join('/',
map { defined $args{$_} ? $args{$_} : 'undef' } qw(min max))
. ":\n\n"
. $EVAL_ERROR
. "\nVerify that the min and max values are valid for the column. "
. "If they are valid, this error could be caused by a bug in the "
. "tool.";
}
}
# The end points might be NULL in the pathological case that the table
# has nothing but NULL values. If there's at least one non-NULL value
# then MIN() and MAX() will return it. Otherwise, the only thing to do
# is make NULL end points zero to make the code below work and any NULL
# values will be handled by the special "IS NULL" chunk.
if ( !defined $start_point ) {
PTDEBUG && _d('Start point is undefined');
$start_point = 0;
}
if ( !defined $end_point || $end_point < $start_point ) {
PTDEBUG && _d('End point is undefined or before start point');
$end_point = 0;
}
PTDEBUG && _d("Actual chunk range:", $start_point, "to", $end_point);
# Determine if we can include a zero chunk (col = 0). If yes, then
# make sure the start point is non-zero. $start_point and $end_point
# should be numbers (converted from MySQL values earlier). The purpose
# of the zero chunk is to capture a potentially large number of zero
# values that might imbalance the size of the first chunk. E.g. if a
# lot of invalid times were inserted and stored as 00:00:00, these
# zero (equivalent) values are captured by the zero chunk instead of
# the first chunk + all the non-zero values in the first chunk.
my $have_zero_chunk = 0;
if ( $args{zero_chunk} ) {
if ( $start_point != $end_point && $start_point >= 0 ) {
PTDEBUG && _d('Zero chunking');
my $nonzero_val = $self->get_nonzero_value(
%args,
db_tbl => $db_tbl,
col => $args{chunk_col},
col_type => $col_type,
val => $args{min}
);
# Since we called value_to_number() before with this column type
# we shouldn't have to worry about it dying here--it would have
# died earlier if we can't chunk the column type.
$start_point = $self->value_to_number(
value => $nonzero_val,
column_type => $col_type,
dbh => $args{dbh},
);
$have_zero_chunk = 1;
}
else {
PTDEBUG && _d("Cannot zero chunk");
}
}
PTDEBUG && _d("Using chunk range:", $start_point, "to", $end_point);
# Calculate the chunk size in terms of "distance between endpoints"
# that will give approximately the right number of rows between the
# endpoints. If possible and requested, forbid chunks from being any
# bigger than specified.
my $interval = $args{chunk_size}
* ($end_point - $start_point)
/ $args{rows_in_range};
if ( $self->{int_types}->{$col_type} ) {
$interval = ceil($interval);
}
$interval ||= $args{chunk_size};
if ( $args{exact} ) {
$interval = $args{chunk_size};
}
PTDEBUG && _d('Chunk interval:', $interval, 'units');
return (
col => $q->quote($args{chunk_col}),
start_point => $start_point,
end_point => $end_point,
interval => $interval,
range_func => $range_func,
have_zero_chunk => $have_zero_chunk,
);
}
# Sub: _chunk_numeric
# Determine how to chunk a character column.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# dbh - dbh
# db - database name
# tbl - table name
# tbl_struct - retval of <TableParser::parse()>
# chunk_col - column name to chunk on
# min - min col value, from <TableChunker::get_range_statistics()>
# max - max col value, from <TableChunker::get_range_statistics()>
# rows_in_range - number of rows to chunk, from
# <TableChunker::get_range_statistics()>
# chunk_size - requested size of each chunk
#
# Optional Arguments:
# where - WHERE clause.
#
# Returns:
# Array of chunker info that <calculate_chunks()> uses to create
# chunks, like:
# (start code)
# col => quoted chunk column name
# start_point => start value (a Perl number)
# end_point => end value (a Perl number)
# interval => interval to walk from start_ to end_point (a Perl number)
# range_func => coderef to return a value while walking that ^ range
# (end code)
sub _chunk_char {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db tbl tbl_struct chunk_col min max rows_in_range chunk_size);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless defined $args{$arg};
}
my $q = $self->{Quoter};
my $db_tbl = $q->quote($args{db}, $args{tbl});
my $dbh = $args{dbh};
my $chunk_col = $args{chunk_col};
my $qchunk_col = $q->quote($args{chunk_col});
my $row;
my $sql;
# Get the character codes between the min and max column values.
my ($min_col, $max_col) = @{args}{qw(min max)};
$sql = "SELECT ORD(?) AS min_col_ord, ORD(?) AS max_col_ord";
PTDEBUG && _d($dbh, $sql);
my $ord_sth = $dbh->prepare($sql); # avoid quoting issues
$ord_sth->execute($min_col, $max_col);
$row = $ord_sth->fetchrow_arrayref();
my ($min_col_ord, $max_col_ord) = ($row->[0], $row->[1]);
PTDEBUG && _d("Min/max col char code:", $min_col_ord, $max_col_ord);
# Create a sorted chacater-to-number map of the unique characters in
# the column ranging from the min character code to the max.
my $base;
my @chars;
PTDEBUG && _d("Table charset:", $args{tbl_struct}->{charset});
if ( ($args{tbl_struct}->{charset} || "") eq "latin1" ) {
# These are the unique, sorted latin1 character codes according to
# MySQL. You'll notice that many are missing. That's because MySQL
# treats many characters as the same, for example "e" and "é".
my @sorted_latin1_chars = (
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95, 96, 123, 124, 125, 126, 161,
162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
190, 191, 215, 216, 222, 223, 247, 255);
my ($first_char, $last_char);
for my $i ( 0..$#sorted_latin1_chars ) {
$first_char = $i and last if $sorted_latin1_chars[$i] >= $min_col_ord;
}
for my $i ( $first_char..$#sorted_latin1_chars ) {
$last_char = $i and last if $sorted_latin1_chars[$i] >= $max_col_ord;
};
@chars = map { chr $_; } @sorted_latin1_chars[$first_char..$last_char];
$base = scalar @chars;
}
else {
# If the table's charset isn't latin1, who knows what charset is being
# used, what characters it contains, and how those characters are sorted.
# So we create a character map and let MySQL tell us these things.
# Create a temp table with the same char col def as the original table.
my $tmp_tbl = '__maatkit_char_chunking_map';
my $tmp_db_tbl = $q->quote($args{db}, $tmp_tbl);
$sql = "DROP TABLE IF EXISTS $tmp_db_tbl";
PTDEBUG && _d($dbh, $sql);
$dbh->do($sql);
my $col_def = $args{tbl_struct}->{defs}->{$chunk_col};
$sql = "CREATE TEMPORARY TABLE $tmp_db_tbl ($col_def) "
. "ENGINE=MEMORY DEFAULT CHARSET = utf8";
PTDEBUG && _d($dbh, $sql);
$dbh->do($sql);
# Populate the temp table with all the characters between the min and max
# max character codes. This is our character-to-number map.
$sql = "INSERT INTO $tmp_db_tbl VALUES (CHAR(?))";
PTDEBUG && _d($dbh, $sql);
my $ins_char_sth = $dbh->prepare($sql); # avoid quoting issues
for my $char_code ( $min_col_ord..$max_col_ord ) {
# Ignore errors on invalid chars for UTF8
eval { $ins_char_sth->execute($char_code) };
}
# Select from the char-to-number map all characters between the
# min and max col values, letting MySQL order them. The first
# character returned becomes "zero" in a new base system of counting,
# the second character becomes "one", etc. So if 42 chars are
# returned like [a, B, c, d, é, ..., ü] then we have a base 42
# system where 0=a, 1=B, 2=c, 3=d, 4=é, ... 41=ü. count_base()
# helps us count in arbitrary systems.
$sql = "SELECT $qchunk_col FROM $tmp_db_tbl "
. "WHERE $qchunk_col BETWEEN ? AND ? "
. "ORDER BY $qchunk_col";
PTDEBUG && _d($dbh, $sql);
my $sel_char_sth = $dbh->prepare($sql);
$sel_char_sth->execute($min_col, $max_col);
@chars = map { $_->[0] } @{ $sel_char_sth->fetchall_arrayref() };
$base = scalar @chars;
$sql = "DROP TABLE $tmp_db_tbl";
PTDEBUG && _d($dbh, $sql);
$dbh->do($sql);
}
PTDEBUG && _d("Base", $base, "chars:", @chars);
# See https://bugs.launchpad.net/percona-toolkit/+bug/1034717
die "Cannot chunk table $db_tbl using the character column "
. "$chunk_col, most likely because all values start with the "
. "same character. This table must be synced separately by "
. "specifying a list of --algorithms without the Chunk algorithm"
if $base == 1;
# Now we begin calculating how to chunk the char column. This is
# completely different from _chunk_numeric because we're not dealing
# with the values to chunk directly (the characters) but rather a map.
# In our base system, how many values can 1, 2, etc. characters express?
# E.g. in a base 26 system (a-z), 1 char expresses 26^1=26 chars (a-z),
# 2 chars expresses 26^2=676 chars. If the requested chunk size is 100,
# then 1 char might not express enough values, but 2 surely can. This
# is imperefect because we don't know if we have data like: [apple, boy,
# car] (i.e. values evenly distributed across the range of chars), or
# [ant, apple, azur, boy]. We assume data is more evenly distributed
# than not so we use the minimum number of characters to express a chunk
# size.
$sql = "SELECT MAX(LENGTH($qchunk_col)) FROM $db_tbl "
. ($args{where} ? "WHERE $args{where} " : "")
. "ORDER BY $qchunk_col";
PTDEBUG && _d($dbh, $sql);
$row = $dbh->selectrow_arrayref($sql);
my $max_col_len = $row->[0];
PTDEBUG && _d("Max column value:", $max_col, $max_col_len);
my $n_values;
for my $n_chars ( 1..$max_col_len ) {
$n_values = $base**$n_chars;
if ( $n_values >= $args{chunk_size} ) {
PTDEBUG && _d($n_chars, "chars in base", $base, "expresses",
$n_values, "values");
last;
}
}
# Our interval is not like a _chunk_numeric() interval, either, because
# we don't increment in the actual values (i.e. the characters) but rather
# in the char-to-number map. If the above calculation found that 1 char
# expressed enough values for 1 chunk, then each char in the map will
# yield roughly one chunk of values, so the interval is 1. Or, if we need
# 2 chars to express enough vals for 1 chunk, then we'll increment through
# the map 2 chars at a time, like [a, b], [c, d], etc.
my $n_chunks = $args{rows_in_range} / $args{chunk_size};
my $interval = floor(($n_values+0.00001) / $n_chunks) || 1;
my $range_func = sub {
my ( $self, $dbh, $start, $interval, $max ) = @_;
my $start_char = $self->base_count(
count_to => $start,
base => $base,
symbols => \@chars,
);
my $end_char = $self->base_count(
count_to => min($max, $start + $interval),
base => $base,
symbols => \@chars,
);
return $start_char, $end_char;
};
return (
col => $qchunk_col,
start_point => 0,
end_point => $n_values,
interval => $interval,
range_func => $range_func,
);
}
# Sub: get_first_chunkable_column
# Get the first chunkable column in a table.
# Only a "sane" column/index is returned. That means that
# the first auto-detected chunk col/index are used if any combination of
# preferred chunk col or index would be really bad, like chunk col=x
# and chunk index=some index over (y, z). That's bad because the index
# doesn't include the column; it would also be bad if the column wasn't
# a left-most prefix of the index.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# tbl_struct - Hashref returned by <TableParser::parse()>
#
# Optional arguments:
# chunk_column - Preferred chunkable column name
# chunk_index - Preferred chunkable column index name
# exact - bool: passed to <find_chunk_columns()>
#
# Returns:
# List: chunkable column name, chunkable column index
sub get_first_chunkable_column {
my ( $self, %args ) = @_;
foreach my $arg ( qw(tbl_struct) ) {
die "I need a $arg argument" unless $args{$arg};
}
# First auto-detected chunk col/index. If any combination of preferred
# chunk col or index are specified and are sane, they will overwrite
# these defaults. Else, these defaults will be returned.
my ($exact, @cols) = $self->find_chunk_columns(%args);
my $col = $cols[0]->{column};
my $idx = $cols[0]->{index};
# Wanted/preferred chunk column and index. Caller only gets what
# they want, though, if it results in a sane col/index pair.
my $wanted_col = $args{chunk_column};
my $wanted_idx = $args{chunk_index};
PTDEBUG && _d("Preferred chunk col/idx:", $wanted_col, $wanted_idx);
if ( $wanted_col && $wanted_idx ) {
# Preferred column and index: check that the pair is sane.
foreach my $chunkable_col ( @cols ) {
if ( $wanted_col eq $chunkable_col->{column}
&& $wanted_idx eq $chunkable_col->{index} ) {
# The wanted column is chunkable with the wanted index.
$col = $wanted_col;
$idx = $wanted_idx;
last;
}
}
}
elsif ( $wanted_col ) {
# Preferred column, no index: check if column is chunkable, if yes
# then use its index, else fall back to default col/index.
foreach my $chunkable_col ( @cols ) {
if ( $wanted_col eq $chunkable_col->{column} ) {
# The wanted column is chunkable, so use its index and overwrite
# the defaults.
$col = $wanted_col;
$idx = $chunkable_col->{index};
last;
}
}
}
elsif ( $wanted_idx ) {
# Preferred index, no column: check if index's left-most column is
# chunkable, if yes then use its column, else fall back to auto-detected
# col/index.
foreach my $chunkable_col ( @cols ) {
if ( $wanted_idx eq $chunkable_col->{index} ) {
# The wanted index has a chunkable column, so use it and overwrite
# the defaults.
$col = $chunkable_col->{column};
$idx = $wanted_idx;
last;
}
}
}
PTDEBUG && _d('First chunkable col/index:', $col, $idx);
return $col, $idx;
}
# Sub: size_to_rows
# Convert a size in rows or bytes to a number of rows in the table,
# using SHOW TABLE STATUS. If the size is a string with a suffix of M/G/k,
# interpret it as mebibytes, gibibytes, or kibibytes respectively.
# If it's just a number, treat it as a number of rows and return right away.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# dbh - dbh
# db - Database name
# tbl - Table name
# chunk_size - Chunk size string like "1000" or "50M"
#
# Returns:
# Array: number of rows, average row size
sub size_to_rows {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db tbl chunk_size);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $db, $tbl, $chunk_size) = @args{@required_args};
my $q = $self->{Quoter};
my $tp = $self->{TableParser};
my ($n_rows, $avg_row_length);
my ( $num, $suffix ) = $chunk_size =~ m/^(\d+)([MGk])?$/;
if ( $suffix ) { # Convert to bytes.
$chunk_size = $suffix eq 'k' ? $num * 1_024
: $suffix eq 'M' ? $num * 1_024 * 1_024
: $num * 1_024 * 1_024 * 1_024;
}
elsif ( $num ) {
$n_rows = $num;
}
else {
die "Invalid chunk size $chunk_size; must be an integer "
. "with optional suffix kMG";
}
if ( $suffix || $args{avg_row_length} ) {
my ($status) = $tp->get_table_status($dbh, $db, $tbl);
$avg_row_length = $status->{avg_row_length};
if ( !defined $n_rows ) {
$n_rows = $avg_row_length ? ceil($chunk_size / $avg_row_length) : undef;
}
}
return $n_rows, $avg_row_length;
}
# Sub: get_range_statistics
# Determine the range of values for the chunk_col column on this table.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# dbh - dbh
# db - Database name
# tbl - Table name
# chunk_col - Chunk column name
# tbl_struct - Hashref returned by <TableParser::parse()>
#
# Optional arguments:
# where - WHERE clause without "WHERE" to restrict range
# index_hint - "FORCE INDEX (...)" clause
# tries - Fetch up to this many rows to find a valid value
#
# Returns:
# Array: min row value, max row value, rows in range
sub get_range_statistics {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db tbl chunk_col tbl_struct);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $db, $tbl, $col) = @args{@required_args};
my $where = $args{where};
my $q = $self->{Quoter};
my $col_type = $args{tbl_struct}->{type_for}->{$col};
my $col_is_numeric = $args{tbl_struct}->{is_numeric}->{$col};
# Quote these once so we don't have to do it again.
my $db_tbl = $q->quote($db, $tbl);
$col = $q->quote($col);
my ($min, $max);
eval {
# First get the actual end points, whatever MySQL considers the
# min and max values to be for this column.
my $sql = "SELECT MIN($col), MAX($col) FROM $db_tbl"
. ($args{index_hint} ? " $args{index_hint}" : "")
. ($where ? " WHERE ($where)" : '');
PTDEBUG && _d($dbh, $sql);
($min, $max) = $dbh->selectrow_array($sql);
PTDEBUG && _d("Actual end points:", $min, $max);
# Now, for two reasons, get the valid end points. For one, an
# end point may be 0 or some zero-equivalent and the user doesn't
# want that because it skews the range. Or two, an end point may
# be an invalid value like date 2010-00-00 and we can't use that.
($min, $max) = $self->get_valid_end_points(
%args,
dbh => $dbh,
db_tbl => $db_tbl,
col => $col,
col_type => $col_type,
min => $min,
max => $max,
);
PTDEBUG && _d("Valid end points:", $min, $max);
};
if ( $EVAL_ERROR ) {
die "Error getting min and max values for table $db_tbl "
. "on column $col: $EVAL_ERROR";
}
# Finally get the total number of rows in range, usually the whole
# table unless there's a where arg restricting the range.
my $sql = "EXPLAIN SELECT * FROM $db_tbl"
. ($args{index_hint} ? " $args{index_hint}" : "")
. ($where ? " WHERE $where" : '');
PTDEBUG && _d($sql);
my $expl = $dbh->selectrow_hashref($sql);
return (
min => $min,
max => $max,
rows_in_range => $expl->{rows},
);
}
# Sub: inject_chunks
# Create a SQL statement from a query prototype by filling in placeholders.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# database - Database name
# table - Table name
# chunks - Arrayref of chunks from <calculate_chunks()>
# chunk_num - Index into chunks to use
# query - Query prototype returned by
# <TableChecksum::make_checksum_query()>
#
# Optional Arguments:
# index_hint - "FORCE INDEX (...)" clause
# where - Arrayref of WHERE clauses joined with AND
#
# Returns:
# A SQL statement
sub inject_chunks {
my ( $self, %args ) = @_;
foreach my $arg ( qw(database table chunks chunk_num query) ) {
die "I need a $arg argument" unless defined $args{$arg};
}
PTDEBUG && _d('Injecting chunk', $args{chunk_num});
my $query = $args{query};
my $comment = sprintf("/*%s.%s:%d/%d*/",
$args{database}, $args{table},
$args{chunk_num} + 1, scalar @{$args{chunks}});
$query =~ s!/\*PROGRESS_COMMENT\*/!$comment!;
my $where = "WHERE (" . $args{chunks}->[$args{chunk_num}] . ')';
if ( $args{where} && grep { $_ } @{$args{where}} ) {
$where .= " AND ("
. join(" AND ", map { "($_)" } grep { $_ } @{$args{where}} )
. ")";
}
my $db_tbl = $self->{Quoter}->quote(@args{qw(database table)});
my $index_hint = $args{index_hint} || '';
PTDEBUG && _d('Parameters:',
Dumper({WHERE => $where, DB_TBL => $db_tbl, INDEX_HINT => $index_hint}));
$query =~ s!/\*WHERE\*/! $where!;
$query =~ s!/\*DB_TBL\*/!$db_tbl!;
$query =~ s!/\*INDEX_HINT\*/! $index_hint!;
$query =~ s!/\*CHUNK_NUM\*/! $args{chunk_num} AS chunk_num,!;
return $query;
}
# #############################################################################
# MySQL value to Perl number conversion.
# #############################################################################
# Convert a MySQL column value to a Perl integer.
# Arguments:
# * value scalar: MySQL value to convert
# * column_type scalar: MySQL column type of the value
# * dbh dbh
# Returns an integer or undef if the value isn't convertible
# (e.g. date 0000-00-00 is not convertible).
sub value_to_number {
my ( $self, %args ) = @_;
my @required_args = qw(column_type dbh);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless defined $args{$arg};
}
my $val = $args{value};
my ($col_type, $dbh) = @args{@required_args};
PTDEBUG && _d('Converting MySQL', $col_type, $val);
return unless defined $val; # value is NULL
# MySQL functions to convert a non-numeric value to a number
# so we can do basic math on it in Perl. Right now we just
# convert temporal values but later we'll need to convert char
# and hex values.
my %mysql_conv_func_for = (
timestamp => 'UNIX_TIMESTAMP',
date => 'TO_DAYS',
time => 'TIME_TO_SEC',
datetime => 'TO_DAYS',
);
# Convert the value to a number that Perl can do arithmetic with.
my $num;
if ( $col_type =~ m/(?:int|year|float|double|decimal)$/ ) {
# These types are already numbers.
$num = $val;
}
elsif ( $col_type =~ m/^(?:timestamp|date|time)$/ ) {
# These are temporal values. Convert them using a MySQL func.
my $func = $mysql_conv_func_for{$col_type};
my $sql = "SELECT $func(?)";
PTDEBUG && _d($dbh, $sql, $val);
my $sth = $dbh->prepare($sql);
$sth->execute($val);
($num) = $sth->fetchrow_array();
}
elsif ( $col_type eq 'datetime' ) {
# This type is temporal, too, but needs special handling.
# Newer versions of MySQL could use TIMESTAMPDIFF, but it's easier
# to maintain just one kind of code, so I do it all with DATE_ADD().
$num = $self->timestampdiff($dbh, $val);
}
else {
die "I don't know how to chunk $col_type\n";
}
PTDEBUG && _d('Converts to', $num);
return $num;
}
# ###########################################################################
# Range functions.
# ###########################################################################
sub range_num {
my ( $self, $dbh, $start, $interval, $max ) = @_;
my $end = min($max, $start + $interval);
# "Remove" scientific notation so the regex below does not make
# 6.123456e+18 into 6.12345.
$start = sprintf('%.17f', $start) if $start =~ /e/;
$end = sprintf('%.17f', $end) if $end =~ /e/;
# Trim decimal places, if needed. This helps avoid issues with float
# precision differing on different platforms.
$start =~ s/\.(\d{5}).*$/.$1/;
$end =~ s/\.(\d{5}).*$/.$1/;
if ( $end > $start ) {
return ( $start, $end );
}
else {
die "Chunk size is too small: $end !> $start\n";
}
}
sub range_time {
my ( $self, $dbh, $start, $interval, $max ) = @_;
my $sql = "SELECT SEC_TO_TIME($start), SEC_TO_TIME(LEAST($max, $start + $interval))";
PTDEBUG && _d($sql);
return $dbh->selectrow_array($sql);
}
sub range_date {
my ( $self, $dbh, $start, $interval, $max ) = @_;
my $sql = "SELECT FROM_DAYS($start), FROM_DAYS(LEAST($max, $start + $interval))";
PTDEBUG && _d($sql);
return $dbh->selectrow_array($sql);
}
sub range_datetime {
my ( $self, $dbh, $start, $interval, $max ) = @_;
my $sql = "SELECT DATE_ADD('$self->{EPOCH}', INTERVAL $start SECOND), "
. "DATE_ADD('$self->{EPOCH}', INTERVAL LEAST($max, $start + $interval) SECOND)";
PTDEBUG && _d($sql);
return $dbh->selectrow_array($sql);
}
sub range_timestamp {
my ( $self, $dbh, $start, $interval, $max ) = @_;
my $sql = "SELECT FROM_UNIXTIME($start), FROM_UNIXTIME(LEAST($max, $start + $interval))";
PTDEBUG && _d($sql);
return $dbh->selectrow_array($sql);
}
# Returns the number of seconds between EPOCH and the value, according to
# the MySQL server. (The server can do no wrong). I believe this code is right
# after looking at the source of sql/time.cc but I am paranoid and add in an
# extra check just to make sure. Earlier versions overflow on large interval
# values, such as on 3.23.58, '1970-01-01' - interval 58000000000 second is
# 2037-06-25 11:29:04. I know of no workaround. TO_DAYS('0000-....') is NULL,
# so we treat it as 0.
sub timestampdiff {
my ( $self, $dbh, $time ) = @_;
my $sql = "SELECT (COALESCE(TO_DAYS('$time'), 0) * 86400 + TIME_TO_SEC('$time')) "
. "- TO_DAYS('$self->{EPOCH} 00:00:00') * 86400";
PTDEBUG && _d($sql);
my ( $diff ) = $dbh->selectrow_array($sql);
$sql = "SELECT DATE_ADD('$self->{EPOCH}', INTERVAL $diff SECOND)";
PTDEBUG && _d($sql);
my ( $check ) = $dbh->selectrow_array($sql);
die <<" EOF"
Incorrect datetime math: given $time, calculated $diff but checked to $check.
This could be due to a version of MySQL that overflows on large interval
values to DATE_ADD(), or the given datetime is not a valid date. If not,
please report this as a bug.
EOF
unless $check eq $time;
return $diff;
}
# #############################################################################
# End point validation.
# #############################################################################
# These sub require val (or min and max) args which usually aren't NULL
# but could be zero so the usual "die ... unless $args{$arg}" check does
# not work.
# Returns valid min and max values. A valid val evaluates to a non-NULL value.
# Arguments:
# * dbh dbh
# * db_tbl scalar: quoted `db`.`tbl`
# * col scalar: quoted `column`
# * col_type scalar: column type of the value
# * min scalar: any scalar value
# * max scalar: any scalar value
# Optional arguments:
# * index_hint scalar: "FORCE INDEX (...)" hint
# * where scalar: WHERE clause without "WHERE"
# * tries scalar: fetch up to this many rows to find a valid value
# * zero_chunk bool: do a separate chunk for zero values
# Some column types can store invalid values, like most of the temporal
# types. When evaluated, invalid values return NULL. If the value is
# NULL to begin with, then it is not invalid because NULL is valid.
# For example, TO_DAYS('2009-00-00') evaluates to NULL because that date
# is invalid, even though it's storable.
sub get_valid_end_points {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db_tbl col col_type);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $db_tbl, $col, $col_type) = @args{@required_args};
my ($real_min, $real_max) = @args{qw(min max)};
# Common error message format in case there's a problem with
# finding a valid min or max value.
my $err_fmt = "Error finding a valid %s value for table $db_tbl on "
. "column $col. The real %s value %s is invalid and "
. "no other valid values were found. Verify that the table "
. "has at least one valid value for this column"
. ($args{where} ? " where $args{where}." : ".");
# Validate min value if it's not NULL. NULL is valid.
my $valid_min = $real_min;
if ( defined $valid_min ) {
# Get a valid min end point.
PTDEBUG && _d("Validating min end point:", $real_min);
$valid_min = $self->_get_valid_end_point(
%args,
val => $real_min,
endpoint => 'min',
);
die sprintf($err_fmt, 'minimum', 'minimum',
(defined $real_min ? $real_min : "NULL"))
unless defined $valid_min;
}
# Validate max value if it's not NULL. NULL is valid.
my $valid_max = $real_max;
if ( defined $valid_max ) {
# Get a valid max end point. So far I've not found a case where
# the actual max val is invalid, but check anyway just in case.
PTDEBUG && _d("Validating max end point:", $real_min);
$valid_max = $self->_get_valid_end_point(
%args,
val => $real_max,
endpoint => 'max',
);
die sprintf($err_fmt, 'maximum', 'maximum',
(defined $real_max ? $real_max : "NULL"))
unless defined $valid_max;
}
return $valid_min, $valid_max;
}
# Does the actual work for get_valid_end_points() for each end point.
sub _get_valid_end_point {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db_tbl col col_type);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $db_tbl, $col, $col_type) = @args{@required_args};
my $val = $args{val};
# NULL is valid.
return $val unless defined $val;
# Right now we only validate temporal types, but when we begin
# chunking char and hex columns we'll need to validate those.
# E.g. HEX('abcdefg') is invalid and we'll probably find some
# combination of char val + charset/collation that's invalid.
my $validate = $col_type =~ m/time|date/ ? \&_validate_temporal_value
: undef;
# If we cannot validate the value, assume it's valid.
if ( !$validate ) {
PTDEBUG && _d("No validator for", $col_type, "values");
return $val;
}
# Return the value if it's already valid.
return $val if defined $validate->($dbh, $val);
# The value is not valid so find the first one in the table that is.
PTDEBUG && _d("Value is invalid, getting first valid value");
$val = $self->get_first_valid_value(
%args,
val => $val,
validate => $validate,
);
return $val;
}
# Arguments:
# * dbh dbh
# * db_tbl scalar: quoted `db`.`tbl`
# * col scalar: quoted `column` name
# * val scalar: the current value, may be real, maybe not
# * validate coderef: returns a defined value if the given value is valid
# * endpoint scalar: "min" or "max", i.e. find first endpoint() real val
# Optional arguments:
# * tries scalar: fetch up to this many rows to find a valid value
# * index_hint scalar: "FORCE INDEX (...)" hint
# * where scalar: WHERE clause without "WHERE"
# Returns the first column value from the given db_tbl that does *not*
# evaluate to NULL. This is used mostly to eliminate unreal temporal
# values which MySQL allows to be stored, like "2010-00-00". Returns
# undef if no real value is found.
sub get_first_valid_value {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db_tbl col validate endpoint);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $db_tbl, $col, $validate, $endpoint) = @args{@required_args};
my $tries = defined $args{tries} ? $args{tries} : 5;
my $val = $args{val};
# NULL values are valid and shouldn't be passed to us.
return unless defined $val;
# Prep a sth for fetching the next col val.
my $cmp = $endpoint =~ m/min/i ? '>'
: $endpoint =~ m/max/i ? '<'
: die "Invalid endpoint arg: $endpoint";
my $sql = "SELECT $col FROM $db_tbl "
. ($args{index_hint} ? "$args{index_hint} " : "")
. "WHERE $col $cmp ? AND $col IS NOT NULL "
. ($args{where} ? "AND ($args{where}) " : "")
. "ORDER BY $col LIMIT 1";
PTDEBUG && _d($dbh, $sql);
my $sth = $dbh->prepare($sql);
# Fetch the next col val from the db.tbl until we find a valid one
# or run out of rows. Only try a limited number of next rows.
my $last_val = $val;
while ( $tries-- ) {
$sth->execute($last_val);
my ($next_val) = $sth->fetchrow_array();
PTDEBUG && _d('Next value:', $next_val, '; tries left:', $tries);
if ( !defined $next_val ) {
PTDEBUG && _d('No more rows in table');
last;
}
if ( defined $validate->($dbh, $next_val) ) {
PTDEBUG && _d('First valid value:', $next_val);
$sth->finish();
return $next_val;
}
$last_val = $next_val;
}
$sth->finish();
$val = undef; # no valid value found
return $val;
}
# Evaluates any temporal value, returns NULL if it's invalid, else returns
# a value (possibly zero). It's magical but tested. See also,
# http://hackmysql.com/blog/2010/05/26/detecting-invalid-and-zero-temporal-values/
sub _validate_temporal_value {
my ( $dbh, $val ) = @_;
my $sql = "SELECT IF(TIME_FORMAT(?,'%H:%i:%s')=?, TIME_TO_SEC(?), TO_DAYS(?))";
my $res;
eval {
PTDEBUG && _d($dbh, $sql, $val);
my $sth = $dbh->prepare($sql);
$sth->execute($val, $val, $val, $val);
($res) = $sth->fetchrow_array();
$sth->finish();
};
if ( $EVAL_ERROR ) {
PTDEBUG && _d($EVAL_ERROR);
}
return $res;
}
sub get_nonzero_value {
my ( $self, %args ) = @_;
my @required_args = qw(dbh db_tbl col col_type);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($dbh, $db_tbl, $col, $col_type) = @args{@required_args};
my $tries = defined $args{tries} ? $args{tries} : 5;
my $val = $args{val};
# Right now we only need a special check for temporal values.
# _validate_temporal_value() does double-duty for this. The
# default anonymous sub handles ints.
my $is_nonzero = $col_type =~ m/time|date/ ? \&_validate_temporal_value
: sub { return $_[1]; };
if ( !$is_nonzero->($dbh, $val) ) { # quasi-double-negative, sorry
PTDEBUG && _d('Discarding zero value:', $val);
my $sql = "SELECT $col FROM $db_tbl "
. ($args{index_hint} ? "$args{index_hint} " : "")
. "WHERE $col > ? AND $col IS NOT NULL "
. ($args{where} ? "AND ($args{where}) " : '')
. "ORDER BY $col LIMIT 1";
PTDEBUG && _d($sql);
my $sth = $dbh->prepare($sql);
my $last_val = $val;
while ( $tries-- ) {
$sth->execute($last_val);
my ($next_val) = $sth->fetchrow_array();
if ( $is_nonzero->($dbh, $next_val) ) {
PTDEBUG && _d('First non-zero value:', $next_val);
$sth->finish();
return $next_val;
}
$last_val = $next_val;
}
$sth->finish();
$val = undef; # no non-zero value found
}
return $val;
}
# Sub: base_count
# Count to any number in any base with the given symbols. E.g. if counting
# to 10 in base 16 with symbols 0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f the result
# is "a". This is trivial for stuff like base 16 (hex), but far less trivial
# for arbitrary bases with arbitrary symbols like base 25 with symbols
# B,C,D,...X,Y,Z. For that, counting to 10 results in "L". The base and its
# symbols are determined by the character column. Symbols can be non-ASCII.
#
# Parameters:
# %args - Arguments
#
# Required Arguments:
# count_to - Number to count to
# base - Base of special system
# symbols - Arrayref of symbols for "numbers" in special system
#
# Returns:
# The "number" (symbol) in the special target base system
sub base_count {
my ( $self, %args ) = @_;
my @required_args = qw(count_to base symbols);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless defined $args{$arg};
}
my ($n, $base, $symbols) = @args{@required_args};
# Can't take log of zero and the zeroth symbol in any base is the
# zeroth symbol in any other base.
return $symbols->[0] if $n == 0;
my $highest_power = floor(log($n+0.00001)/log($base));
if ( $highest_power == 0 ){
return $symbols->[$n];
}
my @base_powers;
for my $power ( 0..$highest_power ) {
push @base_powers, ($base**$power) || 1;
}
my @base_multiples;
foreach my $base_power ( reverse @base_powers ) {
my $multiples = floor(($n+0.00001) / $base_power);
push @base_multiples, $multiples;
$n -= $multiples * $base_power;
}
return join('', map { $symbols->[$_] } @base_multiples);
}
sub _d {
my ($package, undef, $line) = caller 0;
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
map { defined $_ ? $_ : 'undef' }
@_;
print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
}
1;
}
# ###########################################################################
# End TableChunker package
# ###########################################################################