mirror of
https://github.com/percona/percona-toolkit.git
synced 2025-09-01 18:25:59 +00:00
1426 lines
52 KiB
Perl
1426 lines
52 KiB
Perl
# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd.
|
|
# Feedback and improvements are welcome.
|
|
#
|
|
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
|
|
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
|
|
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify it under
|
|
# the terms of the GNU General Public License as published by the Free Software
|
|
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
|
|
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
|
|
# licenses.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along with
|
|
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
|
|
# Place, Suite 330, Boston, MA 02111-1307 USA.
|
|
# ###########################################################################
|
|
# TableChunker package
|
|
# ###########################################################################
|
|
{
|
|
# Package: TableChunker
|
|
# TableChunker helps determine how to "chunk" a table. Chunk are
|
|
# pre-determined ranges of rows defined by boundary values (sometimes also
|
|
# called endpoints) on numeric or numeric-like columns, including date/time
|
|
# types. Any numeric column type that MySQL can do positional comparisons
|
|
# (<, <=, >, >=) on works. Chunking on character data is not supported yet
|
|
# (but see <issue 568 at http://code.google.com/p/maatkit/issues/detail?id=568>).
|
|
#
|
|
# Usually chunks range over all rows in a table but sometimes they only
|
|
# range over a subset of rows if an optional where arg is passed to various
|
|
# subs. In either case a chunk is like "`col` >= 5 AND `col` < 10". If
|
|
# col is of type int and is unique, then that chunk ranges over up to 5 rows.
|
|
#
|
|
# Chunks are included in WHERE clauses by various tools to do work on discrete
|
|
# chunks of the table instead of trying to work on the entire table at once.
|
|
# Chunks do not overlap and their size is configurable via the chunk_size arg
|
|
# passed to several subs. The chunk_size can be a number of rows or a size
|
|
# like 1M, in which case it's in estimated bytes of data. Real chunk sizes
|
|
# are usually close to the requested chunk_size but unless the optional exact
|
|
# arg is assed the real chunk sizes are approximate. Sometimes the
|
|
# distribution of values on the chunk colun can skew chunking. If, for
|
|
# example, col has values 0, 100, 101, ... then the zero value skews chunking.
|
|
# The zero_chunk arg handles this.
|
|
package TableChunker;
|
|
|
|
use strict;
|
|
use utf8;
|
|
use warnings FATAL => 'all';
|
|
use English qw(-no_match_vars);
|
|
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
|
|
|
|
use POSIX qw(floor ceil);
|
|
use List::Util qw(min max);
|
|
use Data::Dumper;
|
|
$Data::Dumper::Indent = 1;
|
|
$Data::Dumper::Sortkeys = 1;
|
|
$Data::Dumper::Quotekeys = 0;
|
|
|
|
# Sub: new
|
|
#
|
|
# Parameters:
|
|
# $class - TableChunker (automatic)
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# Quoter - <Quoter> object
|
|
# TableParser - <TableParser> object
|
|
sub new {
|
|
my ( $class, %args ) = @_;
|
|
foreach my $arg ( qw(Quoter TableParser) ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
|
|
my %int_types = map { $_ => 1 } qw(bigint date datetime int mediumint smallint time timestamp tinyint year);
|
|
my %real_types = map { $_ => 1 } qw(decimal double float);
|
|
|
|
my $self = {
|
|
%args,
|
|
int_types => \%int_types,
|
|
real_types => \%real_types,
|
|
EPOCH => '1970-01-01',
|
|
};
|
|
|
|
return bless $self, $class;
|
|
}
|
|
|
|
# Sub: find_chunk_columns
|
|
# Find chunkable columns.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# table_struct - Hashref returned from <TableParser::parse()>
|
|
#
|
|
# Optional Arguments:
|
|
# exact - bool: Try to support exact chunk sizes (may still chunk fuzzily)
|
|
#
|
|
# Returns:
|
|
# Array: whether the table can be chunked exactly if requested (zero
|
|
# otherwise), arrayref of columns that support chunking. Example:
|
|
# (start code)
|
|
# 1,
|
|
# [
|
|
# { column => 'id', index => 'PRIMARY' },
|
|
# { column => 'i', index => 'i_idx' },
|
|
# ]
|
|
# (end code)
|
|
sub find_chunk_columns {
|
|
my ( $self, %args ) = @_;
|
|
foreach my $arg ( qw(tbl_struct) ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my $tbl_struct = $args{tbl_struct};
|
|
|
|
# See if there's an index that will support chunking.
|
|
my @possible_indexes;
|
|
foreach my $index ( values %{ $tbl_struct->{keys} } ) {
|
|
|
|
# Accept only BTREE indexes.
|
|
next unless $index->{type} eq 'BTREE';
|
|
|
|
# Reject indexes with prefixed columns.
|
|
next if grep { defined } @{$index->{col_prefixes}};
|
|
|
|
# If exact, accept only unique, single-column indexes.
|
|
if ( $args{exact} ) {
|
|
next unless $index->{is_unique} && @{$index->{cols}} == 1;
|
|
}
|
|
|
|
push @possible_indexes, $index;
|
|
}
|
|
PTDEBUG && _d('Possible chunk indexes in order:',
|
|
join(', ', map { $_->{name} } @possible_indexes));
|
|
|
|
# Build list of candidate chunk columns.
|
|
my $can_chunk_exact = 0;
|
|
my @candidate_cols;
|
|
foreach my $index ( @possible_indexes ) {
|
|
my $col = $index->{cols}->[0];
|
|
|
|
# Accept only integer or real number type columns or character columns.
|
|
my $col_type = $tbl_struct->{type_for}->{$col};
|
|
next unless $self->{int_types}->{$col_type}
|
|
|| $self->{real_types}->{$col_type}
|
|
|| $col_type =~ m/char/;
|
|
|
|
# Save the candidate column and its index.
|
|
push @candidate_cols, { column => $col, index => $index->{name} };
|
|
}
|
|
|
|
$can_chunk_exact = 1 if $args{exact} && scalar @candidate_cols;
|
|
|
|
if ( PTDEBUG ) {
|
|
my $chunk_type = $args{exact} ? 'Exact' : 'Inexact';
|
|
_d($chunk_type, 'chunkable:',
|
|
join(', ', map { "$_->{column} on $_->{index}" } @candidate_cols));
|
|
}
|
|
|
|
# Order the candidates by their original column order.
|
|
# Put the PK's first column first, if it's a candidate.
|
|
my @result;
|
|
PTDEBUG && _d('Ordering columns by order in tbl, PK first');
|
|
if ( $tbl_struct->{keys}->{PRIMARY} ) {
|
|
my $pk_first_col = $tbl_struct->{keys}->{PRIMARY}->{cols}->[0];
|
|
@result = grep { $_->{column} eq $pk_first_col } @candidate_cols;
|
|
@candidate_cols = grep { $_->{column} ne $pk_first_col } @candidate_cols;
|
|
}
|
|
my $i = 0;
|
|
my %col_pos = map { $_ => $i++ } @{$tbl_struct->{cols}};
|
|
push @result, sort { $col_pos{$a->{column}} <=> $col_pos{$b->{column}} }
|
|
@candidate_cols;
|
|
|
|
if ( PTDEBUG ) {
|
|
_d('Chunkable columns:',
|
|
join(', ', map { "$_->{column} on $_->{index}" } @result));
|
|
_d('Can chunk exactly:', $can_chunk_exact);
|
|
}
|
|
|
|
return ($can_chunk_exact, @result);
|
|
}
|
|
|
|
# Sub: calculate_chunks
|
|
# Calculate chunks for the given range statistics. Args min, max and
|
|
# rows_in_range are returned from get_range_statistics() which is usually
|
|
# called before this sub. Min and max are expected to be valid values
|
|
# (NULL is valid).
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# dbh - dbh
|
|
# db - database name
|
|
# tbl - table name
|
|
# tbl_struct - retval of <TableParser::parse()>
|
|
# chunk_col - column name to chunk on
|
|
# min - min col value, from <TableChunker::get_range_statistics()>
|
|
# max - max col value, from <TableChunker::get_range_statistics()>
|
|
# rows_in_range - number of rows to chunk, from
|
|
# <TableChunker::get_range_statistics()>
|
|
# chunk_size - requested size of each chunk
|
|
#
|
|
# Optional Arguments:
|
|
# exact - Use exact chunk_size? Use approximates is not.
|
|
# tries - Fetch up to this many rows to find a non-zero value
|
|
# chunk_range - Make chunk range open (default) or openclosed
|
|
# where - WHERE clause.
|
|
#
|
|
# Returns:
|
|
# Array of WHERE predicates like "`col` >= '10' AND `col` < '20'",
|
|
# one for each chunk. All values are single-quoted due to <issue 1002 at
|
|
# http://code.google.com/p/maatkit/issues/detail?id=1002>. Example:
|
|
# (start code)
|
|
# `film_id` < '30',
|
|
# `film_id` >= '30' AND `film_id` < '60',
|
|
# `film_id` >= '60' AND `film_id` < '90',
|
|
# `film_id` >= '90',
|
|
# (end code)
|
|
sub calculate_chunks {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db tbl tbl_struct chunk_col rows_in_range chunk_size);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless defined $args{$arg};
|
|
}
|
|
PTDEBUG && _d('Calculate chunks for',
|
|
join(", ", map {"$_=".(defined $args{$_} ? $args{$_} : "undef")}
|
|
qw(db tbl chunk_col min max rows_in_range chunk_size zero_chunk exact)
|
|
));
|
|
|
|
if ( !$args{rows_in_range} ) {
|
|
PTDEBUG && _d("Empty table");
|
|
return '1=1';
|
|
}
|
|
|
|
# http://code.google.com/p/maatkit/issues/detail?id=1084
|
|
if ( $args{rows_in_range} < $args{chunk_size} ) {
|
|
PTDEBUG && _d("Chunk size larger than rows in range");
|
|
return '1=1';
|
|
}
|
|
|
|
my $q = $self->{Quoter};
|
|
my $dbh = $args{dbh};
|
|
my $chunk_col = $args{chunk_col};
|
|
my $tbl_struct = $args{tbl_struct};
|
|
my $col_type = $tbl_struct->{type_for}->{$chunk_col};
|
|
PTDEBUG && _d('chunk col type:', $col_type);
|
|
|
|
# Get chunker info for the column type. Numeric cols are chunked
|
|
# differently than char cols.
|
|
my %chunker;
|
|
if ( $tbl_struct->{is_numeric}->{$chunk_col} || $col_type =~ /date|time/ ) {
|
|
%chunker = $self->_chunk_numeric(%args);
|
|
}
|
|
elsif ( $col_type =~ m/char/ ) {
|
|
%chunker = $self->_chunk_char(%args);
|
|
}
|
|
else {
|
|
die "Cannot chunk $col_type columns";
|
|
}
|
|
PTDEBUG && _d("Chunker:", Dumper(\%chunker));
|
|
my ($col, $start_point, $end_point, $interval, $range_func)
|
|
= @chunker{qw(col start_point end_point interval range_func)};
|
|
|
|
# Generate a list of chunk boundaries. The first and last chunks are
|
|
# inclusive, and will catch any rows before or after the end of the
|
|
# supposed range. So 1-100 divided into chunks of 30 should actually end
|
|
# up with chunks like this:
|
|
# < 30
|
|
# >= 30 AND < 60
|
|
# >= 60 AND < 90
|
|
# >= 90
|
|
# If zero_chunk was specified and zero chunking was possible, the first
|
|
# chunk will be = 0 to catch any zero or zero-equivalent (e.g. 00:00:00)
|
|
# rows.
|
|
my @chunks;
|
|
if ( $start_point < $end_point ) {
|
|
|
|
# The zero chunk, if there is one. It doesn't have to be the first
|
|
# chunk. The 0 cannot be quoted because if d='0000-00-00' then
|
|
# d=0 will work but d='0' will cause warning 1292: Incorrect date
|
|
# value: '0' for column 'd'. This might have to column-specific in
|
|
# future when we chunk on more exotic column types.
|
|
push @chunks, "$col = 0" if $chunker{have_zero_chunk};
|
|
|
|
my ($beg, $end);
|
|
my $iter = 0;
|
|
for ( my $i = $start_point; $i < $end_point; $i += $interval ) {
|
|
($beg, $end) = $self->$range_func($dbh, $i, $interval, $end_point);
|
|
|
|
# The first chunk.
|
|
if ( $iter++ == 0 ) {
|
|
push @chunks,
|
|
($chunker{have_zero_chunk} ? "$col > 0 AND " : "")
|
|
."$col < " . $q->quote_val($end);
|
|
}
|
|
else {
|
|
# The normal case is a chunk in the middle of the range somewhere.
|
|
push @chunks, "$col >= " . $q->quote_val($beg) . " AND $col < " . $q->quote_val($end);
|
|
}
|
|
}
|
|
|
|
# Remove the last chunk and replace it with one that matches everything
|
|
# from the beginning of the last chunk to infinity, or to the max col
|
|
# value if chunk_range is openclosed. If the chunk column is nullable,
|
|
# do NULL separately.
|
|
my $chunk_range = lc($args{chunk_range} || 'open');
|
|
my $nullable = $args{tbl_struct}->{is_nullable}->{$args{chunk_col}};
|
|
pop @chunks;
|
|
if ( @chunks ) {
|
|
push @chunks, "$col >= " . $q->quote_val($beg)
|
|
. ($chunk_range eq 'openclosed'
|
|
? " AND $col <= " . $q->quote_val($args{max}) : "");
|
|
}
|
|
else {
|
|
push @chunks, $nullable ? "$col IS NOT NULL" : '1=1';
|
|
}
|
|
if ( $nullable ) {
|
|
push @chunks, "$col IS NULL";
|
|
}
|
|
}
|
|
else {
|
|
# There are no chunks; just do the whole table in one chunk.
|
|
PTDEBUG && _d('No chunks; using single chunk 1=1');
|
|
push @chunks, '1=1';
|
|
}
|
|
|
|
return @chunks;
|
|
}
|
|
|
|
# Sub: _chunk_numeric
|
|
# Determine how to chunk a numeric column.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# dbh - dbh
|
|
# db - database name
|
|
# tbl - table name
|
|
# tbl_struct - retval of <TableParser::parse()>
|
|
# chunk_col - column name to chunk on
|
|
# min - min col value, from <TableChunker::get_range_statistics()>
|
|
# max - max col value, from <TableChunker::get_range_statistics()>
|
|
# rows_in_range - number of rows to chunk, from
|
|
# <TableChunker::get_range_statistics()>
|
|
# chunk_size - requested size of each chunk
|
|
#
|
|
# Optional Arguments:
|
|
# exact - Use exact chunk_size? Use approximates is not.
|
|
# tries - Fetch up to this many rows to find a non-zero value
|
|
# zero_chunk - Add an extra chunk for zero values? (0, 00:00, etc.)
|
|
#
|
|
# Returns:
|
|
# Array of chunker info that <calculate_chunks()> uses to create
|
|
# chunks, like:
|
|
# (start code)
|
|
# col => quoted chunk column name
|
|
# start_point => start value (a Perl number)
|
|
# end_point => end value (a Perl number)
|
|
# interval => interval to walk from start_ to end_point (a Perl number)
|
|
# range_func => coderef to return a value while walking that ^ range
|
|
# have_zero_chunk => whether to include a zero chunk (col=0)
|
|
# (end code)
|
|
sub _chunk_numeric {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db tbl tbl_struct chunk_col rows_in_range chunk_size);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless defined $args{$arg};
|
|
}
|
|
my $q = $self->{Quoter};
|
|
my $db_tbl = $q->quote($args{db}, $args{tbl});
|
|
my $col_type = $args{tbl_struct}->{type_for}->{$args{chunk_col}};
|
|
|
|
# Convert the given MySQL values to (Perl) numbers using some MySQL function.
|
|
# E.g.: SELECT TIME_TO_SEC('12:34') == 45240.
|
|
my $range_func;
|
|
if ( $col_type =~ m/(?:int|year|float|double|decimal)$/ ) {
|
|
$range_func = 'range_num';
|
|
}
|
|
elsif ( $col_type =~ m/^(?:timestamp|date|time)$/ ) {
|
|
$range_func = "range_$col_type";
|
|
}
|
|
elsif ( $col_type eq 'datetime' ) {
|
|
$range_func = 'range_datetime';
|
|
}
|
|
|
|
my ($start_point, $end_point);
|
|
eval {
|
|
$start_point = $self->value_to_number(
|
|
value => $args{min},
|
|
column_type => $col_type,
|
|
dbh => $args{dbh},
|
|
);
|
|
$end_point = $self->value_to_number(
|
|
value => $args{max},
|
|
column_type => $col_type,
|
|
dbh => $args{dbh},
|
|
);
|
|
};
|
|
if ( $EVAL_ERROR ) {
|
|
if ( $EVAL_ERROR =~ m/don't know how to chunk/ ) {
|
|
# Special kind of error doesn't make sense with the more verbose
|
|
# description below.
|
|
die $EVAL_ERROR;
|
|
}
|
|
else {
|
|
die "Error calculating chunk start and end points for table "
|
|
. "`$args{tbl_struct}->{name}` on column `$args{chunk_col}` "
|
|
. "with min/max values "
|
|
. join('/',
|
|
map { defined $args{$_} ? $args{$_} : 'undef' } qw(min max))
|
|
. ":\n\n"
|
|
. $EVAL_ERROR
|
|
. "\nVerify that the min and max values are valid for the column. "
|
|
. "If they are valid, this error could be caused by a bug in the "
|
|
. "tool.";
|
|
}
|
|
}
|
|
|
|
# The end points might be NULL in the pathological case that the table
|
|
# has nothing but NULL values. If there's at least one non-NULL value
|
|
# then MIN() and MAX() will return it. Otherwise, the only thing to do
|
|
# is make NULL end points zero to make the code below work and any NULL
|
|
# values will be handled by the special "IS NULL" chunk.
|
|
if ( !defined $start_point ) {
|
|
PTDEBUG && _d('Start point is undefined');
|
|
$start_point = 0;
|
|
}
|
|
if ( !defined $end_point || $end_point < $start_point ) {
|
|
PTDEBUG && _d('End point is undefined or before start point');
|
|
$end_point = 0;
|
|
}
|
|
PTDEBUG && _d("Actual chunk range:", $start_point, "to", $end_point);
|
|
|
|
# Determine if we can include a zero chunk (col = 0). If yes, then
|
|
# make sure the start point is non-zero. $start_point and $end_point
|
|
# should be numbers (converted from MySQL values earlier). The purpose
|
|
# of the zero chunk is to capture a potentially large number of zero
|
|
# values that might imbalance the size of the first chunk. E.g. if a
|
|
# lot of invalid times were inserted and stored as 00:00:00, these
|
|
# zero (equivalent) values are captured by the zero chunk instead of
|
|
# the first chunk + all the non-zero values in the first chunk.
|
|
my $have_zero_chunk = 0;
|
|
if ( $args{zero_chunk} ) {
|
|
if ( $start_point != $end_point && $start_point >= 0 ) {
|
|
PTDEBUG && _d('Zero chunking');
|
|
my $nonzero_val = $self->get_nonzero_value(
|
|
%args,
|
|
db_tbl => $db_tbl,
|
|
col => $args{chunk_col},
|
|
col_type => $col_type,
|
|
val => $args{min}
|
|
);
|
|
# Since we called value_to_number() before with this column type
|
|
# we shouldn't have to worry about it dying here--it would have
|
|
# died earlier if we can't chunk the column type.
|
|
$start_point = $self->value_to_number(
|
|
value => $nonzero_val,
|
|
column_type => $col_type,
|
|
dbh => $args{dbh},
|
|
);
|
|
$have_zero_chunk = 1;
|
|
}
|
|
else {
|
|
PTDEBUG && _d("Cannot zero chunk");
|
|
}
|
|
}
|
|
PTDEBUG && _d("Using chunk range:", $start_point, "to", $end_point);
|
|
|
|
# Calculate the chunk size in terms of "distance between endpoints"
|
|
# that will give approximately the right number of rows between the
|
|
# endpoints. If possible and requested, forbid chunks from being any
|
|
# bigger than specified.
|
|
my $interval = $args{chunk_size}
|
|
* ($end_point - $start_point)
|
|
/ $args{rows_in_range};
|
|
if ( $self->{int_types}->{$col_type} ) {
|
|
$interval = ceil($interval);
|
|
}
|
|
$interval ||= $args{chunk_size};
|
|
if ( $args{exact} ) {
|
|
$interval = $args{chunk_size};
|
|
}
|
|
PTDEBUG && _d('Chunk interval:', $interval, 'units');
|
|
|
|
return (
|
|
col => $q->quote($args{chunk_col}),
|
|
start_point => $start_point,
|
|
end_point => $end_point,
|
|
interval => $interval,
|
|
range_func => $range_func,
|
|
have_zero_chunk => $have_zero_chunk,
|
|
);
|
|
}
|
|
|
|
# Sub: _chunk_numeric
|
|
# Determine how to chunk a character column.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# dbh - dbh
|
|
# db - database name
|
|
# tbl - table name
|
|
# tbl_struct - retval of <TableParser::parse()>
|
|
# chunk_col - column name to chunk on
|
|
# min - min col value, from <TableChunker::get_range_statistics()>
|
|
# max - max col value, from <TableChunker::get_range_statistics()>
|
|
# rows_in_range - number of rows to chunk, from
|
|
# <TableChunker::get_range_statistics()>
|
|
# chunk_size - requested size of each chunk
|
|
#
|
|
# Optional Arguments:
|
|
# where - WHERE clause.
|
|
#
|
|
# Returns:
|
|
# Array of chunker info that <calculate_chunks()> uses to create
|
|
# chunks, like:
|
|
# (start code)
|
|
# col => quoted chunk column name
|
|
# start_point => start value (a Perl number)
|
|
# end_point => end value (a Perl number)
|
|
# interval => interval to walk from start_ to end_point (a Perl number)
|
|
# range_func => coderef to return a value while walking that ^ range
|
|
# (end code)
|
|
sub _chunk_char {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db tbl tbl_struct chunk_col min max rows_in_range chunk_size);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless defined $args{$arg};
|
|
}
|
|
my $q = $self->{Quoter};
|
|
my $db_tbl = $q->quote($args{db}, $args{tbl});
|
|
my $dbh = $args{dbh};
|
|
my $chunk_col = $args{chunk_col};
|
|
my $qchunk_col = $q->quote($args{chunk_col});
|
|
my $row;
|
|
my $sql;
|
|
|
|
# Get the character codes between the min and max column values.
|
|
my ($min_col, $max_col) = @{args}{qw(min max)};
|
|
$sql = "SELECT ORD(?) AS min_col_ord, ORD(?) AS max_col_ord";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
my $ord_sth = $dbh->prepare($sql); # avoid quoting issues
|
|
$ord_sth->execute($min_col, $max_col);
|
|
$row = $ord_sth->fetchrow_arrayref();
|
|
my ($min_col_ord, $max_col_ord) = ($row->[0], $row->[1]);
|
|
PTDEBUG && _d("Min/max col char code:", $min_col_ord, $max_col_ord);
|
|
|
|
# Create a sorted chacater-to-number map of the unique characters in
|
|
# the column ranging from the min character code to the max.
|
|
my $base;
|
|
my @chars;
|
|
PTDEBUG && _d("Table charset:", $args{tbl_struct}->{charset});
|
|
if ( ($args{tbl_struct}->{charset} || "") eq "latin1" ) {
|
|
# These are the unique, sorted latin1 character codes according to
|
|
# MySQL. You'll notice that many are missing. That's because MySQL
|
|
# treats many characters as the same, for example "e" and "é".
|
|
my @sorted_latin1_chars = (
|
|
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
|
|
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
|
|
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
|
|
74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
|
|
88, 89, 90, 91, 92, 93, 94, 95, 96, 123, 124, 125, 126, 161,
|
|
162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
|
|
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
|
|
190, 191, 215, 216, 222, 223, 247, 255);
|
|
|
|
my ($first_char, $last_char);
|
|
for my $i ( 0..$#sorted_latin1_chars ) {
|
|
$first_char = $i and last if $sorted_latin1_chars[$i] >= $min_col_ord;
|
|
}
|
|
for my $i ( $first_char..$#sorted_latin1_chars ) {
|
|
$last_char = $i and last if $sorted_latin1_chars[$i] >= $max_col_ord;
|
|
};
|
|
|
|
@chars = map { chr $_; } @sorted_latin1_chars[$first_char..$last_char];
|
|
$base = scalar @chars;
|
|
}
|
|
else {
|
|
# If the table's charset isn't latin1, who knows what charset is being
|
|
# used, what characters it contains, and how those characters are sorted.
|
|
# So we create a character map and let MySQL tell us these things.
|
|
|
|
# Create a temp table with the same char col def as the original table.
|
|
my $tmp_tbl = '__maatkit_char_chunking_map';
|
|
my $tmp_db_tbl = $q->quote($args{db}, $tmp_tbl);
|
|
$sql = "DROP TABLE IF EXISTS $tmp_db_tbl";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
$dbh->do($sql);
|
|
my $col_def = $args{tbl_struct}->{defs}->{$chunk_col};
|
|
$sql = "CREATE TEMPORARY TABLE $tmp_db_tbl ($col_def) "
|
|
. "ENGINE=MEMORY DEFAULT CHARSET = utf8";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
$dbh->do($sql);
|
|
|
|
# Populate the temp table with all the characters between the min and max
|
|
# max character codes. This is our character-to-number map.
|
|
$sql = "INSERT INTO $tmp_db_tbl VALUES (CHAR(?))";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
my $ins_char_sth = $dbh->prepare($sql); # avoid quoting issues
|
|
for my $char_code ( $min_col_ord..$max_col_ord ) {
|
|
# Ignore errors on invalid chars for UTF8
|
|
eval { $ins_char_sth->execute($char_code) };
|
|
}
|
|
|
|
# Select from the char-to-number map all characters between the
|
|
# min and max col values, letting MySQL order them. The first
|
|
# character returned becomes "zero" in a new base system of counting,
|
|
# the second character becomes "one", etc. So if 42 chars are
|
|
# returned like [a, B, c, d, é, ..., ü] then we have a base 42
|
|
# system where 0=a, 1=B, 2=c, 3=d, 4=é, ... 41=ü. count_base()
|
|
# helps us count in arbitrary systems.
|
|
$sql = "SELECT $qchunk_col FROM $tmp_db_tbl "
|
|
. "WHERE $qchunk_col BETWEEN ? AND ? "
|
|
. "ORDER BY $qchunk_col";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
my $sel_char_sth = $dbh->prepare($sql);
|
|
$sel_char_sth->execute($min_col, $max_col);
|
|
|
|
@chars = map { $_->[0] } @{ $sel_char_sth->fetchall_arrayref() };
|
|
$base = scalar @chars;
|
|
|
|
$sql = "DROP TABLE $tmp_db_tbl";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
$dbh->do($sql);
|
|
}
|
|
PTDEBUG && _d("Base", $base, "chars:", @chars);
|
|
|
|
# See https://bugs.launchpad.net/percona-toolkit/+bug/1034717
|
|
die "Cannot chunk table $db_tbl using the character column "
|
|
. "$chunk_col, most likely because all values start with the "
|
|
. "same character. This table must be synced separately by "
|
|
. "specifying a list of --algorithms without the Chunk algorithm"
|
|
if $base == 1;
|
|
|
|
# Now we begin calculating how to chunk the char column. This is
|
|
# completely different from _chunk_numeric because we're not dealing
|
|
# with the values to chunk directly (the characters) but rather a map.
|
|
|
|
# In our base system, how many values can 1, 2, etc. characters express?
|
|
# E.g. in a base 26 system (a-z), 1 char expresses 26^1=26 chars (a-z),
|
|
# 2 chars expresses 26^2=676 chars. If the requested chunk size is 100,
|
|
# then 1 char might not express enough values, but 2 surely can. This
|
|
# is imperefect because we don't know if we have data like: [apple, boy,
|
|
# car] (i.e. values evenly distributed across the range of chars), or
|
|
# [ant, apple, azur, boy]. We assume data is more evenly distributed
|
|
# than not so we use the minimum number of characters to express a chunk
|
|
# size.
|
|
$sql = "SELECT MAX(LENGTH($qchunk_col)) FROM $db_tbl "
|
|
. ($args{where} ? "WHERE $args{where} " : "")
|
|
. "ORDER BY $qchunk_col";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
$row = $dbh->selectrow_arrayref($sql);
|
|
my $max_col_len = $row->[0];
|
|
PTDEBUG && _d("Max column value:", $max_col, $max_col_len);
|
|
my $n_values;
|
|
for my $n_chars ( 1..$max_col_len ) {
|
|
$n_values = $base**$n_chars;
|
|
if ( $n_values >= $args{chunk_size} ) {
|
|
PTDEBUG && _d($n_chars, "chars in base", $base, "expresses",
|
|
$n_values, "values");
|
|
last;
|
|
}
|
|
}
|
|
|
|
# Our interval is not like a _chunk_numeric() interval, either, because
|
|
# we don't increment in the actual values (i.e. the characters) but rather
|
|
# in the char-to-number map. If the above calculation found that 1 char
|
|
# expressed enough values for 1 chunk, then each char in the map will
|
|
# yield roughly one chunk of values, so the interval is 1. Or, if we need
|
|
# 2 chars to express enough vals for 1 chunk, then we'll increment through
|
|
# the map 2 chars at a time, like [a, b], [c, d], etc.
|
|
my $n_chunks = $args{rows_in_range} / $args{chunk_size};
|
|
my $interval = floor(($n_values+0.00001) / $n_chunks) || 1;
|
|
|
|
my $range_func = sub {
|
|
my ( $self, $dbh, $start, $interval, $max ) = @_;
|
|
my $start_char = $self->base_count(
|
|
count_to => $start,
|
|
base => $base,
|
|
symbols => \@chars,
|
|
);
|
|
my $end_char = $self->base_count(
|
|
count_to => min($max, $start + $interval),
|
|
base => $base,
|
|
symbols => \@chars,
|
|
);
|
|
return $start_char, $end_char;
|
|
};
|
|
|
|
return (
|
|
col => $qchunk_col,
|
|
start_point => 0,
|
|
end_point => $n_values,
|
|
interval => $interval,
|
|
range_func => $range_func,
|
|
);
|
|
}
|
|
|
|
# Sub: get_first_chunkable_column
|
|
# Get the first chunkable column in a table.
|
|
# Only a "sane" column/index is returned. That means that
|
|
# the first auto-detected chunk col/index are used if any combination of
|
|
# preferred chunk col or index would be really bad, like chunk col=x
|
|
# and chunk index=some index over (y, z). That's bad because the index
|
|
# doesn't include the column; it would also be bad if the column wasn't
|
|
# a left-most prefix of the index.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# tbl_struct - Hashref returned by <TableParser::parse()>
|
|
#
|
|
# Optional arguments:
|
|
# chunk_column - Preferred chunkable column name
|
|
# chunk_index - Preferred chunkable column index name
|
|
# exact - bool: passed to <find_chunk_columns()>
|
|
#
|
|
# Returns:
|
|
# List: chunkable column name, chunkable column index
|
|
sub get_first_chunkable_column {
|
|
my ( $self, %args ) = @_;
|
|
foreach my $arg ( qw(tbl_struct) ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
|
|
# First auto-detected chunk col/index. If any combination of preferred
|
|
# chunk col or index are specified and are sane, they will overwrite
|
|
# these defaults. Else, these defaults will be returned.
|
|
my ($exact, @cols) = $self->find_chunk_columns(%args);
|
|
my $col = $cols[0]->{column};
|
|
my $idx = $cols[0]->{index};
|
|
|
|
# Wanted/preferred chunk column and index. Caller only gets what
|
|
# they want, though, if it results in a sane col/index pair.
|
|
my $wanted_col = $args{chunk_column};
|
|
my $wanted_idx = $args{chunk_index};
|
|
PTDEBUG && _d("Preferred chunk col/idx:", $wanted_col, $wanted_idx);
|
|
|
|
if ( $wanted_col && $wanted_idx ) {
|
|
# Preferred column and index: check that the pair is sane.
|
|
foreach my $chunkable_col ( @cols ) {
|
|
if ( $wanted_col eq $chunkable_col->{column}
|
|
&& $wanted_idx eq $chunkable_col->{index} ) {
|
|
# The wanted column is chunkable with the wanted index.
|
|
$col = $wanted_col;
|
|
$idx = $wanted_idx;
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
elsif ( $wanted_col ) {
|
|
# Preferred column, no index: check if column is chunkable, if yes
|
|
# then use its index, else fall back to default col/index.
|
|
foreach my $chunkable_col ( @cols ) {
|
|
if ( $wanted_col eq $chunkable_col->{column} ) {
|
|
# The wanted column is chunkable, so use its index and overwrite
|
|
# the defaults.
|
|
$col = $wanted_col;
|
|
$idx = $chunkable_col->{index};
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
elsif ( $wanted_idx ) {
|
|
# Preferred index, no column: check if index's left-most column is
|
|
# chunkable, if yes then use its column, else fall back to auto-detected
|
|
# col/index.
|
|
foreach my $chunkable_col ( @cols ) {
|
|
if ( $wanted_idx eq $chunkable_col->{index} ) {
|
|
# The wanted index has a chunkable column, so use it and overwrite
|
|
# the defaults.
|
|
$col = $chunkable_col->{column};
|
|
$idx = $wanted_idx;
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
|
|
PTDEBUG && _d('First chunkable col/index:', $col, $idx);
|
|
return $col, $idx;
|
|
}
|
|
|
|
# Sub: size_to_rows
|
|
# Convert a size in rows or bytes to a number of rows in the table,
|
|
# using SHOW TABLE STATUS. If the size is a string with a suffix of M/G/k,
|
|
# interpret it as mebibytes, gibibytes, or kibibytes respectively.
|
|
# If it's just a number, treat it as a number of rows and return right away.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# dbh - dbh
|
|
# db - Database name
|
|
# tbl - Table name
|
|
# chunk_size - Chunk size string like "1000" or "50M"
|
|
#
|
|
# Returns:
|
|
# Array: number of rows, average row size
|
|
sub size_to_rows {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db tbl chunk_size);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my ($dbh, $db, $tbl, $chunk_size) = @args{@required_args};
|
|
my $q = $self->{Quoter};
|
|
my $tp = $self->{TableParser};
|
|
|
|
my ($n_rows, $avg_row_length);
|
|
|
|
my ( $num, $suffix ) = $chunk_size =~ m/^(\d+)([MGk])?$/;
|
|
if ( $suffix ) { # Convert to bytes.
|
|
$chunk_size = $suffix eq 'k' ? $num * 1_024
|
|
: $suffix eq 'M' ? $num * 1_024 * 1_024
|
|
: $num * 1_024 * 1_024 * 1_024;
|
|
}
|
|
elsif ( $num ) {
|
|
$n_rows = $num;
|
|
}
|
|
else {
|
|
die "Invalid chunk size $chunk_size; must be an integer "
|
|
. "with optional suffix kMG";
|
|
}
|
|
|
|
if ( $suffix || $args{avg_row_length} ) {
|
|
my ($status) = $tp->get_table_status($dbh, $db, $tbl);
|
|
$avg_row_length = $status->{avg_row_length};
|
|
if ( !defined $n_rows ) {
|
|
$n_rows = $avg_row_length ? ceil($chunk_size / $avg_row_length) : undef;
|
|
}
|
|
}
|
|
|
|
return $n_rows, $avg_row_length;
|
|
}
|
|
|
|
# Sub: get_range_statistics
|
|
# Determine the range of values for the chunk_col column on this table.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# dbh - dbh
|
|
# db - Database name
|
|
# tbl - Table name
|
|
# chunk_col - Chunk column name
|
|
# tbl_struct - Hashref returned by <TableParser::parse()>
|
|
#
|
|
# Optional arguments:
|
|
# where - WHERE clause without "WHERE" to restrict range
|
|
# index_hint - "FORCE INDEX (...)" clause
|
|
# tries - Fetch up to this many rows to find a valid value
|
|
#
|
|
# Returns:
|
|
# Array: min row value, max row value, rows in range
|
|
sub get_range_statistics {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db tbl chunk_col tbl_struct);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my ($dbh, $db, $tbl, $col) = @args{@required_args};
|
|
my $where = $args{where};
|
|
my $q = $self->{Quoter};
|
|
|
|
my $col_type = $args{tbl_struct}->{type_for}->{$col};
|
|
my $col_is_numeric = $args{tbl_struct}->{is_numeric}->{$col};
|
|
|
|
# Quote these once so we don't have to do it again.
|
|
my $db_tbl = $q->quote($db, $tbl);
|
|
$col = $q->quote($col);
|
|
|
|
my ($min, $max);
|
|
eval {
|
|
# First get the actual end points, whatever MySQL considers the
|
|
# min and max values to be for this column.
|
|
my $sql = "SELECT MIN($col), MAX($col) FROM $db_tbl"
|
|
. ($args{index_hint} ? " $args{index_hint}" : "")
|
|
. ($where ? " WHERE ($where)" : '');
|
|
PTDEBUG && _d($dbh, $sql);
|
|
($min, $max) = $dbh->selectrow_array($sql);
|
|
PTDEBUG && _d("Actual end points:", $min, $max);
|
|
|
|
# Now, for two reasons, get the valid end points. For one, an
|
|
# end point may be 0 or some zero-equivalent and the user doesn't
|
|
# want that because it skews the range. Or two, an end point may
|
|
# be an invalid value like date 2010-00-00 and we can't use that.
|
|
($min, $max) = $self->get_valid_end_points(
|
|
%args,
|
|
dbh => $dbh,
|
|
db_tbl => $db_tbl,
|
|
col => $col,
|
|
col_type => $col_type,
|
|
min => $min,
|
|
max => $max,
|
|
);
|
|
PTDEBUG && _d("Valid end points:", $min, $max);
|
|
};
|
|
if ( $EVAL_ERROR ) {
|
|
die "Error getting min and max values for table $db_tbl "
|
|
. "on column $col: $EVAL_ERROR";
|
|
}
|
|
|
|
# Finally get the total number of rows in range, usually the whole
|
|
# table unless there's a where arg restricting the range.
|
|
my $sql = "EXPLAIN SELECT * FROM $db_tbl"
|
|
. ($args{index_hint} ? " $args{index_hint}" : "")
|
|
. ($where ? " WHERE $where" : '');
|
|
PTDEBUG && _d($sql);
|
|
my $expl = $dbh->selectrow_hashref($sql);
|
|
|
|
return (
|
|
min => $min,
|
|
max => $max,
|
|
rows_in_range => $expl->{rows},
|
|
);
|
|
}
|
|
|
|
# Sub: inject_chunks
|
|
# Create a SQL statement from a query prototype by filling in placeholders.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# database - Database name
|
|
# table - Table name
|
|
# chunks - Arrayref of chunks from <calculate_chunks()>
|
|
# chunk_num - Index into chunks to use
|
|
# query - Query prototype returned by
|
|
# <TableChecksum::make_checksum_query()>
|
|
#
|
|
# Optional Arguments:
|
|
# index_hint - "FORCE INDEX (...)" clause
|
|
# where - Arrayref of WHERE clauses joined with AND
|
|
#
|
|
# Returns:
|
|
# A SQL statement
|
|
sub inject_chunks {
|
|
my ( $self, %args ) = @_;
|
|
foreach my $arg ( qw(database table chunks chunk_num query) ) {
|
|
die "I need a $arg argument" unless defined $args{$arg};
|
|
}
|
|
PTDEBUG && _d('Injecting chunk', $args{chunk_num});
|
|
my $query = $args{query};
|
|
my $comment = sprintf("/*%s.%s:%d/%d*/",
|
|
$args{database}, $args{table},
|
|
$args{chunk_num} + 1, scalar @{$args{chunks}});
|
|
$query =~ s!/\*PROGRESS_COMMENT\*/!$comment!;
|
|
my $where = "WHERE (" . $args{chunks}->[$args{chunk_num}] . ')';
|
|
if ( $args{where} && grep { $_ } @{$args{where}} ) {
|
|
$where .= " AND ("
|
|
. join(" AND ", map { "($_)" } grep { $_ } @{$args{where}} )
|
|
. ")";
|
|
}
|
|
my $db_tbl = $self->{Quoter}->quote(@args{qw(database table)});
|
|
my $index_hint = $args{index_hint} || '';
|
|
|
|
PTDEBUG && _d('Parameters:',
|
|
Dumper({WHERE => $where, DB_TBL => $db_tbl, INDEX_HINT => $index_hint}));
|
|
$query =~ s!/\*WHERE\*/! $where!;
|
|
$query =~ s!/\*DB_TBL\*/!$db_tbl!;
|
|
$query =~ s!/\*INDEX_HINT\*/! $index_hint!;
|
|
$query =~ s!/\*CHUNK_NUM\*/! $args{chunk_num} AS chunk_num,!;
|
|
|
|
return $query;
|
|
}
|
|
|
|
# #############################################################################
|
|
# MySQL value to Perl number conversion.
|
|
# #############################################################################
|
|
|
|
# Convert a MySQL column value to a Perl integer.
|
|
# Arguments:
|
|
# * value scalar: MySQL value to convert
|
|
# * column_type scalar: MySQL column type of the value
|
|
# * dbh dbh
|
|
# Returns an integer or undef if the value isn't convertible
|
|
# (e.g. date 0000-00-00 is not convertible).
|
|
sub value_to_number {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(column_type dbh);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless defined $args{$arg};
|
|
}
|
|
my $val = $args{value};
|
|
my ($col_type, $dbh) = @args{@required_args};
|
|
PTDEBUG && _d('Converting MySQL', $col_type, $val);
|
|
|
|
return unless defined $val; # value is NULL
|
|
|
|
# MySQL functions to convert a non-numeric value to a number
|
|
# so we can do basic math on it in Perl. Right now we just
|
|
# convert temporal values but later we'll need to convert char
|
|
# and hex values.
|
|
my %mysql_conv_func_for = (
|
|
timestamp => 'UNIX_TIMESTAMP',
|
|
date => 'TO_DAYS',
|
|
time => 'TIME_TO_SEC',
|
|
datetime => 'TO_DAYS',
|
|
);
|
|
|
|
# Convert the value to a number that Perl can do arithmetic with.
|
|
my $num;
|
|
if ( $col_type =~ m/(?:int|year|float|double|decimal)$/ ) {
|
|
# These types are already numbers.
|
|
$num = $val;
|
|
}
|
|
elsif ( $col_type =~ m/^(?:timestamp|date|time)$/ ) {
|
|
# These are temporal values. Convert them using a MySQL func.
|
|
my $func = $mysql_conv_func_for{$col_type};
|
|
my $sql = "SELECT $func(?)";
|
|
PTDEBUG && _d($dbh, $sql, $val);
|
|
my $sth = $dbh->prepare($sql);
|
|
$sth->execute($val);
|
|
($num) = $sth->fetchrow_array();
|
|
}
|
|
elsif ( $col_type eq 'datetime' ) {
|
|
# This type is temporal, too, but needs special handling.
|
|
# Newer versions of MySQL could use TIMESTAMPDIFF, but it's easier
|
|
# to maintain just one kind of code, so I do it all with DATE_ADD().
|
|
$num = $self->timestampdiff($dbh, $val);
|
|
}
|
|
else {
|
|
die "I don't know how to chunk $col_type\n";
|
|
}
|
|
PTDEBUG && _d('Converts to', $num);
|
|
return $num;
|
|
}
|
|
|
|
# ###########################################################################
|
|
# Range functions.
|
|
# ###########################################################################
|
|
sub range_num {
|
|
my ( $self, $dbh, $start, $interval, $max ) = @_;
|
|
my $end = min($max, $start + $interval);
|
|
|
|
|
|
# "Remove" scientific notation so the regex below does not make
|
|
# 6.123456e+18 into 6.12345.
|
|
$start = sprintf('%.17f', $start) if $start =~ /e/;
|
|
$end = sprintf('%.17f', $end) if $end =~ /e/;
|
|
|
|
# Trim decimal places, if needed. This helps avoid issues with float
|
|
# precision differing on different platforms.
|
|
$start =~ s/\.(\d{5}).*$/.$1/;
|
|
$end =~ s/\.(\d{5}).*$/.$1/;
|
|
|
|
if ( $end > $start ) {
|
|
return ( $start, $end );
|
|
}
|
|
else {
|
|
die "Chunk size is too small: $end !> $start\n";
|
|
}
|
|
}
|
|
|
|
sub range_time {
|
|
my ( $self, $dbh, $start, $interval, $max ) = @_;
|
|
my $sql = "SELECT SEC_TO_TIME($start), SEC_TO_TIME(LEAST($max, $start + $interval))";
|
|
PTDEBUG && _d($sql);
|
|
return $dbh->selectrow_array($sql);
|
|
}
|
|
|
|
sub range_date {
|
|
my ( $self, $dbh, $start, $interval, $max ) = @_;
|
|
my $sql = "SELECT FROM_DAYS($start), FROM_DAYS(LEAST($max, $start + $interval))";
|
|
PTDEBUG && _d($sql);
|
|
return $dbh->selectrow_array($sql);
|
|
}
|
|
|
|
sub range_datetime {
|
|
my ( $self, $dbh, $start, $interval, $max ) = @_;
|
|
my $sql = "SELECT DATE_ADD('$self->{EPOCH}', INTERVAL $start SECOND), "
|
|
. "DATE_ADD('$self->{EPOCH}', INTERVAL LEAST($max, $start + $interval) SECOND)";
|
|
PTDEBUG && _d($sql);
|
|
return $dbh->selectrow_array($sql);
|
|
}
|
|
|
|
sub range_timestamp {
|
|
my ( $self, $dbh, $start, $interval, $max ) = @_;
|
|
my $sql = "SELECT FROM_UNIXTIME($start), FROM_UNIXTIME(LEAST($max, $start + $interval))";
|
|
PTDEBUG && _d($sql);
|
|
return $dbh->selectrow_array($sql);
|
|
}
|
|
|
|
# Returns the number of seconds between EPOCH and the value, according to
|
|
# the MySQL server. (The server can do no wrong). I believe this code is right
|
|
# after looking at the source of sql/time.cc but I am paranoid and add in an
|
|
# extra check just to make sure. Earlier versions overflow on large interval
|
|
# values, such as on 3.23.58, '1970-01-01' - interval 58000000000 second is
|
|
# 2037-06-25 11:29:04. I know of no workaround. TO_DAYS('0000-....') is NULL,
|
|
# so we treat it as 0.
|
|
sub timestampdiff {
|
|
my ( $self, $dbh, $time ) = @_;
|
|
my $sql = "SELECT (COALESCE(TO_DAYS('$time'), 0) * 86400 + TIME_TO_SEC('$time')) "
|
|
. "- TO_DAYS('$self->{EPOCH} 00:00:00') * 86400";
|
|
PTDEBUG && _d($sql);
|
|
my ( $diff ) = $dbh->selectrow_array($sql);
|
|
$sql = "SELECT DATE_ADD('$self->{EPOCH}', INTERVAL $diff SECOND)";
|
|
PTDEBUG && _d($sql);
|
|
my ( $check ) = $dbh->selectrow_array($sql);
|
|
die <<" EOF"
|
|
Incorrect datetime math: given $time, calculated $diff but checked to $check.
|
|
This could be due to a version of MySQL that overflows on large interval
|
|
values to DATE_ADD(), or the given datetime is not a valid date. If not,
|
|
please report this as a bug.
|
|
EOF
|
|
unless $check eq $time;
|
|
return $diff;
|
|
}
|
|
|
|
|
|
# #############################################################################
|
|
# End point validation.
|
|
# #############################################################################
|
|
|
|
# These sub require val (or min and max) args which usually aren't NULL
|
|
# but could be zero so the usual "die ... unless $args{$arg}" check does
|
|
# not work.
|
|
|
|
# Returns valid min and max values. A valid val evaluates to a non-NULL value.
|
|
# Arguments:
|
|
# * dbh dbh
|
|
# * db_tbl scalar: quoted `db`.`tbl`
|
|
# * col scalar: quoted `column`
|
|
# * col_type scalar: column type of the value
|
|
# * min scalar: any scalar value
|
|
# * max scalar: any scalar value
|
|
# Optional arguments:
|
|
# * index_hint scalar: "FORCE INDEX (...)" hint
|
|
# * where scalar: WHERE clause without "WHERE"
|
|
# * tries scalar: fetch up to this many rows to find a valid value
|
|
# * zero_chunk bool: do a separate chunk for zero values
|
|
# Some column types can store invalid values, like most of the temporal
|
|
# types. When evaluated, invalid values return NULL. If the value is
|
|
# NULL to begin with, then it is not invalid because NULL is valid.
|
|
# For example, TO_DAYS('2009-00-00') evaluates to NULL because that date
|
|
# is invalid, even though it's storable.
|
|
sub get_valid_end_points {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db_tbl col col_type);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my ($dbh, $db_tbl, $col, $col_type) = @args{@required_args};
|
|
my ($real_min, $real_max) = @args{qw(min max)};
|
|
|
|
# Common error message format in case there's a problem with
|
|
# finding a valid min or max value.
|
|
my $err_fmt = "Error finding a valid %s value for table $db_tbl on "
|
|
. "column $col. The real %s value %s is invalid and "
|
|
. "no other valid values were found. Verify that the table "
|
|
. "has at least one valid value for this column"
|
|
. ($args{where} ? " where $args{where}." : ".");
|
|
|
|
# Validate min value if it's not NULL. NULL is valid.
|
|
my $valid_min = $real_min;
|
|
if ( defined $valid_min ) {
|
|
# Get a valid min end point.
|
|
PTDEBUG && _d("Validating min end point:", $real_min);
|
|
$valid_min = $self->_get_valid_end_point(
|
|
%args,
|
|
val => $real_min,
|
|
endpoint => 'min',
|
|
);
|
|
die sprintf($err_fmt, 'minimum', 'minimum',
|
|
(defined $real_min ? $real_min : "NULL"))
|
|
unless defined $valid_min;
|
|
}
|
|
|
|
# Validate max value if it's not NULL. NULL is valid.
|
|
my $valid_max = $real_max;
|
|
if ( defined $valid_max ) {
|
|
# Get a valid max end point. So far I've not found a case where
|
|
# the actual max val is invalid, but check anyway just in case.
|
|
PTDEBUG && _d("Validating max end point:", $real_min);
|
|
$valid_max = $self->_get_valid_end_point(
|
|
%args,
|
|
val => $real_max,
|
|
endpoint => 'max',
|
|
);
|
|
die sprintf($err_fmt, 'maximum', 'maximum',
|
|
(defined $real_max ? $real_max : "NULL"))
|
|
unless defined $valid_max;
|
|
}
|
|
|
|
return $valid_min, $valid_max;
|
|
}
|
|
|
|
# Does the actual work for get_valid_end_points() for each end point.
|
|
sub _get_valid_end_point {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db_tbl col col_type);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my ($dbh, $db_tbl, $col, $col_type) = @args{@required_args};
|
|
my $val = $args{val};
|
|
|
|
# NULL is valid.
|
|
return $val unless defined $val;
|
|
|
|
# Right now we only validate temporal types, but when we begin
|
|
# chunking char and hex columns we'll need to validate those.
|
|
# E.g. HEX('abcdefg') is invalid and we'll probably find some
|
|
# combination of char val + charset/collation that's invalid.
|
|
my $validate = $col_type =~ m/time|date/ ? \&_validate_temporal_value
|
|
: undef;
|
|
|
|
# If we cannot validate the value, assume it's valid.
|
|
if ( !$validate ) {
|
|
PTDEBUG && _d("No validator for", $col_type, "values");
|
|
return $val;
|
|
}
|
|
|
|
# Return the value if it's already valid.
|
|
return $val if defined $validate->($dbh, $val);
|
|
|
|
# The value is not valid so find the first one in the table that is.
|
|
PTDEBUG && _d("Value is invalid, getting first valid value");
|
|
$val = $self->get_first_valid_value(
|
|
%args,
|
|
val => $val,
|
|
validate => $validate,
|
|
);
|
|
|
|
return $val;
|
|
}
|
|
|
|
# Arguments:
|
|
# * dbh dbh
|
|
# * db_tbl scalar: quoted `db`.`tbl`
|
|
# * col scalar: quoted `column` name
|
|
# * val scalar: the current value, may be real, maybe not
|
|
# * validate coderef: returns a defined value if the given value is valid
|
|
# * endpoint scalar: "min" or "max", i.e. find first endpoint() real val
|
|
# Optional arguments:
|
|
# * tries scalar: fetch up to this many rows to find a valid value
|
|
# * index_hint scalar: "FORCE INDEX (...)" hint
|
|
# * where scalar: WHERE clause without "WHERE"
|
|
# Returns the first column value from the given db_tbl that does *not*
|
|
# evaluate to NULL. This is used mostly to eliminate unreal temporal
|
|
# values which MySQL allows to be stored, like "2010-00-00". Returns
|
|
# undef if no real value is found.
|
|
sub get_first_valid_value {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db_tbl col validate endpoint);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my ($dbh, $db_tbl, $col, $validate, $endpoint) = @args{@required_args};
|
|
my $tries = defined $args{tries} ? $args{tries} : 5;
|
|
my $val = $args{val};
|
|
|
|
# NULL values are valid and shouldn't be passed to us.
|
|
return unless defined $val;
|
|
|
|
# Prep a sth for fetching the next col val.
|
|
my $cmp = $endpoint =~ m/min/i ? '>'
|
|
: $endpoint =~ m/max/i ? '<'
|
|
: die "Invalid endpoint arg: $endpoint";
|
|
my $sql = "SELECT $col FROM $db_tbl "
|
|
. ($args{index_hint} ? "$args{index_hint} " : "")
|
|
. "WHERE $col $cmp ? AND $col IS NOT NULL "
|
|
. ($args{where} ? "AND ($args{where}) " : "")
|
|
. "ORDER BY $col LIMIT 1";
|
|
PTDEBUG && _d($dbh, $sql);
|
|
my $sth = $dbh->prepare($sql);
|
|
|
|
# Fetch the next col val from the db.tbl until we find a valid one
|
|
# or run out of rows. Only try a limited number of next rows.
|
|
my $last_val = $val;
|
|
while ( $tries-- ) {
|
|
$sth->execute($last_val);
|
|
my ($next_val) = $sth->fetchrow_array();
|
|
PTDEBUG && _d('Next value:', $next_val, '; tries left:', $tries);
|
|
if ( !defined $next_val ) {
|
|
PTDEBUG && _d('No more rows in table');
|
|
last;
|
|
}
|
|
if ( defined $validate->($dbh, $next_val) ) {
|
|
PTDEBUG && _d('First valid value:', $next_val);
|
|
$sth->finish();
|
|
return $next_val;
|
|
}
|
|
$last_val = $next_val;
|
|
}
|
|
$sth->finish();
|
|
$val = undef; # no valid value found
|
|
|
|
return $val;
|
|
}
|
|
|
|
# Evaluates any temporal value, returns NULL if it's invalid, else returns
|
|
# a value (possibly zero). It's magical but tested. See also,
|
|
# http://hackmysql.com/blog/2010/05/26/detecting-invalid-and-zero-temporal-values/
|
|
sub _validate_temporal_value {
|
|
my ( $dbh, $val ) = @_;
|
|
my $sql = "SELECT IF(TIME_FORMAT(?,'%H:%i:%s')=?, TIME_TO_SEC(?), TO_DAYS(?))";
|
|
my $res;
|
|
eval {
|
|
PTDEBUG && _d($dbh, $sql, $val);
|
|
my $sth = $dbh->prepare($sql);
|
|
$sth->execute($val, $val, $val, $val);
|
|
($res) = $sth->fetchrow_array();
|
|
$sth->finish();
|
|
};
|
|
if ( $EVAL_ERROR ) {
|
|
PTDEBUG && _d($EVAL_ERROR);
|
|
}
|
|
return $res;
|
|
}
|
|
|
|
sub get_nonzero_value {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(dbh db_tbl col col_type);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless $args{$arg};
|
|
}
|
|
my ($dbh, $db_tbl, $col, $col_type) = @args{@required_args};
|
|
my $tries = defined $args{tries} ? $args{tries} : 5;
|
|
my $val = $args{val};
|
|
|
|
# Right now we only need a special check for temporal values.
|
|
# _validate_temporal_value() does double-duty for this. The
|
|
# default anonymous sub handles ints.
|
|
my $is_nonzero = $col_type =~ m/time|date/ ? \&_validate_temporal_value
|
|
: sub { return $_[1]; };
|
|
|
|
if ( !$is_nonzero->($dbh, $val) ) { # quasi-double-negative, sorry
|
|
PTDEBUG && _d('Discarding zero value:', $val);
|
|
my $sql = "SELECT $col FROM $db_tbl "
|
|
. ($args{index_hint} ? "$args{index_hint} " : "")
|
|
. "WHERE $col > ? AND $col IS NOT NULL "
|
|
. ($args{where} ? "AND ($args{where}) " : '')
|
|
. "ORDER BY $col LIMIT 1";
|
|
PTDEBUG && _d($sql);
|
|
my $sth = $dbh->prepare($sql);
|
|
|
|
my $last_val = $val;
|
|
while ( $tries-- ) {
|
|
$sth->execute($last_val);
|
|
my ($next_val) = $sth->fetchrow_array();
|
|
if ( $is_nonzero->($dbh, $next_val) ) {
|
|
PTDEBUG && _d('First non-zero value:', $next_val);
|
|
$sth->finish();
|
|
return $next_val;
|
|
}
|
|
$last_val = $next_val;
|
|
}
|
|
$sth->finish();
|
|
$val = undef; # no non-zero value found
|
|
}
|
|
|
|
return $val;
|
|
}
|
|
|
|
# Sub: base_count
|
|
# Count to any number in any base with the given symbols. E.g. if counting
|
|
# to 10 in base 16 with symbols 0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f the result
|
|
# is "a". This is trivial for stuff like base 16 (hex), but far less trivial
|
|
# for arbitrary bases with arbitrary symbols like base 25 with symbols
|
|
# B,C,D,...X,Y,Z. For that, counting to 10 results in "L". The base and its
|
|
# symbols are determined by the character column. Symbols can be non-ASCII.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Required Arguments:
|
|
# count_to - Number to count to
|
|
# base - Base of special system
|
|
# symbols - Arrayref of symbols for "numbers" in special system
|
|
#
|
|
# Returns:
|
|
# The "number" (symbol) in the special target base system
|
|
sub base_count {
|
|
my ( $self, %args ) = @_;
|
|
my @required_args = qw(count_to base symbols);
|
|
foreach my $arg ( @required_args ) {
|
|
die "I need a $arg argument" unless defined $args{$arg};
|
|
}
|
|
my ($n, $base, $symbols) = @args{@required_args};
|
|
|
|
# Can't take log of zero and the zeroth symbol in any base is the
|
|
# zeroth symbol in any other base.
|
|
return $symbols->[0] if $n == 0;
|
|
|
|
my $highest_power = floor(log($n+0.00001)/log($base));
|
|
if ( $highest_power == 0 ){
|
|
return $symbols->[$n];
|
|
}
|
|
|
|
my @base_powers;
|
|
for my $power ( 0..$highest_power ) {
|
|
push @base_powers, ($base**$power) || 1;
|
|
}
|
|
|
|
my @base_multiples;
|
|
foreach my $base_power ( reverse @base_powers ) {
|
|
my $multiples = floor(($n+0.00001) / $base_power);
|
|
push @base_multiples, $multiples;
|
|
$n -= $multiples * $base_power;
|
|
}
|
|
return join('', map { $symbols->[$_] } @base_multiples);
|
|
}
|
|
|
|
sub _d {
|
|
my ($package, undef, $line) = caller 0;
|
|
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
|
|
map { defined $_ ? $_ : 'undef' }
|
|
@_;
|
|
print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
|
|
}
|
|
|
|
1;
|
|
}
|
|
# ###########################################################################
|
|
# End TableChunker package
|
|
# ###########################################################################
|