Files
percona-toolkit/lib/TableUsage.pm

761 lines
24 KiB
Perl

# This program is copyright 2011 Percona Inc.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# TableUsage package
# ###########################################################################
{
# Package: TableUsage
# TableUsage determines how tables in a query are used.
#
# For best results, queries should be from EXPLAIN EXTENDED so all identifiers
# are fully qualified. Else, some table references may be missed because
# no effort is made to table-qualify unqualified columns.
#
# This package uses both QueryParser and SQLParser. The former is used for
# simple queries, and the latter is used for more complex queries where table
# usage may be hidden in who-knows-which clause of the SQL statement.
package TableUsage;
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant MKDEBUG => $ENV{MKDEBUG} || 0;
use Data::Dumper;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Quotekeys = 0;
# Sub: new
#   Create a TableUsage object.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   QueryParser - <QueryParser> object
#   SQLParser   - <SQLParser> object
#
# Optional Arguments:
#   constant_data_value - Value for constants, default "DUAL".
#
# Returns:
#   TableUsage object.  Dies if a required argument is missing or false.
sub new {
   my ( $class, %args ) = @_;

   for my $required ( qw(QueryParser SQLParser) ) {
      die "I need a $required argument" unless $args{$required};
   }

   # Defaults are listed first so anything the caller passes overrides them.
   my %self = (
      constant_data_value => 'DUAL',
      %args,
   );

   return bless \%self, $class;
}
# Sub: get_table_usage
#   Get table usage for each table in the given query.  The query is parsed
#   with SQLParser first; if SQLParser reports that it cannot parse this type
#   of query, QueryParser is used to extract just the table list.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   query - Query string
#
# Returns:
#   Arrayref of arrayrefs.  Each inner arrayref holds CAT hashrefs
#   (context and table keys), like:
#   (code start)
#   [
#     [
#       { context => 'DELETE',
#         table   => 'd.t',
#       },
#       { context => 'WHERE',
#         table   => 'd.t',
#       },
#     ],
#   ],
#   (code stop)
#   Dies if query is missing, or if SQLParser fails with any error other
#   than "Cannot parse".
sub get_table_usage {
   my ( $self, %args ) = @_;
   die "I need a query argument" unless $args{query};
   my $query = $args{query};
   MKDEBUG && _d('Getting table access for',
      substr($query, 0, 100), (length $query > 100 ? '...' : ''));

   # Try to parse the query first with SQLParser.  This may be overkill for
   # simple queries, but it's probably cheaper to just do this than to try
   # detect first if the query is simple enough to parse with QueryParser.
   my $query_struct;
   eval {
      $query_struct = $self->{SQLParser}->parse($query);
   };
   my $parse_error = $EVAL_ERROR;

   my $cats;  # arrayref of arrayrefs of CAT hashrefs
   if ( $parse_error ) {
      MKDEBUG && _d('Failed to parse query with SQLParser:', $parse_error);

      # Anything other than "Cannot parse" is a real parser failure.
      die $parse_error unless $parse_error =~ m/Cannot parse/;

      # SQLParser can't parse this type of query, so it's probably some
      # data definition statement with just a table list.  Use QueryParser
      # to extract the table list and hope we're not wrong.
      $cats = $self->_get_tables_used_from_query_parser(%args);
   }
   else {
      # SQLParser parsed the query, so examine its structure to determine
      # the CATs for each table.
      $cats = $self->_get_tables_used_from_query_struct(
         query_struct => $query_struct,
         %args,
      );
   }

   MKDEBUG && _d('Query table access:', Dumper($cats));
   return $cats;
}
# Sub: _get_tables_used_from_query_parser
#   Get table usage with QueryParser only.  Every table that QueryParser
#   finds in the query gets the query's first word (upper-cased) as its
#   context; for DROP queries the context is DROP_<thing dropped>, e.g.
#   DROP_TABLE.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   query - Query string
#
# Returns:
#   Arrayref whose first element is an arrayref of
#   { table => ..., context => ... } hashrefs.  The outer arrayref is empty
#   if QueryParser finds no tables.  Dies if the cleaned query does not begin
#   with a word or is a DROP query without an object type.
sub _get_tables_used_from_query_parser {
   my ( $self, %args ) = @_;
   my @required_args = qw(query);
   foreach my $arg ( @required_args ) {
      die "I need a $arg argument" unless $args{$arg};
   }
   my ($query) = @args{@required_args};
   MKDEBUG && _d('Getting tables used from query parser');

   $query = $self->{QueryParser}->clean_query($query);
   my ($query_type) = $query =~ m/^\s*(\w+)\s+/;
   # Check the match before calling uc().  Warnings are FATAL in this package,
   # so uc() on an undefined value would die with an "uninitialized value"
   # error and the message below would never be seen.
   die "Query does not begin with a word" unless $query_type; # shouldn't happen
   $query_type = uc $query_type;

   if ( $query_type eq 'DROP' ) {
      my ($drop_what) = $query =~ m/^\s*DROP\s+(\w+)\s+/i;
      die "Invalid DROP query: $query" unless $drop_what;
      # Don't use a space like "DROP TABLE" because the output of
      # mk-table-usage is space-separated.
      $query_type .= '_' . uc($drop_what);
   }

   my @tables_used;
   foreach my $table ( $self->{QueryParser}->get_tables($query) ) {
      $table =~ s/`//g;  # strip backtick quoting from the table name
      push @{$tables_used[0]}, {
         table   => $table,
         context => $query_type,
      };
   }

   return \@tables_used;
}
# Sub: _get_tables_used_from_query_struct
# Determine table usage from a parsed query struct: either the struct that
# get_table_usage gets from SQLParser, or the select sub-struct of an
# INSERT/REPLACE ... SELECT (this sub calls itself for that case).
#
# Parameters:
# %args - Arguments; they are also passed on to the helper subs.
#
# Required Arguments:
# query_struct - Parsed query hashref. Keys read here: type, the table
# references (from, into or tables, depending on the type), where, set,
# columns and select.
#
# Returns:
# Arrayref of arrayrefs of { context => ..., table => ... } hashrefs.
# A multi-table UPDATE gets one inner arrayref per entry returned by
# _get_tables_used_in_set; for every other query all usages are pushed
# onto the first inner arrayref. Dies if the query type matches none of
# SELECT, DELETE, INSERT, REPLACE or UPDATE.
sub _get_tables_used_from_query_struct {
my ( $self, %args ) = @_;
my @required_args = qw(query_struct);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
}
my ($query_struct) = @args{@required_args};
# NOTE(review): $sp is not used anywhere else in this sub.
my $sp = $self->{SQLParser};
MKDEBUG && _d('Getting table used from query struct');
# The table references clause is different depending on the query type.
my $query_type = uc $query_struct->{type};
my $tbl_refs = $query_type =~ m/(?:SELECT|DELETE)/ ? 'from'
: $query_type =~ m/(?:INSERT|REPLACE)/ ? 'into'
: $query_type =~ m/UPDATE/ ? 'tables'
: die "Cannot find table references for $query_type queries";
my $tables = $query_struct->{$tbl_refs};
# No table references at all: report the constant data value as the only
# "table" used, with the query type as its context.
if ( !$tables || @$tables == 0 ) {
MKDEBUG && _d("Query does not use any tables");
return [
[ { context => $query_type, table => $self->{constant_data_value} } ]
];
}
# Get tables used in the query's WHERE clause, if it has one.
my $where;
if ( $query_struct->{where} ) {
$where = $self->_get_tables_used_in_where(
%args,
tables => $tables,
where => $query_struct->{where},
);
}
my @tables_used;
if ( $query_type eq 'UPDATE' && @{$query_struct->{tables}} > 1 ) {
MKDEBUG && _d("Multi-table UPDATE");
# UPDATE queries with multiple tables are a special case. The query
# reads from each referenced table and writes only to tables referenced
# in the SET clause. Each written table is like its own query, so
# we create a table usage hashref for each one.
# Every table in the table list is recorded with JOIN context.
my @join_tables;
foreach my $table ( @$tables ) {
my $table = $self->_qualify_table_name(
%args,
tables => $tables,
db => $table->{db},
tbl => $table->{tbl},
);
my $table_usage = {
context => 'JOIN',
table => $table,
};
MKDEBUG && _d("Table usage from TLIST:", Dumper($table_usage));
push @join_tables, $table_usage;
}
# Tables joined implicitly in the WHERE clause are added to the same list,
# but with the query type (UPDATE in this branch) as their context.
if ( $where && $where->{joined_tables} ) {
foreach my $table ( @{$where->{joined_tables}} ) {
my $table_usage = {
context => $query_type,
table => $table,
};
MKDEBUG && _d("Table usage from WHERE (implicit join):",
Dumper($table_usage));
push @join_tables, $table_usage;
}
}
my @where_tables;
if ( $where && $where->{filter_tables} ) {
foreach my $table ( @{$where->{filter_tables}} ) {
my $table_usage = {
context => 'WHERE',
table => $table,
};
MKDEBUG && _d("Table usage from WHERE:", Dumper($table_usage));
push @where_tables, $table_usage;
}
}
my $set_tables = $self->_get_tables_used_in_set(
%args,
tables => $tables,
set => $query_struct->{set},
);
# One usage group per SET entry: the written table, the source of the
# written value, then the JOIN and WHERE usages collected above (the same
# hashrefs are shared by every group).
foreach my $table ( @$set_tables ) {
my @table_usage = (
{ # the written table
context => 'UPDATE',
table => $table->{table},
},
{ # source of data written to the written table
context => 'SELECT',
table => $table->{value},
},
);
MKDEBUG && _d("Table usage from UPDATE SET:", Dumper(\@table_usage));
push @tables_used, [
@table_usage,
@join_tables,
@where_tables,
];
}
} # multi-table UPDATE
else {
# Only data in tables referenced in the column list are returned
# to the user. So a table can appear in the tlist (e.g. after FROM)
# but that doesn't mean data from the table is returned to the user;
# the table could be used purely for JOIN or WHERE.
if ( $query_type eq 'SELECT' ) {
my $clist_tables = $self->_get_tables_used_in_columns(
%args,
tables => $tables,
columns => $query_struct->{columns},
);
foreach my $table ( @$clist_tables ) {
my $table_usage = {
context => 'SELECT',
table => $table,
};
MKDEBUG && _d("Table usage from CLIST:", Dumper($table_usage));
push @{$tables_used[0]}, $table_usage;
}
}
# The table list is skipped only for a single-table SELECT, whose table
# was already reported from the column list above. With several tables
# the default context is TLIST, otherwise it is the query type.
if ( @$tables > 1 || $query_type ne 'SELECT' ) {
my $default_context = @$tables > 1 ? 'TLIST' : $query_type;
foreach my $table ( @$tables ) {
my $qualified_table = $self->_qualify_table_name(
%args,
tables => $tables,
db => $table->{db},
tbl => $table->{tbl},
);
my $context = $default_context;
# A table with a join condition gets JOIN context, and the usages already
# recorded for the tables it joins to are switched from TLIST to JOIN.
if ( $table->{join} && $table->{join}->{condition} ) {
$context = 'JOIN';
if ( $table->{join}->{condition} eq 'using' ) {
MKDEBUG && _d("Table joined with USING condition");
my $joined_table = $self->_qualify_table_name(
%args,
tables => $tables,
tbl => $table->{join}->{to},
);
$self->_change_context(
tables => $tables,
table => $joined_table,
tables_used => $tables_used[0],
old_context => 'TLIST',
new_context => 'JOIN',
);
}
elsif ( $table->{join}->{condition} eq 'on' ) {
MKDEBUG && _d("Table joined with ON condition");
# The ON condition is analyzed like a WHERE clause; only its joined
# tables are used here.
my $on_tables = $self->_get_tables_used_in_where(
%args,
tables => $tables,
where => $table->{join}->{where},
clause => 'JOIN condition', # just for debugging
);
MKDEBUG && _d("JOIN ON tables:", Dumper($on_tables));
foreach my $joined_table ( @{$on_tables->{joined_tables}} ) {
$self->_change_context(
tables => $tables,
table => $joined_table,
tables_used => $tables_used[0],
old_context => 'TLIST',
new_context => 'JOIN',
);
}
}
else {
warn "Unknown JOIN condition: $table->{join}->{condition}";
}
}
my $table_usage = {
context => $context,
table => $qualified_table,
};
MKDEBUG && _d("Table usage from TLIST:", Dumper($table_usage));
push @{$tables_used[0]}, $table_usage;
}
}
# Tables joined implicitly in the WHERE clause: switch their recorded
# TLIST usage to JOIN.
if ( $where && $where->{joined_tables} ) {
foreach my $joined_table ( @{$where->{joined_tables}} ) {
MKDEBUG && _d("Table joined implicitly in WHERE:", $joined_table);
$self->_change_context(
tables => $tables,
table => $joined_table,
tables_used => $tables_used[0],
old_context => 'TLIST',
new_context => 'JOIN',
);
}
}
# INSERT/REPLACE: the written data comes either from the tables of the
# SELECT part (found by calling this sub on the select struct) or from
# constant data.
if ( $query_type =~ m/(?:INSERT|REPLACE)/ ) {
if ( $query_struct->{select} ) {
MKDEBUG && _d("Getting tables used in INSERT-SELECT");
my $select_tables = $self->_get_tables_used_from_query_struct(
%args,
query_struct => $query_struct->{select},
);
push @{$tables_used[0]}, @{$select_tables->[0]};
}
else {
my $table_usage = {
context => 'SELECT',
table => $self->{constant_data_value},
};
MKDEBUG && _d("Table usage from SET/VALUES:", Dumper($table_usage));
push @{$tables_used[0]}, $table_usage;
}
}
elsif ( $query_type eq 'UPDATE' ) {
my $set_tables = $self->_get_tables_used_in_set(
%args,
tables => $tables,
set => $query_struct->{set},
);
# NOTE(review): _get_tables_used_in_set only sets value_is_table when
# there are several tables, and this branch is reached with one table,
# so this presumably always yields constant_data_value -- confirm.
foreach my $table ( @$set_tables ) {
my $table_usage = {
context => 'SELECT',
table => $table->{value_is_table} ? $table->{table}
: $self->{constant_data_value},
};
MKDEBUG && _d("Table usage from SET:", Dumper($table_usage));
push @{$tables_used[0]}, $table_usage;
}
}
if ( $where && $where->{filter_tables} ) {
foreach my $table ( @{$where->{filter_tables}} ) {
my $table_usage = {
context => 'WHERE',
table => $table,
};
MKDEBUG && _d("Table usage from WHERE:", Dumper($table_usage));
push @{$tables_used[0]}, $table_usage;
}
}
}
return \@tables_used;
}
# Sub: _get_tables_used_in_columns
#   Get the tables whose columns are referenced in a SELECT column list.
#
# Parameters:
#   %args - Arguments; also passed on to _qualify_table_name.
#
# Required Arguments:
#   tables  - Arrayref of table hashrefs from the query (db, tbl keys read)
#   columns - Arrayref of column hashrefs (db, tbl, col keys read)
#
# Returns:
#   Arrayref of table names as returned by _qualify_table_name.
sub _get_tables_used_in_columns {
   my ( $self, %args ) = @_;
   foreach my $required ( qw(tables columns) ) {
      die "I need a $required argument" unless $args{$required};
   }
   my $tables  = $args{tables};
   my $columns = $args{columns};
   MKDEBUG && _d("Getting tables used in CLIST");

   # SELECT a, b FROM t WHERE ... -- one table so cols a and b must
   # be from that table.
   if ( @$tables == 1 ) {
      MKDEBUG && _d("Single table SELECT:", $tables->[0]->{tbl});
      my $only_table = $self->_qualify_table_name(
         %args,
         db  => $tables->[0]->{db},
         tbl => $tables->[0]->{tbl},
      );
      return [ $only_table ];
   }

   if ( @$columns == 1 && $columns->[0]->{col} eq '*' ) {
      my $star = $columns->[0];

      # SELECT t1.* FROM ... -- selecting only from table t1
      if ( $star->{tbl} ) {
         MKDEBUG && _d("SELECT all columns from one table");
         my $star_table = $self->_qualify_table_name(
            %args,
            db  => $star->{db},
            tbl => $star->{tbl},
         );
         return [ $star_table ];
      }

      # SELECT * FROM ... -- selecting from all tables
      MKDEBUG && _d("SELECT all columns from all tables");
      my @all_tables;
      foreach my $tbl_info ( @$tables ) {
         my $qualified = $self->_qualify_table_name(
            %args,
            tables => $tables,
            db     => $tbl_info->{db},
            tbl    => $tbl_info->{tbl},
         );
         push @all_tables, $qualified;
      }
      return \@all_tables;
   }

   # SELECT x, y FROM t1, t2 -- have to determine from which table each
   # column is.  Columns without a table part are skipped, and each table
   # is listed only once.
   MKDEBUG && _d(scalar @$tables, "table SELECT");
   my @column_tables;
   my %already_listed;
   COLUMN:
   foreach my $column ( @$columns ) {
      next COLUMN unless $column->{tbl};
      my $qualified = $self->_qualify_table_name(
         %args,
         db  => $column->{db},
         tbl => $column->{tbl},
      );
      next COLUMN unless $qualified;
      next COLUMN if $already_listed{$qualified}++;
      push @column_tables, $qualified;
   }
   return \@column_tables;
}
# Sub: _get_tables_used_in_where
#   Classify the tables referenced by a list of conditions (a WHERE clause
#   or a JOIN ... ON clause) as tables that are filtered or tables that are
#   joined.
#
# Parameters:
#   %args - Arguments; also passed on to _qualify_table_name.
#
# Required Arguments:
#   tables - Arrayref of table hashrefs from the query (db, tbl keys read)
#   where  - Arrayref of condition hashrefs (left_arg, right_arg keys read)
#
# Optional Arguments:
#   clause - Clause name, used only in debug output (default "WHERE")
#
# Returns:
#   Hashref with filter_tables and joined_tables, each a sorted arrayref
#   of table names.
sub _get_tables_used_in_where {
   my ( $self, %args ) = @_;
   foreach my $required ( qw(tables where) ) {
      die "I need a $required argument" unless $args{$required};
   }
   my $tables = $args{tables};
   my $where  = $args{where};
   my $parser = $self->{SQLParser};
   MKDEBUG && _d("Getting tables used in", $args{clause} || 'WHERE');

   my %filtered;
   my %joined;

   CONDITION:
   foreach my $cond ( @$where ) {
      MKDEBUG && _d("Condition:", Dumper($cond));
      my @cond_tables;      # tables used in this condition
      my $value_count  = 0; # operands that are plain values
      my $has_constant = 0; # an operand is missing (undefined)
      my $has_unknown  = 0; # a column whose table cannot be determined

      SIDE:
      foreach my $side ( qw(left_arg right_arg) ) {
         my $operand = $cond->{$side};

         if ( !defined $operand ) {
            MKDEBUG && _d($side, "is a constant value");
            $has_constant = 1;
            next SIDE;
         }

         if ( !$parser->is_identifier($operand) ) {
            MKDEBUG && _d($side, "is a value");
            $value_count++;
            next SIDE;
         }

         MKDEBUG && _d($side, "is an identifier");
         my $ident = $parser->parse_identifier('column', $operand);

         if ( !$ident->{tbl} ) {
            if ( @$tables != 1 ) {
               MKDEBUG && _d("Condition column is not table-qualified and",
                  "query has multiple tables; cannot determine its table");
               my $is_function = $operand =~ m/\w+\(/;
               my $is_number   = $operand =~ m/^[\d.]+$/;
               $has_unknown = 1 unless $is_function || $is_number;
               next SIDE;
            }
            MKDEBUG && _d("Condition column is not table-qualified; ",
               "using query's only table:", $tables->[0]->{tbl});
            $ident->{tbl} = $tables->[0]->{tbl};
         }

         if ( !$ident->{db} && @$tables == 1 && $tables->[0]->{db} ) {
            MKDEBUG && _d("Condition column is not database-qualified; ",
               "using its table's database:", $tables->[0]->{db});
            $ident->{db} = $tables->[0]->{db};
         }

         my $qualified = $self->_qualify_table_name(
            %args,
            %$ident,
         );
         push @cond_tables, $qualified if $qualified;
      } # SIDE

      if ( $has_constant || $value_count == 2 ) {
         MKDEBUG && _d("Condition is a constant or two values");
         $filtered{$self->{constant_data_value}} = undef;
      }
      elsif ( @cond_tables == 1 && $has_unknown ) {
         MKDEBUG && _d("Condition joins table",
            $cond_tables[0], "to column from unknown table");
         $joined{$cond_tables[0]} = undef;
      }
      elsif ( @cond_tables == 1 ) {
         MKDEBUG && _d("Condition filters table", $cond_tables[0]);
         $filtered{$cond_tables[0]} = undef;
      }
      elsif ( @cond_tables == 2 ) {
         MKDEBUG && _d("Condition joins tables",
            $cond_tables[0], "and", $cond_tables[1]);
         $joined{$cond_tables[0]} = undef;
         $joined{$cond_tables[1]} = undef;
      }
   } # CONDITION

   # NOTE: the sort is not necessary, it's done so test can be deterministic.
   my @filter_tables = sort keys %filtered;
   my @joined_tables = sort keys %joined;
   return {
      filter_tables => \@filter_tables,
      joined_tables => \@joined_tables,
   };
}
# Sub: _get_tables_used_in_set
#   Get the tables written by the SET clause of an UPDATE and the source of
#   the value written to each.
#
# Parameters:
#   %args - Arguments; also passed on to _qualify_table_name.
#
# Required Arguments:
#   tables - Arrayref of table hashrefs from the query (db, tbl keys read)
#   set    - Arrayref of SET condition hashrefs (db, tbl, value keys read)
#
# Returns:
#   Arrayref of hashrefs.  With a single table there is exactly one entry:
#   { table => <the table>, value => <constant_data_value> }.  With several
#   tables there is one entry per SET condition that names a table:
#   { table => ..., value => ..., value_is_table => 0|1 }, where value is
#   the table the assigned value comes from, or constant_data_value.
sub _get_tables_used_in_set {
   my ( $self, %args ) = @_;
   my @required_args = qw(tables set);
   foreach my $arg ( @required_args ) {
      die "I need a $arg argument" unless $args{$arg};
   }
   my ($tables, $set) = @args{@required_args};
   my $sql_parser = $self->{SQLParser};
   MKDEBUG && _d("Getting tables used in SET");

   my @tables;
   if ( @$tables == 1 ) {
      # Only one table, so it must be the table being written.
      my $table = $self->_qualify_table_name(
         %args,
         db  => $tables->[0]->{db},
         tbl => $tables->[0]->{tbl},
      );
      $tables[0] = {
         table => $table,
         value => $self->{constant_data_value}
      };
   }
   else {
      foreach my $cond ( @$set ) {
         # Without a table part we cannot tell which table is written.
         next unless $cond->{tbl};
         my $table = $self->_qualify_table_name(
            %args,
            db  => $cond->{db},
            tbl => $cond->{tbl},
         );

         my $value          = $self->{constant_data_value};
         my $value_is_table = 0;
         if ( $sql_parser->is_identifier($cond->{value}) ) {
            my $ident_struct = $sql_parser->parse_identifier(
               'column',
               $cond->{value},
            );
            # Only an identifier with a table part names a source table.
            # _qualify_table_name dies without a tbl argument, so an
            # identifier lacking one is treated as constant data instead
            # of aborting the whole query.
            if ( $ident_struct->{tbl} ) {
               $value_is_table = 1;
               $value          = $self->_qualify_table_name(
                  %args,
                  db  => $ident_struct->{db},
                  tbl => $ident_struct->{tbl},
               );
            }
            else {
               MKDEBUG && _d("SET value", $cond->{value},
                  "is not table-qualified; treating it as constant data");
            }
         }

         push @tables, {
            table          => $table,
            value          => $value,
            value_is_table => $value_is_table,
         };
      }
   }

   return \@tables;
}
# Sub: _get_real_table_name
#   Resolve a table name or alias to the real name of a table referenced
#   by the query.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   tables - Arrayref of table hashrefs from the query (tbl, alias keys read)
#   name   - Table name or alias to look up
#
# Returns:
#   The tbl value of the first table whose tbl or alias equals name, or
#   nothing if no table matches.
sub _get_real_table_name {
   my ( $self, %args ) = @_;
   foreach my $required ( qw(tables name) ) {
      die "I need a $required argument" unless $args{$required};
   }
   my ( $tables, $name ) = @args{qw(tables name)};

   TABLE:
   foreach my $candidate ( @$tables ) {
      next TABLE unless $candidate->{tbl} eq $name
                     || ($candidate->{alias} || "") eq $name;
      MKDEBUG && _d("Real table name for", $name, "is", $candidate->{tbl});
      return $candidate->{tbl};
   }

   # The named thing isn't referenced as a table by the query, so it's
   # probably a function or something else.
   MKDEBUG && _d("Table", $name, "does not exist in query");
   return;
}
# Sub: _qualify_table_name
#   Turn a table name or alias into its real table name, prefixed with a
#   database name when one can be determined.
#
# Parameters:
#   %args - Arguments; also passed on to _get_real_table_name.
#
# Required Arguments:
#   tables - Arrayref of table hashrefs from the query (db, tbl keys read)
#   tbl    - Table name or alias, optionally already written as db.tbl
#
# Optional Arguments:
#   db         - Database to use if tbl is not already db-qualified
#   default_db - Database to use as a last resort
#
# Returns:
#   "db.tbl", or just "tbl" if no database can be determined.  Returns
#   nothing if tbl is not a table or alias referenced by the query.
sub _qualify_table_name {
   my ( $self, %args ) = @_;
   foreach my $required ( qw(tables tbl) ) {
      die "I need a $required argument" unless $args{$required};
   }
   my $tables = $args{tables};
   my $name   = $args{tbl};
   MKDEBUG && _d("Qualifying table with database:", $name);

   # The last dot-separated part is the table, the one before it the db.
   my @name_parts = split /[.]/, $name;
   my $given_tbl  = $name_parts[-1];
   my $given_db   = $name_parts[-2];

   # Always use real table names, not alias.
   my $real_tbl = $self->_get_real_table_name(%args, name => $given_tbl);
   return unless $real_tbl;  # shouldn't happen

   # Order of preference: db embedded in the name, db argument, db of the
   # same table in the query's table list, default db.
   my $table_db = $given_db || $args{db};
   if ( !$table_db ) {
      TABLE_INFO:
      foreach my $tbl_info ( @$tables ) {
         next TABLE_INFO
            unless ($tbl_info->{tbl} eq $real_tbl) && $tbl_info->{db};
         $table_db = $tbl_info->{db};
         last TABLE_INFO;
      }
      $table_db ||= $args{default_db};
      # Can't db-qualify the table, so return just the real table name.
      MKDEBUG && !$table_db
         && _d("Cannot determine database for table", $real_tbl);
   }

   my $db_tbl = $table_db ? "$table_db.$real_tbl" : $real_tbl;
   MKDEBUG && _d("Table qualified with database:", $db_tbl);
   return $db_tbl;
}
# Sub: _change_context
#   Change the context of the first recorded usage of a table that has the
#   given old context.  The usage hashref is modified in place; only the
#   first match is changed.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   tables_used - Arrayref of { context => ..., table => ... } hashrefs
#   table       - Table name to look for
#   old_context - Context the usage must currently have
#   new_context - Context to set
#   tables      - Must be given, but is not read by this sub
#
# Returns:
#   Nothing
sub _change_context {
   my ( $self, %args ) = @_;
   foreach my $required ( qw(tables_used table old_context new_context tables) ) {
      die "I need a $required argument" unless $args{$required};
   }
   my ( $tables_used, $table, $old_context, $new_context )
      = @args{qw(tables_used table old_context new_context)};
   MKDEBUG && _d("Change context of table", $table, "from", $old_context,
      "to", $new_context);

   USAGE:
   foreach my $usage ( @$tables_used ) {
      next USAGE if $usage->{table} ne $table;
      next USAGE if $usage->{context} ne $old_context;
      $usage->{context} = $new_context;
      return;
   }

   MKDEBUG && _d("Table", $table, "is not used; cannot set its context");
   return;
}
# Sub: _d
#   Print a debug line to STDERR, prefixed with "# ", the calling package,
#   the calling line number and the process ID.  Undefined arguments are
#   printed as "undef" and embedded newlines are followed by "# " so that
#   every output line stays commented.
#
# Parameters:
#   @_ - Values to print, joined with single spaces
sub _d {
   my ($package, undef, $line) = caller 0;
   my @parts;
   foreach my $item ( @_ ) {
      # Work on a copy so the caller's values are never modified.
      my $text = defined $item ? $item : 'undef';
      $text =~ s/\n/\n# /g;
      push @parts, $text;
   }
   print STDERR "# $package:$line $PID ", join(' ', @parts), "\n";
}
1;
}
# ###########################################################################
# End TableUsage package
# ###########################################################################