Files
percona-toolkit/lib/DuplicateKeyFinder.pm
Sveta Smirnova 94bcfd2b69 PT-2340 - Support MySQL 8.4
- Updated documentation
2024-09-03 20:05:26 +03:00

552 lines
22 KiB
Perl

# This program is copyright 2009-2011 Percona Ireland Ltd.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# DuplicateKeyFinder package
# ###########################################################################
{
# Package: DuplicateKeyFinder
# DuplicateKeyFinder finds duplicate indexes (keys).
package DuplicateKeyFinder;
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
sub new {
my ( $class, %args ) = @_;
my $self = {};
return bless $self, $class;
}
# %args should contain:
#
# * keys (req) A hashref from TableParser::get_keys().
# * clustered_key The clustered key, if any; also from get_keys().
# * tbl_info { db, tbl, engine, ddl } hashref.
# * callback An anonymous subroutine, called for each dupe found.
# * ignore_order Order never matters for any type of index (generally
# order matters except for FULLTEXT).
# * ignore_structure Compare different index types as if they're the same.
# * clustered Perform duplication checks against the clustered key.
#
# Returns an arrayref of duplicate key hashrefs. Each contains
#
# * key The name of the index that's a duplicate.
# * cols The columns in that key (arrayref).
# * duplicate_of The name of the index it duplicates.
# * duplicate_of_cols The columns of the index it duplicates.
# * reason A human-readable description of why this is a duplicate.
# * dupe_type Either exact, prefix, fk, or clustered.
#
sub get_duplicate_keys {
my ( $self, $keys, %args ) = @_;
die "I need a keys argument" unless $keys;
my %keys = %$keys; # Copy keys because we remove non-duplicates.
my $primary_key;
my @unique_keys;
my @normal_keys;
my @fulltext_keys;
my @dupes;
KEY:
foreach my $keyname ( reverse sort keys %keys ) {
my $key = $keys{$keyname};
# Save real columns before we potentially re-order them. These are
# columns we want to print if the key is a duplicate.
$key->{real_cols} = [ @{$key->{cols}} ];
# We use column lengths to compare keys.
$key->{len_cols} = length $key->{colnames};
# The primary key is treated specially. It is effectively never a
# duplicate, so it is never removed. It is compared to all other
# keys, and in any case of duplication, the primary is always kept
# and the other key removed. Usually the primary is the actual
# PRIMARY KEY, but for an InnoDB table without a PRIMARY KEY, the
# effective primary key is the clustered key.
if ( $key->{name} eq 'PRIMARY'
|| ($args{clustered_key} && $key->{name} eq $args{clustered_key}) ) {
$primary_key = $key;
PTDEBUG && _d('primary key:', $key->{name});
next KEY;
}
# Key column order matters for all keys except FULLTEXT, so unless
# ignore_order is specified we only sort FULLTEXT keys.
my $is_fulltext = $key->{type} eq 'FULLTEXT' ? 1 : 0;
if ( $args{ignore_order} || $is_fulltext ) {
my $ordered_cols = join(',', sort(split(/,/, $key->{colnames})));
PTDEBUG && _d('Reordered', $key->{name}, 'cols from',
$key->{colnames}, 'to', $ordered_cols);
$key->{colnames} = $ordered_cols;
}
# Unless ignore_structure is specified, only keys of the same
# structure (btree, fulltext, etc.) are compared to one another.
# UNIQUE keys are kept separate to make comparisons easier.
my $push_to = $key->{is_unique} ? \@unique_keys : \@normal_keys;
if ( !$args{ignore_structure} ) {
$push_to = \@fulltext_keys if $is_fulltext;
# TODO:
# $push_to = \@hash_keys if $is_hash;
# $push_to = \@spatial_keys if $is_spatial;
}
push @$push_to, $key;
}
# Redundantly constrained unique keys are treated as normal keys.
push @normal_keys, $self->unconstrain_keys($primary_key, \@unique_keys);
# Do not check the primary key against uniques before unconstraining
# redundantly unique keys. In cases like
# PRIMARY KEY (a, b)
# UNIQUE KEY (a)
# the unique key will be wrongly removed. It is needed to keep
# column a unique. The process of unconstraining redundantly unique
# keys marks single column unique keys so that they are never removed
# (the mark is adding unique_col=>1 to the unique key's hash).
if ( $primary_key ) {
PTDEBUG && _d('Comparing PRIMARY KEY to UNIQUE keys');
push @dupes,
$self->remove_prefix_duplicates([$primary_key], \@unique_keys, %args);
PTDEBUG && _d('Comparing PRIMARY KEY to normal keys');
push @dupes,
$self->remove_prefix_duplicates([$primary_key], \@normal_keys, %args);
}
PTDEBUG && _d('Comparing UNIQUE keys to normal keys');
push @dupes,
$self->remove_prefix_duplicates(\@unique_keys, \@normal_keys, %args);
PTDEBUG && _d('Comparing normal keys');
push @dupes,
$self->remove_prefix_duplicates(\@normal_keys, \@normal_keys, %args);
# If --allstruct, then these special struct keys (FULLTEXT, HASH, etc.)
# will have already been put in and handled by @normal_keys.
PTDEBUG && _d('Comparing FULLTEXT keys');
push @dupes,
$self->remove_prefix_duplicates(\@fulltext_keys, \@fulltext_keys, %args, exact_duplicates => 1);
# Remove clustered duplicates.
my $clustered_key = $args{clustered_key} ? $keys{$args{clustered_key}}
: undef;
PTDEBUG && _d('clustered key:',
$clustered_key ? ($clustered_key->{name}, $clustered_key->{colnames})
: 'none');
if ( $clustered_key
&& $args{clustered}
&& $args{tbl_info}->{engine}
&& ( $args{tbl_info}->{engine} =~ m/InnoDB/i
|| $args{tbl_info}->{engine} =~ m/TokuDB/i
|| $args{tbl_info}->{engine} =~ m/RocksDB/i )
)
{
PTDEBUG && _d('Removing UNIQUE dupes of clustered key');
push @dupes,
$self->remove_clustered_duplicates($clustered_key, \@unique_keys, %args);
PTDEBUG && _d('Removing ordinary dupes of clustered key');
push @dupes,
$self->remove_clustered_duplicates($clustered_key, \@normal_keys, %args);
}
return \@dupes;
}
sub get_duplicate_fks {
my ( $self, $fks, %args ) = @_;
die "I need a fks argument" unless $fks;
my @fks = ();
foreach my $key ( sort keys %$fks ) {
push @fks, $fks->{$key};
}
my @dupes;
foreach my $i ( 0..$#fks - 1 ) {
next unless $fks[$i];
foreach my $j ( $i+1..$#fks ) {
next unless $fks[$j];
# A foreign key is a duplicate no matter what order the
# columns are in, so re-order them alphabetically so they
# can be compared.
my $i_cols = join(',', sort @{$fks[$i]->{cols}} );
my $j_cols = join(',', sort @{$fks[$j]->{cols}} );
my $i_pcols = join(',', sort @{$fks[$i]->{parent_cols}} );
my $j_pcols = join(',', sort @{$fks[$j]->{parent_cols}} );
if ( $fks[$i]->{parent_tblname} eq $fks[$j]->{parent_tblname}
&& $i_cols eq $j_cols
&& $i_pcols eq $j_pcols ) {
my $dupe = {
key => $fks[$j]->{name},
cols => [ @{$fks[$j]->{cols}} ],
ddl => $fks[$j]->{ddl},
duplicate_of => $fks[$i]->{name},
duplicate_of_cols => [ @{$fks[$i]->{cols}} ],
duplicate_of_ddl => $fks[$i]->{ddl},
reason =>
"FOREIGN KEY $fks[$j]->{name} ($fks[$j]->{colnames}) "
. "REFERENCES $fks[$j]->{parent_tblname} "
. "($fks[$j]->{parent_colnames}) "
. 'is a duplicate of '
. "FOREIGN KEY $fks[$i]->{name} ($fks[$i]->{colnames}) "
. "REFERENCES $fks[$i]->{parent_tblname} "
."($fks[$i]->{parent_colnames})",
dupe_type => 'fk',
};
push @dupes, $dupe;
delete $fks[$j];
$args{callback}->($dupe, %args) if $args{callback};
}
}
}
return \@dupes;
}
# Removes and returns prefix duplicate keys from right_keys.
# Both left_keys and right_keys are arrayrefs.
#
# Prefix duplicates are the typical type of duplicate like:
# KEY x (a)
# KEY y (a, b)
# Key x is a prefix duplicate of key y. This also covers exact
# duplicates like:
# KEY y (a, b)
# KEY z (a, b)
# Key y and z are exact duplicates.
#
# Usually two separate lists of keys are compared: the left and right
# keys. When a duplicate is found, the Left key is Left alone and the
# Right key is Removed. This is done because some keys are more important
# than others. For example, the PRIMARY KEY is always a left key because
# it is never removed. When comparing UNIQUE keys to normal (non-unique)
# keys, the UNIQUE keys are Left (alone) and any duplicating normal
# keys are Removed.
#
# A list of keys can be compared to itself in which case left and right
# keys reference the same list but this sub doesn't know that so it just
# removes dupes from the left as usual.
#
# Optional args are:
# * exact_duplicates Keys are dupes only if they're exact duplicates
# * callback Sub called for each dupe found
#
# For a full technical explanation of how/why this sub works, read:
# http://code.google.com/p/maatkit/wiki/DeterminingDuplicateKeys
sub remove_prefix_duplicates {
my ( $self, $left_keys, $right_keys, %args ) = @_;
my @dupes;
my $right_offset;
my $last_left_key;
my $last_right_key = scalar(@$right_keys) - 1;
# We use "scalar(@$arrayref) - 1" because the $# syntax is not
# reliable with arrayrefs across Perl versions. And we use index
# into the arrays because we delete elements.
if ( $right_keys != $left_keys ) {
# Right and left keys are different lists.
@$left_keys = sort { lc($a->{colnames}) cmp lc($b->{colnames}) }
grep { defined $_; }
@$left_keys;
@$right_keys = sort { lc($a->{colnames}) cmp lc($b->{colnames}) }
grep { defined $_; }
@$right_keys;
# Last left key is its very last key.
$last_left_key = scalar(@$left_keys) - 1;
# No need to offset where we begin looping through the right keys.
$right_offset = 0;
}
else {
# Right and left keys are the same list.
@$left_keys = reverse sort { lc($a->{colnames}) cmp lc($b->{colnames}) }
grep { defined $_; }
@$left_keys;
# Last left key is its second-to-last key.
# The very last left key will be used as a right key.
$last_left_key = scalar(@$left_keys) - 2;
# Since we're looping through the same list in two different
# positions, we must offset where we begin in the right keys
# so that we stay ahead of where we are in the left keys.
$right_offset = 1;
}
LEFT_KEY:
foreach my $left_index ( 0..$last_left_key ) {
next LEFT_KEY unless defined $left_keys->[$left_index];
RIGHT_KEY:
foreach my $right_index ( $left_index+$right_offset..$last_right_key ) {
next RIGHT_KEY unless defined $right_keys->[$right_index];
my $left_name = $left_keys->[$left_index]->{name};
my $left_cols = $left_keys->[$left_index]->{colnames};
my $left_len_cols = $left_keys->[$left_index]->{len_cols};
my $right_name = $right_keys->[$right_index]->{name};
my $right_cols = $right_keys->[$right_index]->{colnames};
my $right_len_cols = $right_keys->[$right_index]->{len_cols};
PTDEBUG && _d('Comparing left', $left_name, '(',$left_cols,')',
'to right', $right_name, '(',$right_cols,')');
# Compare the whole right key to the left key, not just
# the their common minimum length prefix. This is correct.
# Read http://code.google.com/p/maatkit/wiki/DeterminingDuplicateKeys.
if ( substr($left_cols, 0, $right_len_cols)
eq substr($right_cols, 0, $right_len_cols) ) {
# UNIQUE and FULLTEXT indexes are only duplicates if they
# are exact duplicates.
if ( $args{exact_duplicates} && ($right_len_cols<$left_len_cols) ) {
PTDEBUG && _d($right_name, 'not exact duplicate of', $left_name);
next RIGHT_KEY;
}
# Do not remove the unique key that is constraining a single
# column to uniqueness. This prevents UNIQUE KEY (a) from being
# removed by PRIMARY KEY (a, b).
if ( exists $right_keys->[$right_index]->{unique_col} ) {
PTDEBUG && _d('Cannot remove', $right_name,
'because is constrains col',
$right_keys->[$right_index]->{cols}->[0]);
next RIGHT_KEY;
}
PTDEBUG && _d('Remove', $right_name);
my $reason;
if ( my $type = $right_keys->[$right_index]->{unconstrained} ) {
$reason .= "Uniqueness of $right_name ignored because "
. $right_keys->[$right_index]->{constraining_key}->{name}
. " is a $type constraint\n";
}
my $exact_dupe = $right_len_cols < $left_len_cols ? 0 : 1;
$reason .= $right_name
. ($exact_dupe ? ' is a duplicate of '
: ' is a left-prefix of ')
. $left_name;
my $dupe = {
key => $right_name,
cols => $right_keys->[$right_index]->{real_cols},
ddl => $right_keys->[$right_index]->{ddl},
duplicate_of => $left_name,
duplicate_of_cols => $left_keys->[$left_index]->{real_cols},
duplicate_of_ddl => $left_keys->[$left_index]->{ddl},
reason => $reason,
dupe_type => $exact_dupe ? 'exact' : 'prefix',
};
push @dupes, $dupe;
delete $right_keys->[$right_index];
$args{callback}->($dupe, %args) if $args{callback};
}
else {
PTDEBUG && _d($right_name, 'not left-prefix of', $left_name);
next RIGHT_KEY;
}
} # RIGHT_KEY
} # LEFT_KEY
PTDEBUG && _d('No more keys');
# Cleanup the lists: remove removed keys.
@$left_keys = grep { defined $_; } @$left_keys;
@$right_keys = grep { defined $_; } @$right_keys;
return @dupes;
}
# Removes and returns clustered duplicate keys from keys.
# ck (clustered key) is hashref and keys is an arrayref.
#
# For engines with a clustered index, if a key ends with a prefix
# of the primary key, it's a duplicate. Example:
# PRIMARY KEY (a)
# KEY foo (b, a)
# Key foo is redundant to PRIMARY.
#
# Optional args are:
# * callback Sub called for each dupe found
#
sub remove_clustered_duplicates {
my ( $self, $ck, $keys, %args ) = @_;
die "I need a ck argument" unless $ck;
die "I need a keys argument" unless $keys;
my $ck_cols = $ck->{colnames};
my @dupes;
KEY:
for my $i ( 0 .. @$keys - 1 ) {
my $key = $keys->[$i]->{colnames};
if ( $key =~ m/$ck_cols$/ ) {
PTDEBUG && _d("clustered key dupe:", $keys->[$i]->{name},
$keys->[$i]->{colnames});
my $dupe = {
key => $keys->[$i]->{name},
cols => $keys->[$i]->{real_cols},
ddl => $keys->[$i]->{ddl},
duplicate_of => $ck->{name},
duplicate_of_cols => $ck->{real_cols},
duplicate_of_ddl => $ck->{ddl},
reason => "Key $keys->[$i]->{name} ends with a "
. "prefix of the clustered index",
dupe_type => 'clustered',
short_key => $self->shorten_clustered_duplicate(
$ck_cols,
join(',', map { "`$_`" }
@{$keys->[$i]->{real_cols}})
),
};
push @dupes, $dupe;
delete $keys->[$i];
$args{callback}->($dupe, %args) if $args{callback};
}
}
PTDEBUG && _d('No more keys');
# Cleanup the lists: remove removed keys.
@$keys = grep { defined $_; } @$keys;
return @dupes;
}
sub shorten_clustered_duplicate {
my ( $self, $ck_cols, $dupe_key_cols ) = @_;
return $ck_cols if $ck_cols eq $dupe_key_cols;
$dupe_key_cols =~ s/$ck_cols$//;
$dupe_key_cols =~ s/,+$//;
return $dupe_key_cols;
}
# Given a primary key (can be undef) and an arrayref of unique keys,
# removes and returns redundantly constrained unique keys from uniquie_keys.
sub unconstrain_keys {
my ( $self, $primary_key, $unique_keys ) = @_;
die "I need a unique_keys argument" unless $unique_keys;
my %unique_cols;
my @unique_sets;
my %unconstrain;
my @unconstrained_keys;
PTDEBUG && _d('Unconstraining redundantly unique keys');
# First determine which unique keys define unique columns
# and which define unique sets.
UNIQUE_KEY:
foreach my $unique_key ( $primary_key, @$unique_keys ) {
next unless $unique_key; # primary key may be undefined
my $cols = $unique_key->{cols};
if ( @$cols == 1 ) {
if ( !exists $unique_cols{$cols->[0]} ) {
PTDEBUG && _d($unique_key->{name}, 'defines unique column:',
$cols->[0]);
$unique_cols{$cols->[0]} = $unique_key;
$unique_key->{unique_col} = 1;
}
else {
# https://bugs.launchpad.net/percona-toolkit/+bug/1217013
# If two unique indexes are not exact, then they must be enforcing
# different uniqueness constraints. Else they're exact dupes
# so one can be treated as a non-unique and removed later
# when comparing unique to non-unique.
PTDEBUG && _d($unique_key->{name},
'redundantly constrains unique column:', $cols->[0]);
$unique_key->{exact_dupe} = 1;
$unique_key->{constraining_key} = $unique_cols{$cols->[0]};
}
}
else {
local $LIST_SEPARATOR = '-';
PTDEBUG && _d($unique_key->{name}, 'defines unique set:', @$cols);
push @unique_sets, { cols => $cols, key => $unique_key };
}
}
# Second, find which unique sets can be unconstraind (i.e. those
# which have which have at least one unique column).
UNIQUE_SET:
foreach my $unique_set ( @unique_sets ) {
my $n_unique_cols = 0;
COL:
foreach my $col ( @{$unique_set->{cols}} ) {
if ( exists $unique_cols{$col} ) {
PTDEBUG && _d('Unique set', $unique_set->{key}->{name},
'has unique col', $col);
last COL if ++$n_unique_cols > 1;
$unique_set->{constraining_key} = $unique_cols{$col};
}
}
if ( $n_unique_cols && $unique_set->{key}->{name} ne 'PRIMARY' ) {
# Unique set is redundantly constrained.
PTDEBUG && _d('Will unconstrain unique set',
$unique_set->{key}->{name},
'because it is redundantly constrained by key',
$unique_set->{constraining_key}->{name},
'(',$unique_set->{constraining_key}->{colnames},')');
$unconstrain{$unique_set->{key}->{name}}
= $unique_set->{constraining_key};
}
}
# And finally, unconstrain the redundantly unique sets found above by
# removing them from the list of unique keys and adding them to the
# list of normal keys.
for my $i ( 0..(scalar @$unique_keys-1) ) {
if ( exists $unconstrain{$unique_keys->[$i]->{name}} ) {
PTDEBUG && _d('Unconstraining weak', $unique_keys->[$i]->{name});
$unique_keys->[$i]->{unconstrained} = 'stronger';
$unique_keys->[$i]->{constraining_key}
= $unconstrain{$unique_keys->[$i]->{name}};
push @unconstrained_keys, $unique_keys->[$i];
delete $unique_keys->[$i];
}
elsif ( $unique_keys->[$i]->{exact_dupe} ) {
# https://bugs.launchpad.net/percona-toolkit/+bug/1217013
PTDEBUG && _d('Unconstraining dupe', $unique_keys->[$i]->{name});
$unique_keys->[$i]->{unconstrained} = 'duplicate';
push @unconstrained_keys, $unique_keys->[$i];
delete $unique_keys->[$i];
}
}
PTDEBUG && _d('No more keys');
return @unconstrained_keys;
}
sub _d {
my ($package, undef, $line) = caller 0;
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
map { defined $_ ? $_ : 'undef' }
@_;
print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
}
1;
}
# ###########################################################################
# End DuplicateKeyFinder package
# ###########################################################################