percona-toolkit/lib/QueryAdvisorRules.pm

# This program is copyright 2010-2011 Percona Ireland Ltd.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA  02111-1307  USA.
# ###########################################################################
# QueryAdvisorRules package
# ###########################################################################
{
# Package: QueryAdvisorRules
# QueryAdvisorRules encapsulates rules for checking queries.
package QueryAdvisorRules;
use base 'AdvisorRules';

use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;

sub new {
   my ( $class, %args ) = @_;
   my $self = $class->SUPER::new(%args);
   @{$self->{rules}} = $self->get_rules();
   PTDEBUG && _d(scalar @{$self->{rules}}, "rules");
   return $self;
}

# Each rules is a hashref with two keys:
#   * id       Unique PREFIX.NUMBER for the rule.  The prefix is three chars
#              which hints to the nature of the rule.  See example below.
#   * code     Coderef to check rule, returns undef if rule does not match,
#              else returns the string pos near where the rule matches or 0
#              to indicate it doesn't know the pos.  The code is passed a\
#              single arg: a hashref event.
sub get_rules {
   return
   {
      id   => 'ALI.001',      # Implicit alias
      code => sub {
         my ( %args ) = @_;
         my $event  = $args{event};
         my $struct = $event->{query_struct};
         my $tbls   = $struct->{from} || $struct->{into} || $struct->{tables};
         return unless $tbls;
         foreach my $tbl ( @$tbls ) {
            return 0 if $tbl->{alias} && !$tbl->{explicit_alias};
         }
         my $cols = $struct->{columns};
         return unless $cols;
         foreach my $col ( @$cols ) {
            return 0 if $col->{alias} && !$col->{explicit_alias};
         }
         return;
      },
   },
   {
      id   => 'ALI.002',      # tbl.* alias
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         my $cols  = $event->{query_struct}->{columns};
         return unless $cols;
         foreach my $col ( @$cols ) {
            return 0 if $col->{tbl} && $col->{col} eq '*' &&  $col->{alias};
         }
         return;
      },
   },
   {
      id   => 'ALI.003',      # tbl AS tbl
      code => sub {
         my ( %args ) = @_;
         my $event  = $args{event};
         my $struct = $event->{query_struct};
         my $tbls   = $struct->{from} || $struct->{into} || $struct->{tables};
         return unless $tbls;
         foreach my $tbl ( @$tbls ) {
            return 0 if $tbl->{alias} && $tbl->{alias} eq $tbl->{tbl};
         }
         my $cols = $struct->{columns};
         return unless $cols;
         foreach my $col ( @$cols ) {
            return 0 if $col->{alias} && $col->{alias} eq $col->{col};
         }
         return;
      },
   },
   {
      id   => 'ARG.001',      # col = '%foo'
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         my $where = $event->{query_struct}->{where};
         return unless $where && @$where;
         foreach my $arg ( @$where ) {
            return 0
               if ($arg->{operator} || '') eq 'like'
                  && $arg->{right_arg} =~ m/[\'\"][\%\_]./;
         }
         return;
      },
   },
   {
      id   => 'ARG.002',      # LIKE w/o wildcard
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         my $where = $event->{query_struct}->{where};
         return unless $where && @$where;
         foreach my $arg ( @$where ) {
            return 0
               if ($arg->{operator} || '') eq 'like'
                  && $arg->{right_arg} !~ m/[%_]/;
         }
         return;
      },
   },
   {
      id   => 'CLA.001',      # SELECT w/o WHERE
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return unless ($event->{query_struct}->{type} || '') eq 'select';
         return unless $event->{query_struct}->{from};
         return 0 unless $event->{query_struct}->{where};
         return;
      },
   },
   {
      id   => 'CLA.002',      # ORDER BY RAND()
      code => sub {
         my ( %args ) = @_;
         my $event   = $args{event};
         my $orderby = $event->{query_struct}->{order_by};
         return unless $orderby;
         foreach my $ident ( @$orderby ) {
            # SQLParser will have uppercased the function name.
            return 0 if $ident->{function} && $ident->{function} eq 'RAND';
         }
         return;
      },
   },
   {
      id   => 'CLA.003',      # LIMIT w/ OFFSET
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return unless $event->{query_struct}->{limit};
         return unless defined $event->{query_struct}->{limit}->{offset};
         return 0;
      },
   },
   {
      id   => 'CLA.004',      # GROUP BY <number>
      code => sub {
         my ( %args ) = @_;
         my $event   = $args{event};
         my $groupby = $event->{query_struct}->{group_by};
         return unless $groupby;
         foreach my $ident ( @$groupby ) {
            return 0 if exists $ident->{position};
         }
         return;
      },
   },
   {
      id   => 'CLA.005',      # ORDER BY col where col=<constant>
      code => sub {
         my ( %args ) = @_;
         my $event   = $args{event};
         my $orderby = $event->{query_struct}->{order_by};
         return unless $orderby;
         my $where   = $event->{query_struct}->{where};
         return unless $where;
         my %orderby_col = map { lc $_->{column} => 1 }
                           grep { $_->{column} }
                           @$orderby;
         foreach my $pred ( @$where ) {
            my $val = $pred->{right_arg};
            next unless $val;
            return 0 if $val =~ m/^\d+$/ && $orderby_col{lc($pred->{left_arg} || '')};
         }
         return;
      },
   },
   {
      id   => 'CLA.006',      # GROUP BY or ORDER BY different tables
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         my $groupby = $event->{query_struct}->{group_by};
         my $orderby = $event->{query_struct}->{order_by};
         return unless $groupby || $orderby;

         my %groupby_tbls = map { $_->{table} => 1 }
                            grep { $_->{table} }
                            @$groupby;
         return 0 if scalar keys %groupby_tbls > 1;

         my %orderby_tbls = map { $_->{table} => 1 }
                            grep { $_->{table} }
                            @$orderby;
         return 0 if scalar keys %orderby_tbls > 1;

         # Remove ORDER BY tables from GROUP BY tables.  Any tables
         # remain in GROUP BY are unique to GROUP BY, i.e. not in
         # ORDER BY, so we have a case like: group by tbl1.id order by tbl2.id
         map { delete $groupby_tbls{$_} } keys %orderby_tbls;
         return 0 if scalar keys %groupby_tbls;

         return;
      },
   },
   {
      id   => 'CLA.007',      # ORDER BY ASC/DESC mix can't use index
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         my $order_by = $event->{query_struct}->{order_by};
         return unless $order_by;
         my ($asc, $desc) = (0, 0);
         foreach my $col ( @$order_by ) {
            if ( ($col->{sort} || 'ASC') eq 'ASC' ) {
               $asc++;
            }
            else {
               $desc++;
            }
            return 0 if $asc && $desc;
         }
         return;
      },
   },
   {
      id   => 'COL.001',      # SELECT *
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return unless ($event->{query_struct}->{type} || '') eq 'select';
         my $cols = $event->{query_struct}->{columns};
         return unless $cols;
         foreach my $col ( @$cols ) {
            return 0 if $col->{col} eq '*' && !$col->{func};
         }
         return;
      },
   },
   {
      id   => 'COL.002',      # INSERT w/o (cols) def
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         my $type  = $event->{query_struct}->{type} || '';
         return unless $type eq 'insert' || $type eq 'replace';
         return 0 unless $event->{query_struct}->{columns};
         return;
      },
   },
   {
      id   => 'LIT.001',      # IP as string
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         if ( $event->{arg} =~ m/['"]\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/gc ) {
            return (pos $event->{arg}) || 0;
         }
         return;
      },
   },
   {
      id   => 'LIT.002',      # Date not quoted
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         # YYYY-MM-DD
         if ( $event->{arg} =~ m/(?<!['"\w-])\d{4}-\d{1,2}-\d{1,2}\b/gc ) {
            return (pos $event->{arg}) || 0;
         }
         # YY-MM-DD
         if ( $event->{arg} =~ m/(?<!['"\w\d-])\d{2}-\d{1,2}-\d{1,2}\b/gc ) {
            return (pos $event->{arg}) || 0;
         }
         return;
      },
   },
   {
      id   => 'KWR.001',      # SQL_CALC_FOUND_ROWS
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return 0 if $event->{query_struct}->{keywords}->{sql_calc_found_rows};
         return;
      },
   },
   {
      id   => 'JOI.001',      # comma and ansi joins
      code => sub {
         my ( %args ) = @_;
         my $event  = $args{event};
         my $struct = $event->{query_struct};
         my $tbls   = $struct->{from} || $struct->{into} || $struct->{tables};
         return unless $tbls;
         my $comma_join = 0;
         my $ansi_join  = 0;
         foreach my $tbl ( @$tbls ) {
            if ( $tbl->{join} ) {
               if ( $tbl->{join}->{ansi} ) {
                  $ansi_join = 1;
               }
               else {
                  $comma_join = 1;
               }
            }
            return 0 if $comma_join && $ansi_join;
         }
         return;
      },
   },
   {
      id   => 'RES.001',      # non-deterministic GROUP BY
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return unless ($event->{query_struct}->{type} || '') eq 'select';
         my $groupby = $event->{query_struct}->{group_by};
         return unless $groupby;
         # Only check GROUP BY column names, not numbers.  GROUP BY number
         # is handled in CLA.004.
         my %groupby_col = map { $_->{column} => 1 }
                           grep { $_->{column} }
                           @$groupby;
         return unless scalar %groupby_col;
         # Skip non-columns -- NULL, digits, functions, variables
         my $cols = [
                      grep { _looks_like_column($_->{col}) }
                      grep { ! exists $_->{func} }
                      @{$event->{query_struct}->{columns}}
                    ];
         # All SELECT cols must be in GROUP BY cols clause.
         # E.g. select a, b, c from tbl group by a; -- non-deterministic
         foreach my $col ( @$cols ) {
            return 0 unless $groupby_col{ $col->{col} }
                         || ($col->{alias} && $groupby_col{ $col->{alias} });
         }
         return;
      },
   },
   {
      id   => 'RES.002',      # non-deterministic LIMIT w/o ORDER BY
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return unless $event->{query_struct}->{limit};
         # If query doesn't use tables then this check isn't applicable.
         return unless    $event->{query_struct}->{from}
                         || $event->{query_struct}->{into}
                         || $event->{query_struct}->{tables};
         return 0 unless $event->{query_struct}->{order_by};
         return;
      },
   },
   {
      id   => 'STA.001',      # != instead of <>
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         return 0 if $event->{arg} =~ m/!=/;
         return;
      },
   },
   {
      id   => 'SUB.001',      # IN(<subquery>)
      code => sub {
         my ( %args ) = @_;
         my $event = $args{event};
         if ( $event->{arg} =~ m/\bIN\s*\(\s*SELECT\b/gi ) {
            return pos $event->{arg};
         }
         return;
      },
   },
   {
      id   => 'JOI.002',      # table joined more than once, but not self-join
      code => sub {
         my ( %args ) = @_;
         my $event  = $args{event};
         my $struct = $event->{query_struct};
         return unless $struct;
         my $tbls   = $struct->{from} || $struct->{into} || $struct->{tables};
         return unless $tbls;
         my %tbl_cnt;
         my $n_tbls = scalar @$tbls;

         # To detect this rule we look for tables joined more than once
         # (if cnt > 1) and via both an ansi and comma join.  This captures
         # "t AS a JOIN t AS b a.foo=b.bar, t" but not the simple self-join
         # "t AS a JOIN t AS b a.foo=b.bar" or cases where a table is joined
         # to many other tables all via ansi joins or the implicit self-join
         # (which we really can't detect) "t AS a, t AS b WHERE a.foo=b.bar".
         # When a table shows up multiple times in ansi joins and then again
         # in a comma join, the comma join is usually culprit of this rule.
         for my $i ( 0..($n_tbls-1) ) {
            my $tbl      = $tbls->[$i];
            my $tbl_name = lc $tbl->{tbl};

            $tbl_cnt{$tbl_name}->{cnt}++;
            $tbl_cnt{$tbl_name}->{ansi_join}++
               if $tbl->{join} && $tbl->{join}->{ansi};
            $tbl_cnt{$tbl_name}->{comma_join}++
               if $tbl->{join} && !$tbl->{join}->{ansi};

            if ( $tbl_cnt{$tbl_name}->{cnt} > 1 ) {
               return 0
                  if    $tbl_cnt{$tbl_name}->{ansi_join}
                     && $tbl_cnt{$tbl_name}->{comma_join};
            }
         }
         return;
      },
   },
   {
      id   => 'JOI.003',  # OUTER JOIN converted to INNER JOIN
      code => sub {
         my ( %args ) = @_;
         my $event  = $args{event};
         my $struct = $event->{query_struct};
         return unless $struct;
         my $tbls   = $struct->{from} || $struct->{into} || $struct->{tables};
         return unless $tbls;
         my $where  = $struct->{where};
         return unless $where;

         # Good LEFT OUTER JOIN:
         #   select * from L left join R using(c) where L.a=5;
         # Converts to INNER JOIN when:
         #   select * from L left join R using(c) where L.a=5 and R.b=10;
         # To detect this condition we need to see if there's an OUTER
         # join then see if there's a column from the outer table in the
         # WHERE clause that is anything but "IS NULL".  So in the example
         # above, R.b=10 is this culprit.
         # http://code.google.com/p/maatkit/issues/detail?id=950
         my %outer_tbls = map { $_->{tbl} => 1 } get_outer_tables($tbls);
         PTDEBUG && _d("Outer tables:", keys %outer_tbls);
         return unless %outer_tbls;

         foreach my $pred ( @$where ) {
            next unless $pred->{left_arg};  # skip constants like 1 in "WHERE 1"
            my ($tbl, $col) = split /\./, $pred->{left_arg};
            if ( $tbl && $col && $outer_tbls{$tbl} ) {
               # Only outer_tbl.col IS NULL is permissible.
               if ($pred->{operator} ne 'is' || $pred->{right_arg} !~ m/null/i)
               {
                  PTDEBUG && _d("Predicate prevents OUTER JOIN:",
                     map { $pred->{$_} } qw(left_arg operator right_arg));
                  return 0;
               }
            }
         }

         return;
      }
   },
   {
      id   => 'JOI.004',  # broken exclusion join
      code => sub {
         my ( %args ) = @_;
         my $event  = $args{event};
         my $struct = $event->{query_struct};
         return unless $struct;
         my $tbls   = $struct->{from} || $struct->{into} || $struct->{tables};
         return unless $tbls;
         my $where  = $struct->{where};
         return unless $where;

         my %outer_tbls;
         my %outer_tbl_join_cols;
         my @unknown_join_cols;
         foreach my $outer_tbl ( get_outer_tables($tbls) ) {
            $outer_tbls{$outer_tbl->{tbl}} = 1;

            # For "L LEFT JOIN R" R is the outer table and since it follows
            # L its table struct will have the join struct with the join
            # condition.  But for "L RIGHT JOIN R" L is the outer table and
            # will not have the join struct because it precedes R.  This
            # is due to how parse_from() works.  So if the outer table doesn't
            # have the join struct, we need to get it from the inner table.
            my $join = $outer_tbl->{join};
            if ( !$join ) {
               my ($inner_tbl) = grep {
                  exists $_->{join}
                  && $_->{join}->{to} eq $outer_tbl->{tbl}
               } @$tbls;
               $join = $inner_tbl->{join};
               die "Cannot find join structure for $outer_tbl->{tbl}"
                  unless $join;
            }

            # Get the outer table columns used in the jon condition.
            if ( $join->{condition} eq 'using' ) {
               %outer_tbl_join_cols = map { $_ => 1 } @{$join->{columns}};
            }
            else {
               my $where = $join->{where};
               die "Join structure for ON condition has no where structure"
                  unless $where;
               my @join_cols;
               foreach my $pred ( @$where ) {
                  next unless $pred->{operator} eq '=';
                  # Assume all equality comparisons are like tbl1.col=tbl2.col.
                  # Thus join conditions like tbl1.col<tbl2.col aren't handled.
                  push @join_cols, $pred->{left_arg}, $pred->{right_arg};
               }
               PTDEBUG && _d("Join columns:", @join_cols);
               foreach my $join_col ( @join_cols ) {
                  my ($tbl, $col) = split /\./, $join_col;
                  if ( !$col ) {
                     $col = $tbl;
                     $tbl = determine_table_for_column(
                        column      => $col,
                        tbl_structs => $event->{tbl_structs},
                     );
                  }
                  if ( !$tbl ) {
                     PTDEBUG && _d("Cannot determine the table for join column",
                        $col);
                     push @unknown_join_cols, $col;
                  }
                  else {
                     $outer_tbl_join_cols{$col} = 1
                        if $tbl eq $outer_tbl->{tbl};
                  }
               }
            }
         }
         PTDEBUG && _d("Outer table join columns:", keys %outer_tbl_join_cols);
         PTDEBUG && _d("Unknown join columns:", @unknown_join_cols);

         # Here's a problem query:
         #   select c from L left join R on L.a=R.b where L.a=5 and R.c is null
         # The problem is "R.c is null" will not allow one to determine if
         # a null row from the outer table is null due to not matching the
         # inner table or due to R.c actually having a null value.  So we
         # need to check every outer table column in the WHERE clause for
         # ones that are 1) not in the JOIN expression and 2) "IS NULL'.
         # http://code.google.com/p/maatkit/issues/detail?id=950
         foreach my $pred ( @$where ) {
            next unless $pred->{left_arg}; # skip constants like 1 in "WHERE 1"
            next unless $pred->{operator} eq 'is'
               && $pred->{right_arg} =~ m/NULL/i;

            my ($tbl, $col) = split /\./, $pred->{left_arg};
            if ( !$col ) {
               # A col in the WHERE clause isn't table-qualified.  Try to
               # determine its table.  If we can, great, if not "return 0 if"
               # below will immediately fail because $tbl will be undef still.
               # That's ok; it just means this test tries as best it can and
               # gets skipped silently when we can't tbl-qualify cols.
               $col = $tbl;
               $tbl = determine_table_for_column(
                  column      => $col,
                  tbl_structs => $event->{tbl_structs},
               );
            }
            next unless $tbl;               # can't check tbl if tbl is unknown
            next unless $outer_tbls{$tbl};  # only want outer tbl cols

            # At this point we know col is from outer table and "IS NULL".
            # "outer_tbl.join_col IS NULL" is ok, but...
            next if $outer_tbl_join_cols{$col};

            # ...this rule could match here for two reasons.  First, if
            # we know the outer tbl join cols and this col isn't one of them
            # (hence the statement above passed and we got here), then
            # @unknown_join_cols will be empty and we'll match.  This is like
            # "outer_tbl.NON_join_col IS NULL".  Or second, we don't know
            # the outer tbl join cols and @unknown_join_cols will have cols
            # and we'll match if this col isn't one of the unknown join cols.
            # This is for cases like:
            #   select c from L left join R on a=b where L.a=5 and R.c is null
            # We don't know if a or b belong to L or R but we know c is from
            # the outer table and is neither a nor b.
            return 0 unless grep { $col eq $_ } @unknown_join_cols;
         }

         return;  # rule does not match, as best as we can determine
      }
   },
};


# Sub: get_outer_tables
#   Get the outer tables in joins.
#
# Parameters:
#   $tbls - Arrayref of hashrefs with table info
#
# Returns:
#   Array of hashref to the outer tables
sub get_outer_tables {
   my ( $tbls ) = @_;
   return unless $tbls;
   my @outer_tbls;
   my $n_tbls = scalar @$tbls;
   for my $i( 0..($n_tbls-1) ) {
      my $tbl = $tbls->[$i];
      next unless $tbl->{join} && $tbl->{join}->{type} =~ m/left|right/i;
      push @outer_tbls,
         $tbl->{join}->{type} =~ m/left/i ? $tbl
                                          : $tbls->[$i - 1];
   }
   return @outer_tbls;
}


# Sub: determine_table_for_column
#   Determine which table a column belongs to.  No extensive, online effort
#   is made to determine the column's table.  The caller is responsible for
#   using the parsed SQL structure to get its db/tables and their tbl structs
#   and providing a list of them.
#
# Parameters:
#   %args - Arguments
#
# Required Arguments:
#   column - column name, not quoted
#
# Optional Arguments:
#   tbl_structs - arrayref hashrefs returned by <TableParser::parse()>
#
# Returns:
#   Table name, not quoted
sub determine_table_for_column {
   my ( %args ) = @_;
   my @required_args = qw(column);
   foreach my $arg ( @required_args ) {
      die "I need a $arg argument" unless $args{$arg};
   }
   my ($col) = @args{@required_args};

   my $tbl_structs = $args{tbl_structs};
   return unless $tbl_structs;

   foreach my $db ( keys %$tbl_structs ) {
      foreach my $tbl ( keys %{$tbl_structs->{$db}} ) {
         if ( $tbl_structs->{$db}->{$tbl}->{is_col}->{$col} ) {
            PTDEBUG && _d($col, "column belongs to", $db, $tbl);
            return $tbl;
         }
      }
   }

   PTDEBUG && _d("Cannot determine table for column", $col);
   return;
}

sub _looks_like_column {
   my $col = shift;
   # NULL, numbers, variables and functions are definitely not columns
   return if $col eq '*' || uc($col) eq 'NULL';
   return if $col =~ /\A(?:\b[0-9]+\b|\@{1,2}.+)/;
   return $col;
}

sub _d {
   my ($package, undef, $line) = caller 0;
   @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
        map { defined $_ ? $_ : 'undef' }
        @_;
   print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
}

1;
}
# ###########################################################################
# End QueryAdvisorRules package
# ###########################################################################