# This program is copyright 2010-2011 Percona Inc. # Feedback and improvements are welcome. # # THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. # # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation, version 2; OR the Perl Artistic License. On UNIX and similar # systems, you can issue `man perlgpl' or `man perlartistic' to read these # licenses. # # You should have received a copy of the GNU General Public License along with # this program; if not, write to the Free Software Foundation, Inc., 59 Temple # Place, Suite 330, Boston, MA 02111-1307 USA. # ########################################################################### # QueryAdvisorRules package # ########################################################################### { # Package: QueryAdvisorRules # QueryAdvisorRules encapsulates rules for checking queries. package QueryAdvisorRules; use base 'AdvisorRules'; use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); use constant PTDEBUG => $ENV{PTDEBUG} || 0; sub new { my ( $class, %args ) = @_; my $self = $class->SUPER::new(%args); @{$self->{rules}} = $self->get_rules(); PTDEBUG && _d(scalar @{$self->{rules}}, "rules"); return $self; } # Each rules is a hashref with two keys: # * id Unique PREFIX.NUMBER for the rule. The prefix is three chars # which hints to the nature of the rule. See example below. # * code Coderef to check rule, returns undef if rule does not match, # else returns the string pos near where the rule matches or 0 # to indicate it doesn't know the pos. The code is passed a\ # single arg: a hashref event. sub get_rules { return { id => 'ALI.001', # Implicit alias code => sub { my ( %args ) = @_; my $event = $args{event}; my $struct = $event->{query_struct}; my $tbls = $struct->{from} || $struct->{into} || $struct->{tables}; return unless $tbls; foreach my $tbl ( @$tbls ) { return 0 if $tbl->{alias} && !$tbl->{explicit_alias}; } my $cols = $struct->{columns}; return unless $cols; foreach my $col ( @$cols ) { return 0 if $col->{alias} && !$col->{explicit_alias}; } return; }, }, { id => 'ALI.002', # tbl.* alias code => sub { my ( %args ) = @_; my $event = $args{event}; my $cols = $event->{query_struct}->{columns}; return unless $cols; foreach my $col ( @$cols ) { return 0 if $col->{tbl} && $col->{col} eq '*' && $col->{alias}; } return; }, }, { id => 'ALI.003', # tbl AS tbl code => sub { my ( %args ) = @_; my $event = $args{event}; my $struct = $event->{query_struct}; my $tbls = $struct->{from} || $struct->{into} || $struct->{tables}; return unless $tbls; foreach my $tbl ( @$tbls ) { return 0 if $tbl->{alias} && $tbl->{alias} eq $tbl->{tbl}; } my $cols = $struct->{columns}; return unless $cols; foreach my $col ( @$cols ) { return 0 if $col->{alias} && $col->{alias} eq $col->{col}; } return; }, }, { id => 'ARG.001', # col = '%foo' code => sub { my ( %args ) = @_; my $event = $args{event}; my $where = $event->{query_struct}->{where}; return unless $where && @$where; foreach my $arg ( @$where ) { return 0 if ($arg->{operator} || '') eq 'like' && $arg->{right_arg} =~ m/[\'\"][\%\_]./; } return; }, }, { id => 'ARG.002', # LIKE w/o wildcard code => sub { my ( %args ) = @_; my $event = $args{event}; my $where = $event->{query_struct}->{where}; return unless $where && @$where; foreach my $arg ( @$where ) { return 0 if ($arg->{operator} || '') eq 'like' && $arg->{right_arg} !~ m/[%_]/; } return; }, }, { id => 'CLA.001', # SELECT w/o WHERE code => sub { my ( %args ) = @_; my $event = $args{event}; return unless ($event->{query_struct}->{type} || '') eq 'select'; return unless $event->{query_struct}->{from}; return 0 unless $event->{query_struct}->{where}; return; }, }, { id => 'CLA.002', # ORDER BY RAND() code => sub { my ( %args ) = @_; my $event = $args{event}; my $orderby = $event->{query_struct}->{order_by}; return unless $orderby; foreach my $ident ( @$orderby ) { # SQLParser will have uppercased the function name. return 0 if $ident->{function} && $ident->{function} eq 'RAND'; } return; }, }, { id => 'CLA.003', # LIMIT w/ OFFSET code => sub { my ( %args ) = @_; my $event = $args{event}; return unless $event->{query_struct}->{limit}; return unless defined $event->{query_struct}->{limit}->{offset}; return 0; }, }, { id => 'CLA.004', # GROUP BY code => sub { my ( %args ) = @_; my $event = $args{event}; my $groupby = $event->{query_struct}->{group_by}; return unless $groupby; foreach my $ident ( @$groupby ) { return 0 if exists $ident->{position}; } return; }, }, { id => 'CLA.005', # ORDER BY col where col= code => sub { my ( %args ) = @_; my $event = $args{event}; my $orderby = $event->{query_struct}->{order_by}; return unless $orderby; my $where = $event->{query_struct}->{where}; return unless $where; my %orderby_col = map { lc $_->{column} => 1 } grep { $_->{column} } @$orderby; foreach my $pred ( @$where ) { my $val = $pred->{right_arg}; next unless $val; return 0 if $val =~ m/^\d+$/ && $orderby_col{lc($pred->{left_arg} || '')}; } return; }, }, { id => 'CLA.006', # GROUP BY or ORDER BY different tables code => sub { my ( %args ) = @_; my $event = $args{event}; my $groupby = $event->{query_struct}->{group_by}; my $orderby = $event->{query_struct}->{order_by}; return unless $groupby || $orderby; my %groupby_tbls = map { $_->{table} => 1 } grep { $_->{table} } @$groupby; return 0 if scalar keys %groupby_tbls > 1; my %orderby_tbls = map { $_->{table} => 1 } grep { $_->{table} } @$orderby; return 0 if scalar keys %orderby_tbls > 1; # Remove ORDER BY tables from GROUP BY tables. Any tables # remain in GROUP BY are unique to GROUP BY, i.e. not in # ORDER BY, so we have a case like: group by tbl1.id order by tbl2.id map { delete $groupby_tbls{$_} } keys %orderby_tbls; return 0 if scalar keys %groupby_tbls; return; }, }, { id => 'CLA.007', # ORDER BY ASC/DESC mix can't use index code => sub { my ( %args ) = @_; my $event = $args{event}; my $order_by = $event->{query_struct}->{order_by}; return unless $order_by; my ($asc, $desc) = (0, 0); foreach my $col ( @$order_by ) { if ( ($col->{sort} || 'ASC') eq 'ASC' ) { $asc++; } else { $desc++; } return 0 if $asc && $desc; } return; }, }, { id => 'COL.001', # SELECT * code => sub { my ( %args ) = @_; my $event = $args{event}; return unless ($event->{query_struct}->{type} || '') eq 'select'; my $cols = $event->{query_struct}->{columns}; return unless $cols; foreach my $col ( @$cols ) { return 0 if $col->{col} eq '*' && !$col->{func}; } return; }, }, { id => 'COL.002', # INSERT w/o (cols) def code => sub { my ( %args ) = @_; my $event = $args{event}; my $type = $event->{query_struct}->{type} || ''; return unless $type eq 'insert' || $type eq 'replace'; return 0 unless $event->{query_struct}->{columns}; return; }, }, { id => 'LIT.001', # IP as string code => sub { my ( %args ) = @_; my $event = $args{event}; if ( $event->{arg} =~ m/['"]\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/gc ) { return (pos $event->{arg}) || 0; } return; }, }, { id => 'LIT.002', # Date not quoted code => sub { my ( %args ) = @_; my $event = $args{event}; # YYYY-MM-DD if ( $event->{arg} =~ m/(?{arg}) || 0; } # YY-MM-DD if ( $event->{arg} =~ m/(?{arg}) || 0; } return; }, }, { id => 'KWR.001', # SQL_CALC_FOUND_ROWS code => sub { my ( %args ) = @_; my $event = $args{event}; return 0 if $event->{query_struct}->{keywords}->{sql_calc_found_rows}; return; }, }, { id => 'JOI.001', # comma and ansi joins code => sub { my ( %args ) = @_; my $event = $args{event}; my $struct = $event->{query_struct}; my $tbls = $struct->{from} || $struct->{into} || $struct->{tables}; return unless $tbls; my $comma_join = 0; my $ansi_join = 0; foreach my $tbl ( @$tbls ) { if ( $tbl->{join} ) { if ( $tbl->{join}->{ansi} ) { $ansi_join = 1; } else { $comma_join = 1; } } return 0 if $comma_join && $ansi_join; } return; }, }, { id => 'RES.001', # non-deterministic GROUP BY code => sub { my ( %args ) = @_; my $event = $args{event}; return unless ($event->{query_struct}->{type} || '') eq 'select'; my $groupby = $event->{query_struct}->{group_by}; return unless $groupby; # Only check GROUP BY column names, not numbers. GROUP BY number # is handled in CLA.004. my %groupby_col = map { $_->{column} => 1 } grep { $_->{column} } @$groupby; return unless scalar %groupby_col; # Skip non-columns -- NULL, digits, functions, variables my $cols = [ grep { _looks_like_column($_->{col}) } grep { ! exists $_->{func} } @{$event->{query_struct}->{columns}} ]; # All SELECT cols must be in GROUP BY cols clause. # E.g. select a, b, c from tbl group by a; -- non-deterministic foreach my $col ( @$cols ) { return 0 unless $groupby_col{ $col->{col} } || ($col->{alias} && $groupby_col{ $col->{alias} }); } return; }, }, { id => 'RES.002', # non-deterministic LIMIT w/o ORDER BY code => sub { my ( %args ) = @_; my $event = $args{event}; return unless $event->{query_struct}->{limit}; # If query doesn't use tables then this check isn't applicable. return unless $event->{query_struct}->{from} || $event->{query_struct}->{into} || $event->{query_struct}->{tables}; return 0 unless $event->{query_struct}->{order_by}; return; }, }, { id => 'STA.001', # != instead of <> code => sub { my ( %args ) = @_; my $event = $args{event}; return 0 if $event->{arg} =~ m/!=/; return; }, }, { id => 'SUB.001', # IN() code => sub { my ( %args ) = @_; my $event = $args{event}; if ( $event->{arg} =~ m/\bIN\s*\(\s*SELECT\b/gi ) { return pos $event->{arg}; } return; }, }, { id => 'JOI.002', # table joined more than once, but not self-join code => sub { my ( %args ) = @_; my $event = $args{event}; my $struct = $event->{query_struct}; return unless $struct; my $tbls = $struct->{from} || $struct->{into} || $struct->{tables}; return unless $tbls; my %tbl_cnt; my $n_tbls = scalar @$tbls; # To detect this rule we look for tables joined more than once # (if cnt > 1) and via both an ansi and comma join. This captures # "t AS a JOIN t AS b a.foo=b.bar, t" but not the simple self-join # "t AS a JOIN t AS b a.foo=b.bar" or cases where a table is joined # to many other tables all via ansi joins or the implicit self-join # (which we really can't detect) "t AS a, t AS b WHERE a.foo=b.bar". # When a table shows up multiple times in ansi joins and then again # in a comma join, the comma join is usually culprit of this rule. for my $i ( 0..($n_tbls-1) ) { my $tbl = $tbls->[$i]; my $tbl_name = lc $tbl->{tbl}; $tbl_cnt{$tbl_name}->{cnt}++; $tbl_cnt{$tbl_name}->{ansi_join}++ if $tbl->{join} && $tbl->{join}->{ansi}; $tbl_cnt{$tbl_name}->{comma_join}++ if $tbl->{join} && !$tbl->{join}->{ansi}; if ( $tbl_cnt{$tbl_name}->{cnt} > 1 ) { return 0 if $tbl_cnt{$tbl_name}->{ansi_join} && $tbl_cnt{$tbl_name}->{comma_join}; } } return; }, }, { id => 'JOI.003', # OUTER JOIN converted to INNER JOIN code => sub { my ( %args ) = @_; my $event = $args{event}; my $struct = $event->{query_struct}; return unless $struct; my $tbls = $struct->{from} || $struct->{into} || $struct->{tables}; return unless $tbls; my $where = $struct->{where}; return unless $where; # Good LEFT OUTER JOIN: # select * from L left join R using(c) where L.a=5; # Converts to INNER JOIN when: # select * from L left join R using(c) where L.a=5 and R.b=10; # To detect this condition we need to see if there's an OUTER # join then see if there's a column from the outer table in the # WHERE clause that is anything but "IS NULL". So in the example # above, R.b=10 is this culprit. # http://code.google.com/p/maatkit/issues/detail?id=950 my %outer_tbls = map { $_->{tbl} => 1 } get_outer_tables($tbls); PTDEBUG && _d("Outer tables:", keys %outer_tbls); return unless %outer_tbls; foreach my $pred ( @$where ) { next unless $pred->{left_arg}; # skip constants like 1 in "WHERE 1" my ($tbl, $col) = split /\./, $pred->{left_arg}; if ( $tbl && $col && $outer_tbls{$tbl} ) { # Only outer_tbl.col IS NULL is permissible. if ($pred->{operator} ne 'is' || $pred->{right_arg} !~ m/null/i) { PTDEBUG && _d("Predicate prevents OUTER JOIN:", map { $pred->{$_} } qw(left_arg operator right_arg)); return 0; } } } return; } }, { id => 'JOI.004', # broken exclusion join code => sub { my ( %args ) = @_; my $event = $args{event}; my $struct = $event->{query_struct}; return unless $struct; my $tbls = $struct->{from} || $struct->{into} || $struct->{tables}; return unless $tbls; my $where = $struct->{where}; return unless $where; my %outer_tbls; my %outer_tbl_join_cols; my @unknown_join_cols; foreach my $outer_tbl ( get_outer_tables($tbls) ) { $outer_tbls{$outer_tbl->{tbl}} = 1; # For "L LEFT JOIN R" R is the outer table and since it follows # L its table struct will have the join struct with the join # condition. But for "L RIGHT JOIN R" L is the outer table and # will not have the join struct because it precedes R. This # is due to how parse_from() works. So if the outer table doesn't # have the join struct, we need to get it from the inner table. my $join = $outer_tbl->{join}; if ( !$join ) { my ($inner_tbl) = grep { exists $_->{join} && $_->{join}->{to} eq $outer_tbl->{tbl} } @$tbls; $join = $inner_tbl->{join}; die "Cannot find join structure for $outer_tbl->{tbl}" unless $join; } # Get the outer table columns used in the jon condition. if ( $join->{condition} eq 'using' ) { %outer_tbl_join_cols = map { $_ => 1 } @{$join->{columns}}; } else { my $where = $join->{where}; die "Join structure for ON condition has no where structure" unless $where; my @join_cols; foreach my $pred ( @$where ) { next unless $pred->{operator} eq '='; # Assume all equality comparisons are like tbl1.col=tbl2.col. # Thus join conditions like tbl1.col{left_arg}, $pred->{right_arg}; } PTDEBUG && _d("Join columns:", @join_cols); foreach my $join_col ( @join_cols ) { my ($tbl, $col) = split /\./, $join_col; if ( !$col ) { $col = $tbl; $tbl = determine_table_for_column( column => $col, tbl_structs => $event->{tbl_structs}, ); } if ( !$tbl ) { PTDEBUG && _d("Cannot determine the table for join column", $col); push @unknown_join_cols, $col; } else { $outer_tbl_join_cols{$col} = 1 if $tbl eq $outer_tbl->{tbl}; } } } } PTDEBUG && _d("Outer table join columns:", keys %outer_tbl_join_cols); PTDEBUG && _d("Unknown join columns:", @unknown_join_cols); # Here's a problem query: # select c from L left join R on L.a=R.b where L.a=5 and R.c is null # The problem is "R.c is null" will not allow one to determine if # a null row from the outer table is null due to not matching the # inner table or due to R.c actually having a null value. So we # need to check every outer table column in the WHERE clause for # ones that are 1) not in the JOIN expression and 2) "IS NULL'. # http://code.google.com/p/maatkit/issues/detail?id=950 foreach my $pred ( @$where ) { next unless $pred->{left_arg}; # skip constants like 1 in "WHERE 1" next unless $pred->{operator} eq 'is' && $pred->{right_arg} =~ m/NULL/i; my ($tbl, $col) = split /\./, $pred->{left_arg}; if ( !$col ) { # A col in the WHERE clause isn't table-qualified. Try to # determine its table. If we can, great, if not "return 0 if" # below will immediately fail because $tbl will be undef still. # That's ok; it just means this test tries as best it can and # gets skipped silently when we can't tbl-qualify cols. $col = $tbl; $tbl = determine_table_for_column( column => $col, tbl_structs => $event->{tbl_structs}, ); } next unless $tbl; # can't check tbl if tbl is unknown next unless $outer_tbls{$tbl}; # only want outer tbl cols # At this point we know col is from outer table and "IS NULL". # "outer_tbl.join_col IS NULL" is ok, but... next if $outer_tbl_join_cols{$col}; # ...this rule could match here for two reasons. First, if # we know the outer tbl join cols and this col isn't one of them # (hence the statement above passed and we got here), then # @unknown_join_cols will be empty and we'll match. This is like # "outer_tbl.NON_join_col IS NULL". Or second, we don't know # the outer tbl join cols and @unknown_join_cols will have cols # and we'll match if this col isn't one of the unknown join cols. # This is for cases like: # select c from L left join R on a=b where L.a=5 and R.c is null # We don't know if a or b belong to L or R but we know c is from # the outer table and is neither a nor b. return 0 unless grep { $col eq $_ } @unknown_join_cols; } return; # rule does not match, as best as we can determine } }, }; # Sub: get_outer_tables # Get the outer tables in joins. # # Parameters: # $tbls - Arrayref of hashrefs with table info # # Returns: # Array of hashref to the outer tables sub get_outer_tables { my ( $tbls ) = @_; return unless $tbls; my @outer_tbls; my $n_tbls = scalar @$tbls; for my $i( 0..($n_tbls-1) ) { my $tbl = $tbls->[$i]; next unless $tbl->{join} && $tbl->{join}->{type} =~ m/left|right/i; push @outer_tbls, $tbl->{join}->{type} =~ m/left/i ? $tbl : $tbls->[$i - 1]; } return @outer_tbls; } # Sub: determine_table_for_column # Determine which table a column belongs to. No extensive, online effort # is made to determine the column's table. The caller is responsible for # using the parsed SQL structure to get its db/tables and their tbl structs # and providing a list of them. # # Parameters: # %args - Arguments # # Required Arguments: # column - column name, not quoted # # Optional Arguments: # tbl_structs - arrayref hashrefs returned by # # Returns: # Table name, not quoted sub determine_table_for_column { my ( %args ) = @_; my @required_args = qw(column); foreach my $arg ( @required_args ) { die "I need a $arg argument" unless $args{$arg}; } my ($col) = @args{@required_args}; my $tbl_structs = $args{tbl_structs}; return unless $tbl_structs; foreach my $db ( keys %$tbl_structs ) { foreach my $tbl ( keys %{$tbl_structs->{$db}} ) { if ( $tbl_structs->{$db}->{$tbl}->{is_col}->{$col} ) { PTDEBUG && _d($col, "column belongs to", $db, $tbl); return $tbl; } } } PTDEBUG && _d("Cannot determine table for column", $col); return; } sub _looks_like_column { my $col = shift; # NULL, numbers, variables and functions are definitely not columns return if $col eq '*' || uc($col) eq 'NULL'; return if $col =~ /\A(?:\b[0-9]+\b|\@{1,2}.+)/; return $col; } sub _d { my ($package, undef, $line) = caller 0; @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } map { defined $_ ? $_ : 'undef' } @_; print STDERR "# $package:$line $PID ", join(' ', @_), "\n"; } 1; } # ########################################################################### # End QueryAdvisorRules package # ###########################################################################