percona-toolkit/lib/QueryRewriter.pm

# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA  02111-1307  USA.
# ###########################################################################
# QueryRewriter package
# ###########################################################################
{
# Package: QueryRewriter
# QueryRewriter rewrites and transforms queries.
package QueryRewriter;

use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;

# A list of verbs that can appear in queries.  I know this is incomplete -- it
# does not have CREATE, DROP, ALTER, TRUNCATE for example.  But I don't need
# those for my client yet.  Other verbs: KILL, LOCK, UNLOCK
our $verbs   = qr{^SHOW|^FLUSH|^COMMIT|^ROLLBACK|^BEGIN|SELECT|INSERT
                  |UPDATE|DELETE|REPLACE|^SET|UNION|^START|^LOCK}xi;
my $quote_re = qr/"(?:(?!(?<!\\)").)*"|'(?:(?!(?<!\\)').)*'/; # Costly!
my $bal;
$bal         = qr/
                  \(
                  (?:
                     (?> [^()]+ )    # Non-parens without backtracking
                     |
                     (??{ $bal })    # Group with matching parens
                  )*
                  \)
                 /x;

# The one-line comment pattern is quite crude.  This is intentional for
# performance.  The multi-line pattern does not match version-comments.
my $olc_re = qr/(?:--|#)[^'"\r\n]*(?=[\r\n]|\Z)/;  # One-line comments
my $mlc_re = qr#/\*[^!].*?\*/#sm;                  # But not /*!version */
my $vlc_re = qr#/\*.*?[0-9]+.*?\*/#sm;             # For SHOW + /*!version */
my $vlc_rf = qr#^(?:SHOW).*?/\*![0-9]+(.*?)\*/#sm;     # Variation for SHOW


sub new {
   my ( $class, %args ) = @_;
   my $self = { %args };
   return bless $self, $class;
}

# Strips comments out of queries.
sub strip_comments {
   my ( $self, $query ) = @_;
   return unless $query;
   $query =~ s/$mlc_re//go;
   $query =~ s/$olc_re//go;
   if ( $query =~ m/$vlc_rf/i ) { # contains show + version
      my $qualifier = $1 || '';
      $query =~ s/$vlc_re/$qualifier/go;
   }
   return $query;
}

# Shortens long queries by normalizing stuff out of them.  $length is used only
# for IN() lists.  If $length is given, the query is shortened if it's longer
# than that.
sub shorten {
   my ( $self, $query, $length ) = @_;
   # Shorten multi-value insert/replace, all the way up to on duplicate key
   # update if it exists.
   $query =~ s{
      \A(
         (?:INSERT|REPLACE)
         (?:\s+LOW_PRIORITY|DELAYED|HIGH_PRIORITY|IGNORE)?
         (?:\s\w+)*\s+\S+\s+VALUES\s*\(.*?\)
      )
      \s*,\s*\(.*?(ON\s+DUPLICATE|\Z)}
      {$1 /*... omitted ...*/$2}xsi;

   # Shortcut!  Find out if there's an IN() list with values.
   return $query unless $query =~ m/IN\s*\(\s*(?!select)/i;

   # Shorten long IN() lists of literals.  But only if the string is longer than
   # the $length limit.  Assumption: values don't contain commas or closing
   # parens inside them.
   my $last_length  = 0;
   my $query_length = length($query);
   while (
      $length          > 0
      && $query_length > $length
      && $query_length < ( $last_length || $query_length + 1 )
   ) {
      $last_length = $query_length;
      $query =~ s{
         (\bIN\s*\()    # The opening of an IN list
         ([^\)]+)       # Contents of the list, assuming no item contains paren
         (?=\))           # Close of the list
      }
      {
         $1 . __shorten($2)
      }gexsi;
   }

   return $query;
}

# Used by shorten().  The argument is the stuff inside an IN() list.  The
# argument might look like this:
#  1,2,3,4,5,6
# Or, if this is a second or greater iteration, it could even look like this:
#  /*... omitted 5 items ...*/ 6,7,8,9
# In the second case, we need to trim out 6,7,8 and increment "5 items" to "8
# items".  We assume that the values in the list don't contain commas; if they
# do, the results could be a little bit wrong, but who cares.  We keep the first
# 20 items because we don't want to nuke all the samples from the query, we just
# want to shorten it.
sub __shorten {
   my ( $snippet ) = @_;
   my @vals = split(/,/, $snippet);
   return $snippet unless @vals > 20;
   my @keep = splice(@vals, 0, 20);  # Remove and save the first 20 items
   return
      join(',', @keep)
      . "/*... omitted "
      . scalar(@vals)
      . " items ...*/";
}

# Normalizes variable queries to a "query fingerprint" by abstracting away
# parameters, canonicalizing whitespace, etc.  See
# http://dev.mysql.com/doc/refman/5.0/en/literals.html for literal syntax.
# Note: Any changes to this function must be profiled for speed!  Speed of this
# function is critical for pt-query-digest.  There are known bugs in this,
# but the balance between maybe-you-get-a-bug and speed favors speed.
# See past Maatkit revisions of this subroutine for more correct, but slower,
# regexes.
sub fingerprint {
   my ( $self, $query ) = @_;

   # First, we start with a bunch of special cases that we can optimize because
   # they are special behavior or because they are really big and we want to
   # throw them away as early as possible.
   $query =~ m#\ASELECT /\*!40001 SQL_NO_CACHE \*/ \* FROM `# # mysqldump query
      && return 'mysqldump';
   # Matches queries like REPLACE /*foo.bar:3/3*/ INTO checksum.checksum
   $query =~ m#/\*\w+\.\w+:[0-9]/[0-9]\*/#     # pt-table-checksum, etc query
      && return 'percona-toolkit';
   # Administrator commands appear to be a comment, so return them as-is
   $query =~ m/\Aadministrator command: /
      && return $query;
   # Special-case for stored procedures.
   $query =~ m/\A\s*(call\s+\S+)\(/i
      && return lc($1); # Warning! $1 used, be careful.
   # mysqldump's INSERT statements will have long values() lists, don't waste
   # time on them... they also tend to segfault Perl on some machines when you
   # get to the "# Collapse IN() and VALUES() lists" regex below!
   if ( my ($beginning) = $query =~ m/\A((?:INSERT|REPLACE)(?: IGNORE)?\s+INTO.+?VALUES\s*\(.*?\))\s*,\s*\(/is ) {
      $query = $beginning; # Shorten multi-value INSERT statements ASAP
   }

   $query =~ s/$mlc_re//go;
   $query =~ s/$olc_re//go;
   $query =~ s/\Ause \S+\Z/use ?/i       # Abstract the DB in USE
      && return $query;

   # -----------------------------------------------------------
   # Remove quoted strings
   # -----------------------------------------------------------
   $query =~ s/([^\\])(\\')/$1/sg;
   $query =~ s/([^\\])(\\")/$1/sg;
   $query =~ s/\\\\//sg;
   $query =~ s/\\'//sg;
   $query =~ s/\\"//sg;
   $query =~ s/([^\\])(".*?[^\\]?")/$1?/sg;
   $query =~ s/([^\\])('.*?[^\\]?')/$1?/sg;
   # -----------------------------------------------------------

   $query =~ s/\bfalse\b|\btrue\b/?/isg; # boolean values

   # MD5 checksums which are always 32 hex chars
   if ( $self->{match_md5_checksums} ) {
      $query =~ s/([._-])[a-f0-9]{32}/$1?/g;
   }

   # Things resembling numbers/hex.
   if ( !$self->{match_embedded_numbers} ) {
      # For speed, this regex is extremely broad in its definition
      # of what looks like a number.
      $query =~ s/[0-9+-][0-9a-f.xb+-]*/?/g;
   }
   else {
      $query =~ s/\b[0-9+-][0-9a-f.xb+-]*/?/g;
   }

   # Clean up leftovers
   if ( $self->{match_md5_checksums} ) {
      $query =~ s/[xb+-]\?/?/g;
   }
   else {
      $query =~ s/[xb.+-]\?/?/g;
   }

   $query =~ s/\A\s+//;                  # Chop off leading whitespace
   chomp $query;                         # Kill trailing whitespace
   $query =~ tr[ \n\t\r\f][ ]s;          # Collapse whitespace
   $query = lc $query;
   $query =~ s/\bnull\b/?/g;             # Get rid of NULLs
   $query =~ s{                          # Collapse IN and VALUES lists
               \b(in|values?)(?:[\s,]*\([\s?,]*\))+
              }
              {$1(?+)}gx;
   $query =~ s{                          # Collapse UNION
               \b(select\s.*?)(?:(\sunion(?:\sall)?)\s\1)+
              }
              {$1 /*repeat$2*/}xg;
   $query =~ s/\blimit \?(?:, ?\?| offset \?)?/limit ?/; # LIMIT
   # The following are disabled because of speed issues.  Should we try to
   # normalize whitespace between and around operators?  My gut feeling is no.
   # $query =~ s/ , | ,|, /,/g;                # Normalize commas
   # $query =~ s/ = | =|= /=/g;                # Normalize equals
   # $query =~ s# [,=+*/-] ?|[,=+*/-] #+#g;    # Normalize operators

   # Remove ASC keywords from ORDER BY clause so these queries fingerprint
   # the same:
   #   SELECT * FROM `products`  ORDER BY name ASC, shape ASC;
   #   SELECT * FROM `products`  ORDER BY name, shape;
   # ASC is default so "ORDER BY col ASC" is really the same as just
   # "ORDER BY col".
   # http://code.google.com/p/maatkit/issues/detail?id=1030
   if ( $query =~ m/\bORDER BY /gi ) {  # Find, anchor on ORDER BY clause
      # Replace any occurrence of "ASC" after anchor until end of query.
      # I have verified this with regex debug: it's a single forward pass
      # without backtracking.  Probably as fast as it gets.
      1 while $query =~ s/\G(.+?)\s+ASC/$1/gi && pos $query;
   }

   return $query;
}

# Gets the verbs from an SQL query, such as SELECT, UPDATE, etc.
sub distill_verbs {
   my ( $self, $query ) = @_;

   # Simple verbs that normally don't have comments, extra clauses, etc.
   $query =~ m/\A\s*call\s+(\S+)\(/i && return "CALL $1";
   $query =~ m/\A\s*use\s+/          && return "USE";
   $query =~ m/\A\s*UNLOCK TABLES/i  && return "UNLOCK";
   $query =~ m/\A\s*xa\s+(\S+)/i     && return "XA_$1";

   if ( $query =~ m/\A\s*LOAD/i ) {
      my ($tbl) = $query =~ m/INTO TABLE\s+(\S+)/i;
      $tbl ||= '';
      $tbl =~ s/`//g;
      return "LOAD DATA $tbl";
   }

   if ( $query =~ m/\Aadministrator command:/ ) {
      $query =~ s/administrator command:/ADMIN/;
      $query = uc $query;
      return $query;
   }

   # All other, more complex verbs.
   $query = $self->strip_comments($query);

   # SHOW statements are either 2 or 3 words: SHOW A (B), where A and B
   # are words; B is optional.  E.g. "SHOW TABLES" or "SHOW REPLICA STATUS".
   # There's a few common keywords that may show up in place of A, so we
   # remove them first.  Then there's some keywords that signify extra clauses
   # that may show up in place of B and since these clauses are at the
   # end of the statement, we remove everything from the clause onward.
   if ( $query =~ m/\A\s*SHOW\s+/i ) {
      PTDEBUG && _d($query);

      # Remove common keywords.
      $query = uc $query;
      $query =~ s/\s+(?:SESSION|FULL|STORAGE|ENGINE)\b/ /g;
      # This should be in the regex above but Perl doesn't seem to match
      # COUNT\(.+\) properly when it's grouped.
      $query =~ s/\s+COUNT[^)]+\)//g;

      # Remove clause keywords and everything after.
      $query =~ s/\s+(?:FOR|FROM|LIKE|WHERE|LIMIT|IN)\b.+//ms;

      # The query should now be like SHOW A B C ... delete everything after B,
      # collapse whitespace, and we're done.
      $query =~ s/\A(SHOW(?:\s+\S+){1,2}).*\Z/$1/s;
      $query =~ s/\s+/ /g;
      PTDEBUG && _d($query);
      return $query;
   }

   # Data definition statements verbs like CREATE and ALTER.
   # The two evals are a hack to keep Perl from warning that
   # "QueryParser::data_def_stmts" used only once: possible typo at...".
   # Some day we'll group all our common regex together in a packet and
   # export/import them properly.
   eval $QueryParser::data_def_stmts;
   eval $QueryParser::tbl_ident;
   my ( $dds ) = $query =~ /^\s*($QueryParser::data_def_stmts)\b/i;
   if ( $dds) {
      # https://bugs.launchpad.net/percona-toolkit/+bug/821690
      $query =~ s/\s+IF(?:\s+NOT)?\s+EXISTS/ /i;
      my ( $obj ) = $query =~ m/$dds.+(DATABASE|TABLE)\b/i;
      $obj = uc $obj if $obj;
      PTDEBUG && _d('Data def statement:', $dds, 'obj:', $obj);
      my ($db_or_tbl)
         = $query =~ m/(?:TABLE|DATABASE)\s+($QueryParser::tbl_ident)(\s+.*)?/i;
      PTDEBUG && _d('Matches db or table:', $db_or_tbl);
      return uc($dds . ($obj ? " $obj" : '')), $db_or_tbl;
   }

   # All other verbs, like SELECT, INSERT, UPDATE, etc.  First, get
   # the query type -- just extract all the verbs and collapse them
   # together.
   my @verbs = $query =~ m/\b($verbs)\b/gio;
   @verbs    = do {
      my $last = '';
      grep { my $pass = $_ ne $last; $last = $_; $pass } map { uc } @verbs;
   };

   # http://code.google.com/p/maatkit/issues/detail?id=1176
   # A SELECT query can't have any verbs other than UNION.
   # Subqueries (SELECT SELECT) are reduced to 1 SELECT in the
   # do loop above.  And there's no valid SQL syntax like
   # SELECT ... DELETE (there are valid multi-verb syntaxes, like
   # INSERT ... SELECT).  So if it's a SELECT with multiple verbs,
   # we need to check it else SELECT c FROM t WHERE col='delete'
   # will incorrectly distill as SELECT DELETE t.
   if ( ($verbs[0] || '') eq 'SELECT' && @verbs > 1 ) {
      PTDEBUG && _d("False-positive verbs after SELECT:", @verbs[1..$#verbs]);
      my $union = grep { $_ eq 'UNION' } @verbs;
      @verbs    = $union ? qw(SELECT UNION) : qw(SELECT);
   }

   # This used to be "my $verbs" but older versions of Perl complain that
   # ""my" variable $verbs masks earlier declaration in same scope" where
   # the earlier declaration is our $verbs.
   # http://code.google.com/p/maatkit/issues/detail?id=957
   my $verb_str = join(q{ }, @verbs);
   return $verb_str;
}

sub __distill_tables {
   my ( $self, $query, $table, %args ) = @_;
   my $qp = $args{QueryParser} || $self->{QueryParser};
   die "I need a QueryParser argument" unless $qp;

   # "Fingerprint" the tables.
   my @tables = map {
      $_ =~ s/`//g;
      $_ =~ s/(_?)[0-9]+/$1?/g;
      $_;
   } grep { defined $_ } $qp->get_tables($query);

   push @tables, $table if $table;

   # Collapse the table list
   @tables = do {
      my $last = '';
      grep { my $pass = $_ ne $last; $last = $_; $pass } @tables;
   };

   return @tables;
}

# This is kind of like fingerprinting, but it super-fingerprints to something
# that shows the query type and the tables/objects it accesses.
sub distill {
   my ( $self, $query, %args ) = @_;

   if ( $args{generic} ) {
      # Do a generic distillation which returns the first two words
      # of a simple "cmd arg" query, like memcached and HTTP stuff.
      my ($cmd, $arg) = $query =~ m/^(\S+)\s+(\S+)/;
      return '' unless $cmd;
      $query = (uc $cmd) . ($arg ? " $arg" : '');
   }
   else {
      # distill_verbs() may return a table if it's a special statement
      # like TRUNCATE TABLE foo.  __distill_tables() handles some but not
      # all special statements so we pass the special table from distill_verbs()
      # to __distill_tables() in case it's a statement that the latter
      # can't handle.  If it can handle it, it will eliminate any duplicate
      # tables.
      my ($verbs, $table)  = $self->distill_verbs($query, %args);

      if ( $verbs && $verbs =~ m/^SHOW/ ) {
         # Ignore tables for SHOW statements and normalize some
         # aliases like SCHEMA==DATABASE, KEYS==INDEX.
         my %alias_for = qw(
            SCHEMA   DATABASE
            KEYS     INDEX
            INDEXES  INDEX
         );
         map { $verbs =~ s/$_/$alias_for{$_}/ } keys %alias_for;
         $query = $verbs;
      }
      elsif ( $verbs && $verbs =~ m/^LOAD DATA/ ) {
         return $verbs;
      }
      else {
         # For everything else, distill the tables.
         my @tables = $self->__distill_tables($query, $table, %args);
         $query     = join(q{ }, $verbs, @tables);
      }
   }

   if ( $args{trf} ) {
      $query = $args{trf}->($query, %args);
   }

   return $query;
}

sub convert_to_select {
   my ( $self, $query ) = @_;
   return unless $query;

   # Trying to convert statements that have subqueries as values to column
   # assignments doesn't work.  E.g. SET col=(SELECT ...).  But subqueries
   # do work in other cases like JOIN (SELECT ...).
   # http://code.google.com/p/maatkit/issues/detail?id=347
   return if $query =~ m/=\s*\(\s*SELECT /i;

   $query =~ s{
                 \A.*?
                 update(?:\s+(?:low_priority|ignore))?\s+(.*?)
                 \s+set\b(.*?)
                 (?:\s*where\b(.*?))?
                 (limit\s*[0-9]+(?:\s*,\s*[0-9]+)?)?
                 \Z
              }
              {__update_to_select($1, $2, $3, $4)}exsi
      # INSERT|REPLACE tbl (cols) VALUES (vals)
      || $query =~ s{
                    \A.*?
                    (?:insert(?:\s+ignore)?|replace)\s+
                    .*?\binto\b(.*?)\(([^\)]+)\)\s*
                    values?\s*(\(.*?\))\s*
                    (?:\blimit\b|on\s+duplicate\s+key.*)?\s*
                    \Z
                 }
                 {__insert_to_select($1, $2, $3)}exsi
      # INSERT|REPLACE tbl SET vals
      || $query =~ s{
                    \A.*?
                    (?:insert(?:\s+ignore)?|replace)\s+
                    (?:.*?\binto)\b(.*?)\s*
                    set\s+(.*?)\s*
                    (?:\blimit\b|on\s+duplicate\s+key.*)?\s*
                    \Z
                 }
                 {__insert_to_select_with_set($1, $2)}exsi
      || $query =~ s{
                    \A.*?
                    delete\s+(.*?)
                    \bfrom\b(.*)
                    \Z
                 }
                 {__delete_to_select($1, $2)}exsi;
   $query =~ s/\s*on\s+duplicate\s+key\s+update.*\Z//si;
   $query =~ s/\A.*?(?=\bSELECT\s*\b)//ism;
   return $query;
}

sub convert_select_list {
   my ( $self, $query ) = @_;
   $query =~ s{
               \A\s*select(.*?)\bfrom\b
              }
              {$1 =~ m/\*/ ? "select 1 from" : "select isnull(coalesce($1)) from"}exi;
   return $query;
}

sub __delete_to_select {
   my ( $delete, $join ) = @_;
   if ( $join =~ m/\bjoin\b/ ) {
      return "select 1 from $join";
   }
   return "select * from $join";
}

sub __insert_to_select {
   my ( $tbl, $cols, $vals ) = @_;
   PTDEBUG && _d('Args:', @_);
   my @cols = split(/,/, $cols);
   PTDEBUG && _d('Cols:', @cols);
   $vals =~ s/^\(|\)$//g; # Strip leading/trailing parens
   my @vals = $vals =~ m/($quote_re|[^,]*${bal}[^,]*|[^,]+)/g;
   PTDEBUG && _d('Vals:', @vals);
   if ( @cols == @vals ) {
      return "select * from $tbl where "
         . join(' and ', map { "$cols[$_]=$vals[$_]" } (0..$#cols));
   }
   else {
      return "select * from $tbl limit 1";
   }
}

sub __insert_to_select_with_set {
   my ( $from, $set ) = @_;
   $set =~ s/,/ and /g;
   return "select * from $from where $set ";
}

sub __update_to_select {
   my ( $from, $set, $where, $limit ) = @_;
   return "select $set from $from "
      . ( $where ? "where $where" : '' )
      . ( $limit ? " $limit "      : '' );
}

sub wrap_in_derived {
   my ( $self, $query ) = @_;
   return unless $query;
   return $query =~ m/\A\s*select/i
      ? "select 1 from ($query) as x limit 1"
      : $query;
}

sub _d {
   my ($package, undef, $line) = caller 0;
   @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
        map { defined $_ ? $_ : 'undef' }
        @_;
   print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
}

1;
}
# ###########################################################################
# End QueryRewriter package
# ###########################################################################