# This program is copyright 2008-2011 Percona Ireland Ltd. # Feedback and improvements are welcome. # # THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. # # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation, version 2; OR the Perl Artistic License. On UNIX and similar # systems, you can issue `man perlgpl' or `man perlartistic' to read these # licenses. # # You should have received a copy of the GNU General Public License along with # this program; if not, write to the Free Software Foundation, Inc., 59 Temple # Place, Suite 330, Boston, MA 02111-1307 USA. # ########################################################################### # QueryParser package # ########################################################################### { # Package: QueryParser # QueryParser extracts parts of SQL statements, like table lists and subqueries. # This package differs from SQLParser because it only extracts from a query # what is needed and only when that can be accomplished rather simply. By # contrast, SQLParser parses the entire SQL statement no matter the complexity. package QueryParser; use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); use constant PTDEBUG => $ENV{PTDEBUG} || 0; our $tbl_ident = qr/(?:`[^`]+`|\w+)(?:\.(?:`[^`]+`|\w+))?/; # This regex finds things that look like database.table identifiers, based on # their proximity to keywords. (?get_tables($select); } my ($tbl) = $query =~ m/TABLE\s+($tbl_ident)(\s+.*)?/i; PTDEBUG && _d('Matches table:', $tbl); return ($tbl); } # These keywords may appear between UPDATE or SELECT and the table refs. # They need to be removed so that they are not mistaken for tables. $query =~ s/(?:LOW_PRIORITY|IGNORE|STRAIGHT_JOIN|DELAYED)\s+/ /ig; # Another special case: LOCK TABLES tbl [[AS] alias] READ|WRITE, etc. # We strip the LOCK TABLES stuff and append "FROM" to fake a SELECT # statement and allow $tbl_regex to match below. if ( $query =~ s/^\s*LOCK TABLES\s+//i ) { PTDEBUG && _d('Special table type: LOCK TABLES'); $query =~ s/\s+(?:READ(?:\s+LOCAL)?|WRITE)\s*//gi; PTDEBUG && _d('Locked tables:', $query); $query = "FROM $query"; } $query =~ s/\\["']//g; # quoted strings $query =~ s/".*?"/?/sg; # quoted strings $query =~ s/'.*?'/?/sg; # quoted strings # INSERT and REPLACE without INTO # https://bugs.launchpad.net/percona-toolkit/+bug/984053 if ( $query =~ m/\A\s*(?:INSERT|REPLACE)(?!\s+INTO)/i ) { # Add INTO so the reset of the code work as usual. $query =~ s/\A\s*((?:INSERT|REPLACE))\s+/$1 INTO /i; } if ( $query =~ m/\A\s*LOAD DATA/i ) { my ($tbl) = $query =~ m/INTO TABLE\s+(\S+)/i; return $tbl; } my @tables; foreach my $tbls ( $query =~ m/$tbl_regex/gio ) { PTDEBUG && _d('Match tables:', $tbls); # Some queries coming from certain ORM systems will have superfluous # parens around table names, like SELECT * FROM (`mytable`); We match # these so the table names can be extracted more simply with regexes. But # in case of subqueries, this can cause us to match SELECT as a table # name, for example, in SELECT * FROM (SELECT ....) AS X; It's possible # that SELECT is really a table name, but so unlikely that we just skip # this case. next if $tbls =~ m/\ASELECT\b/i; foreach my $tbl ( split(',', $tbls) ) { # Remove implicit or explicit (AS) alias. $tbl =~ s/\s*($tbl_ident)(\s+.*)?/$1/gio; # Sanity check for cases like when a column is named `from` # and the regex matches junk. Instead of complex regex to # match around these rarities, this simple check will save us. if ( $tbl !~ m/[a-zA-Z]/ ) { PTDEBUG && _d('Skipping suspicious table name:', $tbl); next; } push @tables, $tbl; } } return @tables; } # Returns true if it sees what looks like a "derived table", e.g. a subquery in # the FROM clause. sub has_derived_table { my ( $self, $query ) = @_; # See the $tbl_regex regex above. my $match = $query =~ m/$has_derived/; PTDEBUG && _d($query, 'has ' . ($match ? 'a' : 'no') . ' derived table'); return $match; } # Return a data structure of tables/databases and the name they're aliased to. # Given the following query, SELECT * FROM db.tbl AS foo; the structure is: # { TABLE => { foo => tbl }, DATABASE => { tbl => db } } # If $list is true, then a flat list of tables found in the query is returned # instead. This is used for things that want to know what tables the query # touches, but don't care about aliases. sub get_aliases { my ( $self, $query, $list ) = @_; # This is the basic result every query must return. my $result = { DATABASE => {}, TABLE => {}, }; return $result unless $query; # These keywords may appear between UPDATE or SELECT and the table refs. # They need to be removed so that they are not mistaken for tables. $query =~ s/ (?:LOW_PRIORITY|IGNORE|STRAIGHT_JOIN)//ig; # These keywords may appear before JOIN. They need to be removed so # that they are not mistaken for implicit aliases of the preceding table. $query =~ s/ (?:INNER|OUTER|CROSS|LEFT|RIGHT|NATURAL)//ig; # Get the table references clause and the keyword that starts the clause. # See the comments below for why we need the starting keyword. my @tbl_refs; my ($tbl_refs, $from) = $query =~ m{ ( (FROM|INTO|UPDATE)\b\s* # Keyword before table refs .+? # Table refs ) (?:\s+|\z) # If the query does not end with the table # refs then there must be at least 1 space # between the last tbl ref and the next # keyword (?:WHERE|ORDER|LIMIT|HAVING|SET|VALUES|\z) # Keyword after table refs }ix; if ( $tbl_refs ) { if ( $query =~ m/^(?:INSERT|REPLACE)/i ) { # Remove optional columns def from INSERT/REPLACE. $tbl_refs =~ s/\([^\)]+\)\s*//; } PTDEBUG && _d('tbl refs:', $tbl_refs); # These keywords precede a table ref. They signal the start of a table # ref, but to know where the table ref ends we need the after tbl ref # keywords below. my $before_tbl = qr/(?:,|JOIN|\s|$from)+/i; # These keywords signal the end of a table ref and either 1) the start # of another table ref, or 2) the start of an ON|USING part of a JOIN # clause (which we want to skip over), or 3) the end of the string (\z). # We need these after tbl ref keywords so that they are not mistaken # for implicit aliases of the preceding table. my $after_tbl = qr/(?:,|JOIN|ON|USING|\z)/i; # This is required for cases like: # FROM t1 JOIN t2 ON t1.col1=t2.col2 JOIN t3 ON t2.col3 = t3.col4 # Because spaces may precede a tbl and a tbl may end with \z, then # t3.col4 will match as a table. However, t2.col3=t3.col4 will not match. $tbl_refs =~ s/ = /=/g; while ( $tbl_refs =~ m{ $before_tbl\b\s* ( ($tbl_ident) (?:\s+ (?:AS\s+)? (\w+))? ) \s*$after_tbl }xgio ) { my ( $tbl_ref, $db_tbl, $alias ) = ($1, $2, $3); PTDEBUG && _d('Match table:', $tbl_ref); push @tbl_refs, $tbl_ref; $alias = $self->trim_identifier($alias); # Handle subqueries. if ( $tbl_ref =~ m/^AS\s+\w+/i ) { # According to the manual # http://dev.mysql.com/doc/refman/5.0/en/unnamed-views.html: # "The [AS] name clause is mandatory, because every table in a # FROM clause must have a name." # So if the tbl ref begins with 'AS', then we probably have a # subquery. PTDEBUG && _d('Subquery', $tbl_ref); $result->{TABLE}->{$alias} = undef; next; } my ( $db, $tbl ) = $db_tbl =~ m/^(?:(.*?)\.)?(.*)/; $db = $self->trim_identifier($db); $tbl = $self->trim_identifier($tbl); $result->{TABLE}->{$alias || $tbl} = $tbl; $result->{DATABASE}->{$tbl} = $db if $db; } } else { PTDEBUG && _d("No tables ref in", $query); } if ( $list ) { # Return raw text of the tbls without aliases, instead of identifier # mappings. Include all identifier quotings and such. return \@tbl_refs; } else { return $result; } } # Splits a compound statement and returns an array with each sub-statement. # Example: # INSERT INTO ... SELECT ... # is split into two statements: "INSERT INTO ..." and "SELECT ...". sub split { my ( $self, $query ) = @_; return unless $query; $query = $self->clean_query($query); PTDEBUG && _d('Splitting', $query); my $verbs = qr{SELECT|INSERT|UPDATE|DELETE|REPLACE|UNION|CREATE}i; # This splits a statement on the above verbs which means that the verb # gets chopped out. Capturing the verb (e.g. ($verb)) will retain it, # but then it's disjointed from its statement. Example: for this query, # INSERT INTO ... SELECT ... # split returns ('INSERT', 'INTO ...', 'SELECT', '...'). Therefore, # we must re-attach each verb to its statement; we do this later... my @split_statements = grep { $_ } split(m/\b($verbs\b(?!(?:\s*\()))/io, $query); my @statements; if ( @split_statements == 1 ) { # This happens if the query has no verbs, so it's probably a single # statement. push @statements, $query; } else { # ...Re-attach verbs to their statements. for ( my $i = 0; $i <= $#split_statements; $i += 2 ) { push @statements, $split_statements[$i].$split_statements[$i+1]; # Variable-width negative look-behind assertions, (? to make it more clear where each one begins/ends. PTDEBUG && _d('statements:', map { $_ ? "<$_>" : 'none' } @statements); return @statements; } sub clean_query { my ( $self, $query ) = @_; return unless $query; $query =~ s!/\*.*?\*/! !g; # Remove /* comment blocks */ $query =~ s/^\s+//; # Remove leading spaces $query =~ s/\s+$//; # Remove trailing spaces $query =~ s/\s{2,}/ /g; # Remove extra spaces return $query; } sub split_subquery { my ( $self, $query ) = @_; return unless $query; $query = $self->clean_query($query); $query =~ s/;$//; my @subqueries; my $sqno = 0; # subquery number my $pos = 0; while ( $query =~ m/(\S+)(?:\s+|\Z)/g ) { $pos = pos($query); my $word = $1; PTDEBUG && _d($word, $sqno); if ( $word =~ m/^\(?SELECT\b/i ) { my $start_pos = $pos - length($word) - 1; if ( $start_pos ) { $sqno++; PTDEBUG && _d('Subquery', $sqno, 'starts at', $start_pos); $subqueries[$sqno] = { start_pos => $start_pos, end_pos => 0, len => 0, words => [$word], lp => 1, # left parentheses rp => 0, # right parentheses done => 0, }; } else { PTDEBUG && _d('Main SELECT at pos 0'); } } else { next unless $sqno; # next unless we're in a subquery PTDEBUG && _d('In subquery', $sqno); my $sq = $subqueries[$sqno]; if ( $sq->{done} ) { PTDEBUG && _d('This subquery is done; SQL is for', ($sqno - 1 ? "subquery $sqno" : "the main SELECT")); next; } push @{$sq->{words}}, $word; my $lp = ($word =~ tr/\(//) || 0; my $rp = ($word =~ tr/\)//) || 0; PTDEBUG && _d('parentheses left', $lp, 'right', $rp); if ( ($sq->{lp} + $lp) - ($sq->{rp} + $rp) == 0 ) { my $end_pos = $pos - 1; PTDEBUG && _d('Subquery', $sqno, 'ends at', $end_pos); $sq->{end_pos} = $end_pos; $sq->{len} = $end_pos - $sq->{start_pos}; } } } for my $i ( 1..$#subqueries ) { my $sq = $subqueries[$i]; next unless $sq; $sq->{sql} = join(' ', @{$sq->{words}}); substr $query, $sq->{start_pos} + 1, # +1 for ( $sq->{len} - 1, # -1 for ) "__subquery_$i"; } return $query, map { $_->{sql} } grep { defined $_ } @subqueries; } sub query_type { my ( $self, $query, $qr ) = @_; my ($type, undef) = $qr->distill_verbs($query); my $rw; if ( $type =~ m/^SELECT\b/ ) { $rw = 'read'; } elsif ( $type =~ m/^$data_manip_stmts\b/ || $type =~ m/^$data_def_stmts\b/ ) { $rw = 'write' } return { type => $type, rw => $rw, } } sub get_columns { my ( $self, $query ) = @_; my $cols = []; return $cols unless $query; my $cols_def; if ( $query =~ m/^SELECT/i ) { $query =~ s/ ^SELECT\s+ (?:ALL |DISTINCT |DISTINCTROW |HIGH_PRIORITY |STRAIGHT_JOIN |SQL_SMALL_RESULT |SQL_BIG_RESULT |SQL_BUFFER_RESULT |SQL_CACHE |SQL_NO_CACHE |SQL_CALC_FOUND_ROWS )\s+ /SELECT /xgi; ($cols_def) = $query =~ m/^SELECT\s+(.+?)\s+FROM/i; } elsif ( $query =~ m/^(?:INSERT|REPLACE)/i ) { ($cols_def) = $query =~ m/\(([^\)]+)\)\s*VALUE/i; } PTDEBUG && _d('Columns:', $cols_def); if ( $cols_def ) { @$cols = split(',', $cols_def); map { my $col = $_; $col = s/^\s+//g; $col = s/\s+$//g; $col; } @$cols; } return $cols; } sub parse { my ( $self, $query ) = @_; return unless $query; my $parsed = {}; # Flatten and clean query. $query =~ s/\n/ /g; $query = $self->clean_query($query); $parsed->{query} = $query, $parsed->{tables} = $self->get_aliases($query, 1); $parsed->{columns} = $self->get_columns($query); my ($type) = $query =~ m/^(\w+)/; $parsed->{type} = lc $type; # my @words = $query =~ m/ # [A-Za-z_.]+\(.*?\)+ # Match FUNCTION(...) # |\(.*?\)+ # Match grouped items # |"(?:[^"]|\"|"")*"+ # Match double quotes # |'[^'](?:|\'|'')*'+ # and single quotes # |`(?:[^`]|``)*`+ # and backticks # |[^ ,]+ # |, #/gx; $parsed->{sub_queries} = []; return $parsed; } # Returns an array of arrayrefs like [db,tbl] for each unique db.tbl # in the query and its subqueries. db may be undef. sub extract_tables { my ( $self, %args ) = @_; my $query = $args{query}; my $default_db = $args{default_db}; my $q = $self->{Quoter} || $args{Quoter}; return unless $query; PTDEBUG && _d('Extracting tables'); my @tables; my %seen; foreach my $db_tbl ( $self->get_tables($query) ) { next unless $db_tbl; next if $seen{$db_tbl}++; # Unique-ify for issue 337. my ( $db, $tbl ) = $q->split_unquote($db_tbl); push @tables, [ $db || $default_db, $tbl ]; } return @tables; } # This is a special trim function that removes whitespace and identifier-quotes # (backticks, in the case of MySQL) from the string. sub trim_identifier { my ($self, $str) = @_; return unless defined $str; $str =~ s/`//g; $str =~ s/^\s+//; $str =~ s/\s+$//; return $str; } sub _d { my ($package, undef, $line) = caller 0; @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } map { defined $_ ? $_ : 'undef' } @_; print STDERR "# $package:$line $PID ", join(' ', @_), "\n"; } 1; } # ########################################################################### # End QueryParser package # ###########################################################################