Files
percona-toolkit/lib/QueryParser.pm
2013-08-03 12:11:56 -07:00

539 lines
18 KiB
Perl

# This program is copyright 2008-2011 Percona Ireland Ltd.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# QueryParser package
# ###########################################################################
{
# Package: QueryParser
# QueryParser extracts parts of SQL statements, like table lists and subqueries.
# This package differs from SQLParser because it only extracts from a query
# what is needed and only when that can be accomplished rather simply. By
# contrast, SQLParser parses the entire SQL statement no matter the complexity.
package QueryParser;
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
our $tbl_ident = qr/(?:`[^`]+`|\w+)(?:\.(?:`[^`]+`|\w+))?/;
# This regex finds things that look like database.table identifiers, based on
# their proximity to keywords. (?<!KEY\s) is a workaround for ON DUPLICATE KEY
# UPDATE, which is usually followed by a column name.
our $tbl_regex = qr{
\b(?:FROM|JOIN|(?<!KEY\s)UPDATE|INTO) # Words that precede table names
\b\s*
\(? # Optional paren around tables
# Capture the identifier and any number of comma-join identifiers that
# follow it, optionally with aliases with or without the AS keyword
($tbl_ident
(?: (?:\s+ (?:AS\s+)? \w+)?, \s*$tbl_ident )*
)
}xio;
# This regex is meant to match "derived table" queries, of the form
# .. from ( select ...
# .. join ( select ...
# .. bar join foo, ( select ...
# Unfortunately it'll also match this:
# select a, b, (select ...
our $has_derived = qr{
\b(?:FROM|JOIN|,)
\s*\(\s*SELECT
}xi;
# http://dev.mysql.com/doc/refman/5.1/en/sql-syntax-data-definition.html
# We treat TRUNCATE as a dds but really it's a data manipulation statement.
our $data_def_stmts = qr/(?:CREATE|ALTER|TRUNCATE|DROP|RENAME)/i;
# http://dev.mysql.com/doc/refman/5.1/en/sql-syntax-data-manipulation.html
# Data manipulation statements.
our $data_manip_stmts = qr/(?:INSERT|UPDATE|DELETE|REPLACE)/i;
sub new {
my ( $class ) = @_;
bless {}, $class;
}
# Returns a list of table names found in the query text.
sub get_tables {
my ( $self, $query ) = @_;
return unless $query;
PTDEBUG && _d('Getting tables for', $query);
# Handle CREATE, ALTER, TRUNCATE and DROP TABLE.
my ( $ddl_stmt ) = $query =~ m/^\s*($data_def_stmts)\b/i;
if ( $ddl_stmt ) {
PTDEBUG && _d('Special table type:', $ddl_stmt);
$query =~ s/IF\s+(?:NOT\s+)?EXISTS//i;
if ( $query =~ m/$ddl_stmt DATABASE\b/i ) {
# Handles CREATE DATABASE, not to be confused with CREATE TABLE.
PTDEBUG && _d('Query alters a database, not a table');
return ();
}
if ( $ddl_stmt =~ m/CREATE/i && $query =~ m/$ddl_stmt\b.+?\bSELECT\b/i ) {
# Handle CREATE TABLE ... SELECT. In this case, the real tables
# come from the SELECT, not the CREATE.
my ($select) = $query =~ m/\b(SELECT\b.+)/is;
PTDEBUG && _d('CREATE TABLE ... SELECT:', $select);
return $self->get_tables($select);
}
my ($tbl) = $query =~ m/TABLE\s+($tbl_ident)(\s+.*)?/i;
PTDEBUG && _d('Matches table:', $tbl);
return ($tbl);
}
# These keywords may appear between UPDATE or SELECT and the table refs.
# They need to be removed so that they are not mistaken for tables.
$query =~ s/(?:LOW_PRIORITY|IGNORE|STRAIGHT_JOIN|DELAYED)\s+/ /ig;
# Another special case: LOCK TABLES tbl [[AS] alias] READ|WRITE, etc.
# We strip the LOCK TABLES stuff and append "FROM" to fake a SELECT
# statement and allow $tbl_regex to match below.
if ( $query =~ s/^\s*LOCK TABLES\s+//i ) {
PTDEBUG && _d('Special table type: LOCK TABLES');
$query =~ s/\s+(?:READ(?:\s+LOCAL)?|WRITE)\s*//gi;
PTDEBUG && _d('Locked tables:', $query);
$query = "FROM $query";
}
$query =~ s/\\["']//g; # quoted strings
$query =~ s/".*?"/?/sg; # quoted strings
$query =~ s/'.*?'/?/sg; # quoted strings
# INSERT and REPLACE without INTO
# https://bugs.launchpad.net/percona-toolkit/+bug/984053
if ( $query =~ m/\A\s*(?:INSERT|REPLACE)(?!\s+INTO)/i ) {
# Add INTO so the reset of the code work as usual.
$query =~ s/\A\s*((?:INSERT|REPLACE))\s+/$1 INTO /i;
}
if ( $query =~ m/\A\s*LOAD DATA/i ) {
my ($tbl) = $query =~ m/INTO TABLE\s+(\S+)/i;
return $tbl;
}
my @tables;
foreach my $tbls ( $query =~ m/$tbl_regex/gio ) {
PTDEBUG && _d('Match tables:', $tbls);
# Some queries coming from certain ORM systems will have superfluous
# parens around table names, like SELECT * FROM (`mytable`); We match
# these so the table names can be extracted more simply with regexes. But
# in case of subqueries, this can cause us to match SELECT as a table
# name, for example, in SELECT * FROM (SELECT ....) AS X; It's possible
# that SELECT is really a table name, but so unlikely that we just skip
# this case.
next if $tbls =~ m/\ASELECT\b/i;
foreach my $tbl ( split(',', $tbls) ) {
# Remove implicit or explicit (AS) alias.
$tbl =~ s/\s*($tbl_ident)(\s+.*)?/$1/gio;
# Sanity check for cases like when a column is named `from`
# and the regex matches junk. Instead of complex regex to
# match around these rarities, this simple check will save us.
if ( $tbl !~ m/[a-zA-Z]/ ) {
PTDEBUG && _d('Skipping suspicious table name:', $tbl);
next;
}
push @tables, $tbl;
}
}
return @tables;
}
# Returns true if it sees what looks like a "derived table", e.g. a subquery in
# the FROM clause.
sub has_derived_table {
my ( $self, $query ) = @_;
# See the $tbl_regex regex above.
my $match = $query =~ m/$has_derived/;
PTDEBUG && _d($query, 'has ' . ($match ? 'a' : 'no') . ' derived table');
return $match;
}
# Return a data structure of tables/databases and the name they're aliased to.
# Given the following query, SELECT * FROM db.tbl AS foo; the structure is:
# { TABLE => { foo => tbl }, DATABASE => { tbl => db } }
# If $list is true, then a flat list of tables found in the query is returned
# instead. This is used for things that want to know what tables the query
# touches, but don't care about aliases.
sub get_aliases {
my ( $self, $query, $list ) = @_;
# This is the basic result every query must return.
my $result = {
DATABASE => {},
TABLE => {},
};
return $result unless $query;
# These keywords may appear between UPDATE or SELECT and the table refs.
# They need to be removed so that they are not mistaken for tables.
$query =~ s/ (?:LOW_PRIORITY|IGNORE|STRAIGHT_JOIN)//ig;
# These keywords may appear before JOIN. They need to be removed so
# that they are not mistaken for implicit aliases of the preceding table.
$query =~ s/ (?:INNER|OUTER|CROSS|LEFT|RIGHT|NATURAL)//ig;
# Get the table references clause and the keyword that starts the clause.
# See the comments below for why we need the starting keyword.
my @tbl_refs;
my ($tbl_refs, $from) = $query =~ m{
(
(FROM|INTO|UPDATE)\b\s* # Keyword before table refs
.+? # Table refs
)
(?:\s+|\z) # If the query does not end with the table
# refs then there must be at least 1 space
# between the last tbl ref and the next
# keyword
(?:WHERE|ORDER|LIMIT|HAVING|SET|VALUES|\z) # Keyword after table refs
}ix;
if ( $tbl_refs ) {
if ( $query =~ m/^(?:INSERT|REPLACE)/i ) {
# Remove optional columns def from INSERT/REPLACE.
$tbl_refs =~ s/\([^\)]+\)\s*//;
}
PTDEBUG && _d('tbl refs:', $tbl_refs);
# These keywords precede a table ref. They signal the start of a table
# ref, but to know where the table ref ends we need the after tbl ref
# keywords below.
my $before_tbl = qr/(?:,|JOIN|\s|$from)+/i;
# These keywords signal the end of a table ref and either 1) the start
# of another table ref, or 2) the start of an ON|USING part of a JOIN
# clause (which we want to skip over), or 3) the end of the string (\z).
# We need these after tbl ref keywords so that they are not mistaken
# for implicit aliases of the preceding table.
my $after_tbl = qr/(?:,|JOIN|ON|USING|\z)/i;
# This is required for cases like:
# FROM t1 JOIN t2 ON t1.col1=t2.col2 JOIN t3 ON t2.col3 = t3.col4
# Because spaces may precede a tbl and a tbl may end with \z, then
# t3.col4 will match as a table. However, t2.col3=t3.col4 will not match.
$tbl_refs =~ s/ = /=/g;
while (
$tbl_refs =~ m{
$before_tbl\b\s*
( ($tbl_ident) (?:\s+ (?:AS\s+)? (\w+))? )
\s*$after_tbl
}xgio )
{
my ( $tbl_ref, $db_tbl, $alias ) = ($1, $2, $3);
PTDEBUG && _d('Match table:', $tbl_ref);
push @tbl_refs, $tbl_ref;
$alias = $self->trim_identifier($alias);
# Handle subqueries.
if ( $tbl_ref =~ m/^AS\s+\w+/i ) {
# According to the manual
# http://dev.mysql.com/doc/refman/5.0/en/unnamed-views.html:
# "The [AS] name clause is mandatory, because every table in a
# FROM clause must have a name."
# So if the tbl ref begins with 'AS', then we probably have a
# subquery.
PTDEBUG && _d('Subquery', $tbl_ref);
$result->{TABLE}->{$alias} = undef;
next;
}
my ( $db, $tbl ) = $db_tbl =~ m/^(?:(.*?)\.)?(.*)/;
$db = $self->trim_identifier($db);
$tbl = $self->trim_identifier($tbl);
$result->{TABLE}->{$alias || $tbl} = $tbl;
$result->{DATABASE}->{$tbl} = $db if $db;
}
}
else {
PTDEBUG && _d("No tables ref in", $query);
}
if ( $list ) {
# Return raw text of the tbls without aliases, instead of identifier
# mappings. Include all identifier quotings and such.
return \@tbl_refs;
}
else {
return $result;
}
}
# Splits a compound statement and returns an array with each sub-statement.
# Example:
# INSERT INTO ... SELECT ...
# is split into two statements: "INSERT INTO ..." and "SELECT ...".
sub split {
my ( $self, $query ) = @_;
return unless $query;
$query = $self->clean_query($query);
PTDEBUG && _d('Splitting', $query);
my $verbs = qr{SELECT|INSERT|UPDATE|DELETE|REPLACE|UNION|CREATE}i;
# This splits a statement on the above verbs which means that the verb
# gets chopped out. Capturing the verb (e.g. ($verb)) will retain it,
# but then it's disjointed from its statement. Example: for this query,
# INSERT INTO ... SELECT ...
# split returns ('INSERT', 'INTO ...', 'SELECT', '...'). Therefore,
# we must re-attach each verb to its statement; we do this later...
my @split_statements = grep { $_ } split(m/\b($verbs\b(?!(?:\s*\()))/io, $query);
my @statements;
if ( @split_statements == 1 ) {
# This happens if the query has no verbs, so it's probably a single
# statement.
push @statements, $query;
}
else {
# ...Re-attach verbs to their statements.
for ( my $i = 0; $i <= $#split_statements; $i += 2 ) {
push @statements, $split_statements[$i].$split_statements[$i+1];
# Variable-width negative look-behind assertions, (?<!), aren't
# fully supported so we split ON DUPLICATE KEY UPDATE. This
# puts it back together.
if ( $statements[-2] && $statements[-2] =~ m/on duplicate key\s+$/i ) {
$statements[-2] .= pop @statements;
}
}
}
# Wrap stmts in <> to make it more clear where each one begins/ends.
PTDEBUG && _d('statements:', map { $_ ? "<$_>" : 'none' } @statements);
return @statements;
}
sub clean_query {
my ( $self, $query ) = @_;
return unless $query;
$query =~ s!/\*.*?\*/! !g; # Remove /* comment blocks */
$query =~ s/^\s+//; # Remove leading spaces
$query =~ s/\s+$//; # Remove trailing spaces
$query =~ s/\s{2,}/ /g; # Remove extra spaces
return $query;
}
sub split_subquery {
my ( $self, $query ) = @_;
return unless $query;
$query = $self->clean_query($query);
$query =~ s/;$//;
my @subqueries;
my $sqno = 0; # subquery number
my $pos = 0;
while ( $query =~ m/(\S+)(?:\s+|\Z)/g ) {
$pos = pos($query);
my $word = $1;
PTDEBUG && _d($word, $sqno);
if ( $word =~ m/^\(?SELECT\b/i ) {
my $start_pos = $pos - length($word) - 1;
if ( $start_pos ) {
$sqno++;
PTDEBUG && _d('Subquery', $sqno, 'starts at', $start_pos);
$subqueries[$sqno] = {
start_pos => $start_pos,
end_pos => 0,
len => 0,
words => [$word],
lp => 1, # left parentheses
rp => 0, # right parentheses
done => 0,
};
}
else {
PTDEBUG && _d('Main SELECT at pos 0');
}
}
else {
next unless $sqno; # next unless we're in a subquery
PTDEBUG && _d('In subquery', $sqno);
my $sq = $subqueries[$sqno];
if ( $sq->{done} ) {
PTDEBUG && _d('This subquery is done; SQL is for',
($sqno - 1 ? "subquery $sqno" : "the main SELECT"));
next;
}
push @{$sq->{words}}, $word;
my $lp = ($word =~ tr/\(//) || 0;
my $rp = ($word =~ tr/\)//) || 0;
PTDEBUG && _d('parentheses left', $lp, 'right', $rp);
if ( ($sq->{lp} + $lp) - ($sq->{rp} + $rp) == 0 ) {
my $end_pos = $pos - 1;
PTDEBUG && _d('Subquery', $sqno, 'ends at', $end_pos);
$sq->{end_pos} = $end_pos;
$sq->{len} = $end_pos - $sq->{start_pos};
}
}
}
for my $i ( 1..$#subqueries ) {
my $sq = $subqueries[$i];
next unless $sq;
$sq->{sql} = join(' ', @{$sq->{words}});
substr $query,
$sq->{start_pos} + 1, # +1 for (
$sq->{len} - 1, # -1 for )
"__subquery_$i";
}
return $query, map { $_->{sql} } grep { defined $_ } @subqueries;
}
sub query_type {
my ( $self, $query, $qr ) = @_;
my ($type, undef) = $qr->distill_verbs($query);
my $rw;
if ( $type =~ m/^SELECT\b/ ) {
$rw = 'read';
}
elsif ( $type =~ m/^$data_manip_stmts\b/
|| $type =~ m/^$data_def_stmts\b/ ) {
$rw = 'write'
}
return {
type => $type,
rw => $rw,
}
}
sub get_columns {
my ( $self, $query ) = @_;
my $cols = [];
return $cols unless $query;
my $cols_def;
if ( $query =~ m/^SELECT/i ) {
$query =~ s/
^SELECT\s+
(?:ALL
|DISTINCT
|DISTINCTROW
|HIGH_PRIORITY
|STRAIGHT_JOIN
|SQL_SMALL_RESULT
|SQL_BIG_RESULT
|SQL_BUFFER_RESULT
|SQL_CACHE
|SQL_NO_CACHE
|SQL_CALC_FOUND_ROWS
)\s+
/SELECT /xgi;
($cols_def) = $query =~ m/^SELECT\s+(.+?)\s+FROM/i;
}
elsif ( $query =~ m/^(?:INSERT|REPLACE)/i ) {
($cols_def) = $query =~ m/\(([^\)]+)\)\s*VALUE/i;
}
PTDEBUG && _d('Columns:', $cols_def);
if ( $cols_def ) {
@$cols = split(',', $cols_def);
map {
my $col = $_;
$col = s/^\s+//g;
$col = s/\s+$//g;
$col;
} @$cols;
}
return $cols;
}
sub parse {
my ( $self, $query ) = @_;
return unless $query;
my $parsed = {};
# Flatten and clean query.
$query =~ s/\n/ /g;
$query = $self->clean_query($query);
$parsed->{query} = $query,
$parsed->{tables} = $self->get_aliases($query, 1);
$parsed->{columns} = $self->get_columns($query);
my ($type) = $query =~ m/^(\w+)/;
$parsed->{type} = lc $type;
# my @words = $query =~ m/
# [A-Za-z_.]+\(.*?\)+ # Match FUNCTION(...)
# |\(.*?\)+ # Match grouped items
# |"(?:[^"]|\"|"")*"+ # Match double quotes
# |'[^'](?:|\'|'')*'+ # and single quotes
# |`(?:[^`]|``)*`+ # and backticks
# |[^ ,]+
# |,
#/gx;
$parsed->{sub_queries} = [];
return $parsed;
}
# Returns an array of arrayrefs like [db,tbl] for each unique db.tbl
# in the query and its subqueries. db may be undef.
sub extract_tables {
my ( $self, %args ) = @_;
my $query = $args{query};
my $default_db = $args{default_db};
my $q = $self->{Quoter} || $args{Quoter};
return unless $query;
PTDEBUG && _d('Extracting tables');
my @tables;
my %seen;
foreach my $db_tbl ( $self->get_tables($query) ) {
next unless $db_tbl;
next if $seen{$db_tbl}++; # Unique-ify for issue 337.
my ( $db, $tbl ) = $q->split_unquote($db_tbl);
push @tables, [ $db || $default_db, $tbl ];
}
return @tables;
}
# This is a special trim function that removes whitespace and identifier-quotes
# (backticks, in the case of MySQL) from the string.
sub trim_identifier {
my ($self, $str) = @_;
return unless defined $str;
$str =~ s/`//g;
$str =~ s/^\s+//;
$str =~ s/\s+$//;
return $str;
}
sub _d {
my ($package, undef, $line) = caller 0;
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
map { defined $_ ? $_ : 'undef' }
@_;
print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
}
1;
}
# ###########################################################################
# End QueryParser package
# ###########################################################################