mirror of
https://github.com/percona/percona-toolkit.git
synced 2025-09-01 18:25:59 +00:00
1432 lines
47 KiB
Perl
1432 lines
47 KiB
Perl
# This program is copyright 2010-2012 Percona Ireland Ltd.
|
|
# Feedback and improvements are welcome.
|
|
#
|
|
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
|
|
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
|
|
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify it under
|
|
# the terms of the GNU General Public License as published by the Free Software
|
|
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
|
|
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
|
|
# licenses.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along with
|
|
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
|
|
# Place, Suite 330, Boston, MA 02111-1307 USA.
|
|
# ###########################################################################
|
|
# SQLParser package
|
|
# ###########################################################################
|
|
{
|
|
# Package: SQLParser
|
|
# SQLParser parses common MySQL SQL statements into data structures.
|
|
# This parser is MySQL-specific and intentionally meant to handle only
|
|
# "common" cases. Although there are many limitations (like UNION, CASE,
|
|
# etc.), many complex cases are handled that no other free, Perl SQL
|
|
# parser at the time of writing can parse, notably subqueries in all their
|
|
# places and varieties.
|
|
#
|
|
# This package has not been profiled and since it relies heavily on
|
|
# mildly complex regex, so do not expect amazing performance.
|
|
#
|
|
# See SQLParser.t for examples of the various data structures. There are
|
|
# many and they vary a lot depending on the statement parsed, so documentation
|
|
# in this file is not exhaustive.
|
|
#
|
|
# This package differs from QueryParser because here we parse the entire SQL
|
|
# statement (thus giving access to all its parts), whereas QueryParser extracts
|
|
# just needed parts (and ignores all the rest).
|
|
package SQLParser;
|
|
|
|
use strict;
|
|
use warnings FATAL => 'all';
|
|
use English qw(-no_match_vars);
|
|
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
|
|
|
|
use Data::Dumper;
|
|
$Data::Dumper::Indent = 1;
|
|
$Data::Dumper::Sortkeys = 1;
|
|
$Data::Dumper::Quotekeys = 0;
|
|
|
|
# Basic identifiers for database, table, column and function names.
|
|
my $quoted_ident = qr/`[^`]+`/;
|
|
my $unquoted_ident = qr/
|
|
\@{0,2} # optional @ or @@ for variables
|
|
\w+ # the ident name
|
|
(?:\([^\)]*\))? # optional function params
|
|
/x;
|
|
|
|
my $ident_alias = qr/
|
|
\s+ # space before alias
|
|
(?:(AS)\s+)? # optional AS keyword
|
|
((?>$quoted_ident|$unquoted_ident)) # alias
|
|
/xi;
|
|
|
|
# A table is identified by 1 or 2 identifiers separated by a period
|
|
# and optionally followed by an alias. See parse_table_reference()
|
|
# for why an optional index hint is not included here.
|
|
my $table_ident = qr/(?:
|
|
((?:(?>$quoted_ident|$unquoted_ident)\.?){1,2}) # table
|
|
(?:$ident_alias)? # optional alias
|
|
)/xo;
|
|
|
|
# A column is identified by 1 to 3 identifiers separated by periods
|
|
# and optionally followed by an alias.
|
|
my $column_ident = qr/(?:
|
|
((?:(?>$quoted_ident|$unquoted_ident|\*)\.?){1,3}) # column
|
|
(?:$ident_alias)? # optional alias
|
|
)/xo;
|
|
|
|
my $function_ident = qr/
|
|
\b
|
|
(
|
|
\w+ # function name
|
|
\( # opening parenthesis
|
|
[^\)]+ # function args, if any
|
|
\) # closing parenthesis
|
|
)
|
|
/x;
|
|
|
|
my %ignore_function = (
|
|
INDEX => 1,
|
|
KEY => 1,
|
|
);
|
|
|
|
# Sub: new
|
|
# Create a SQLParser object.
|
|
#
|
|
# Parameters:
|
|
# %args - Arguments
|
|
#
|
|
# Optional Arguments:
|
|
# Schema - <Schema> object. Can be set later by calling <set_Schema()>.
|
|
#
|
|
# Returns:
|
|
# SQLParser object
|
|
sub new {
|
|
my ( $class, %args ) = @_;
|
|
my $self = {
|
|
%args,
|
|
};
|
|
return bless $self, $class;
|
|
}
|
|
|
|
# Sub: parse
|
|
# Parse a SQL statement. Only statements of $allowed_types are parsed.
|
|
# This sub recurses to parse subqueries.
|
|
#
|
|
# Parameters:
|
|
# $query - SQL statement
|
|
#
|
|
# Returns:
|
|
# A complex hashref of the parsed SQL statement. All keys and almost all
|
|
# values are lowercase for consistency. The struct is roughly:
|
|
# (start code)
|
|
# {
|
|
# type => '', # one of $allowed_types
|
|
# clauses => {}, # raw, unparsed text of clauses
|
|
# <clause> => struct # parsed clause struct, e.g. from => [<tables>]
|
|
# keywords => {}, # LOW_PRIORITY, DISTINCT, SQL_CACHE, etc.
|
|
# functions => {}, # MAX(), SUM(), NOW(), etc.
|
|
# select => {}, # SELECT struct for INSERT/REPLACE ... SELECT
|
|
# subqueries => [], # pointers to subquery structs
|
|
# }
|
|
# (end code)
|
|
# It varies, of course, depending on the query. If something is missing
|
|
# it means the query doesn't have that part. E.g. INSERT has an INTO clause
|
|
# but DELETE does not, and only DELETE and SELECT have FROM clauses. Each
|
|
# clause struct is different; see their respective parse_CLAUSE subs.
|
|
sub parse {
|
|
my ( $self, $query ) = @_;
|
|
return unless $query;
|
|
|
|
# Only these types of statements are parsed.
|
|
my $allowed_types = qr/(?:
|
|
DELETE
|
|
|INSERT
|
|
|REPLACE
|
|
|SELECT
|
|
|UPDATE
|
|
|CREATE
|
|
)/xi;
|
|
|
|
# Flatten and clean query.
|
|
$query = $self->clean_query($query);
|
|
|
|
# Remove first word, should be the statement type. The parse_TYPE subs
|
|
# expect that this is already removed.
|
|
my $type;
|
|
if ( $query =~ s/^(\w+)\s+// ) {
|
|
$type = lc $1;
|
|
PTDEBUG && _d('Query type:', $type);
|
|
die "Cannot parse " . uc($type) . " queries"
|
|
unless $type =~ m/$allowed_types/i;
|
|
}
|
|
else {
|
|
die "Query does not begin with a word"; # shouldn't happen
|
|
}
|
|
|
|
$query = $self->normalize_keyword_spaces($query);
|
|
|
|
# If query has any subqueries, remove/save them and replace them.
|
|
# They'll be parsed later, after the main outer query.
|
|
my @subqueries;
|
|
if ( $query =~ m/(\(SELECT )/i ) {
|
|
PTDEBUG && _d('Removing subqueries');
|
|
@subqueries = $self->remove_subqueries($query);
|
|
$query = shift @subqueries;
|
|
}
|
|
elsif ( $type eq 'create' && $query =~ m/\s+SELECT/ ) {
|
|
PTDEBUG && _d('CREATE..SELECT');
|
|
($subqueries[0]->{query}) = $query =~ m/\s+(SELECT .+)/;
|
|
$query =~ s/\s+SELECT.+//;
|
|
}
|
|
|
|
# Parse raw text parts from query. The parse_TYPE subs only do half
|
|
# the work: parsing raw text parts of clauses, tables, functions, etc.
|
|
# Since these parts are invariant (e.g. a LIMIT clause is same for any
|
|
# type of SQL statement) they are parsed later via other parse_CLAUSE
|
|
# subs, instead of parsing them individually in each parse_TYPE sub.
|
|
my $parse_func = "parse_$type";
|
|
my $struct = $self->$parse_func($query);
|
|
if ( !$struct ) {
|
|
PTDEBUG && _d($parse_func, 'failed to parse query');
|
|
return;
|
|
}
|
|
$struct->{type} = $type;
|
|
$self->_parse_clauses($struct);
|
|
# TODO: parse functions
|
|
|
|
if ( @subqueries ) {
|
|
PTDEBUG && _d('Parsing subqueries');
|
|
foreach my $subquery ( @subqueries ) {
|
|
my $subquery_struct = $self->parse($subquery->{query});
|
|
@{$subquery_struct}{keys %$subquery} = values %$subquery;
|
|
push @{$struct->{subqueries}}, $subquery_struct;
|
|
}
|
|
}
|
|
|
|
PTDEBUG && _d('Query struct:', Dumper($struct));
|
|
return $struct;
|
|
}
|
|
|
|
|
|
# Sub: _parse_clauses
|
|
# Parse raw text of clauses into data structures. This sub recurses
|
|
# to parse the clauses of subqueries. The clauses are read from
|
|
# and their data structures saved into the $struct parameter.
|
|
#
|
|
# Parameters:
|
|
# $struct - Hashref from which clauses are read (%{$struct->{clauses}})
|
|
# and into which data structs are saved (e.g. $struct->{from}=...).
|
|
sub _parse_clauses {
|
|
my ( $self, $struct ) = @_;
|
|
# Parse raw text of clauses and functions.
|
|
foreach my $clause ( keys %{$struct->{clauses}} ) {
|
|
# Rename/remove clauses with space in their names, like ORDER BY.
|
|
if ( $clause =~ m/ / ) {
|
|
(my $clause_no_space = $clause) =~ s/ /_/g;
|
|
$struct->{clauses}->{$clause_no_space} = $struct->{clauses}->{$clause};
|
|
delete $struct->{clauses}->{$clause};
|
|
$clause = $clause_no_space;
|
|
}
|
|
|
|
my $parse_func = "parse_$clause";
|
|
$struct->{$clause} = $self->$parse_func($struct->{clauses}->{$clause});
|
|
|
|
if ( $clause eq 'select' ) {
|
|
PTDEBUG && _d('Parsing subquery clauses');
|
|
$struct->{select}->{type} = 'select';
|
|
$self->_parse_clauses($struct->{select});
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
# Sub: clean_query
|
|
# Remove spaces, flatten, and normalize some patterns for easier parsing.
|
|
#
|
|
# Parameters:
|
|
# $query - SQL statement
|
|
#
|
|
# Returns:
|
|
# Cleaned $query
|
|
sub clean_query {
|
|
my ( $self, $query ) = @_;
|
|
return unless $query;
|
|
|
|
# Whitespace and comments.
|
|
$query =~ s/^\s*--.*$//gm; # -- comments
|
|
$query =~ s/\s+/ /g; # extra spaces/flatten
|
|
$query =~ s!/\*.*?\*/!!g; # /* comments */
|
|
$query =~ s/^\s+//; # leading spaces
|
|
$query =~ s/\s+$//; # trailing spaces
|
|
|
|
return $query;
|
|
}
|
|
|
|
# Sub: normalize_keyword_spaces
|
|
# Normalize spaces around certain SQL keywords. Spaces are added and
|
|
# removed around certain SQL keywords to make parsing easier.
|
|
#
|
|
# Parameters:
|
|
# $query - SQL statement
|
|
#
|
|
# Returns:
|
|
# Normalized $query
|
|
sub normalize_keyword_spaces {
|
|
my ( $self, $query ) = @_;
|
|
|
|
# Add spaces between important tokens to help the parse_* subs.
|
|
$query =~ s/\b(VALUE(?:S)?)\(/$1 (/i;
|
|
$query =~ s/\bON\(/on (/gi;
|
|
$query =~ s/\bUSING\(/using (/gi;
|
|
|
|
# Start of (SELECT subquery).
|
|
$query =~ s/\(\s+SELECT\s+/(SELECT /gi;
|
|
|
|
return $query;
|
|
}
|
|
|
|
# Sub: _parse_query
|
|
# This sub is called by the parse_TYPE subs except parse_insert.
|
|
# It does two things: remove, save the given keywords, all of which
|
|
# should appear at the beginning of the query; and, save (but not
|
|
# remove) the given clauses. The query should start with the values
|
|
# for the first clause because the query's first word was removed
|
|
# in parse(). So for "SELECT cols FROM ...", the query given here
|
|
# is "cols FROM ..." where "cols" belongs to the first clause "columns".
|
|
# Then the query is walked clause-by-clause, saving each.
|
|
#
|
|
# Parameters:
|
|
# $query - SQL statement with first word (SELECT, INSERT, etc.) removed
|
|
# $keywords - Compiled regex of keywords that can appear in $query
|
|
# $first_clause - First clause word to expect in $query
|
|
# $clauses - Compiled regex of clause words that can appear in $query
|
|
#
|
|
# Returns:
|
|
# Hashref with raw text of clauses
|
|
sub _parse_query {
|
|
my ( $self, $query, $keywords, $first_clause, $clauses ) = @_;
|
|
return unless $query;
|
|
my $struct = {};
|
|
|
|
# Save, remove keywords.
|
|
1 while $query =~ s/$keywords\s+/$struct->{keywords}->{lc $1}=1, ''/gie;
|
|
|
|
# Go clausing.
|
|
my @clause = grep { defined $_ }
|
|
($query =~ m/\G(.+?)(?:$clauses\s+|\Z)/gci);
|
|
|
|
my $clause = $first_clause,
|
|
my $value = shift @clause;
|
|
$struct->{clauses}->{$clause} = $value;
|
|
PTDEBUG && _d('Clause:', $clause, $value);
|
|
|
|
# All other clauses.
|
|
while ( @clause ) {
|
|
$clause = shift @clause;
|
|
$value = shift @clause;
|
|
$struct->{clauses}->{lc $clause} = $value;
|
|
PTDEBUG && _d('Clause:', $clause, $value);
|
|
}
|
|
|
|
($struct->{unknown}) = ($query =~ m/\G(.+)/);
|
|
|
|
return $struct;
|
|
}
|
|
|
|
sub parse_delete {
|
|
my ( $self, $query ) = @_;
|
|
if ( $query =~ s/FROM\s+//i ) {
|
|
my $keywords = qr/(LOW_PRIORITY|QUICK|IGNORE)/i;
|
|
my $clauses = qr/(FROM|WHERE|ORDER BY|LIMIT)/i;
|
|
return $self->_parse_query($query, $keywords, 'from', $clauses);
|
|
}
|
|
else {
|
|
die "DELETE without FROM: $query";
|
|
}
|
|
}
|
|
|
|
sub parse_insert {
|
|
my ( $self, $query ) = @_;
|
|
return unless $query;
|
|
my $struct = {};
|
|
|
|
# Save, remove keywords.
|
|
my $keywords = qr/(LOW_PRIORITY|DELAYED|HIGH_PRIORITY|IGNORE)/i;
|
|
1 while $query =~ s/$keywords\s+/$struct->{keywords}->{lc $1}=1, ''/gie;
|
|
|
|
if ( $query =~ m/ON DUPLICATE KEY UPDATE (.+)/i ) {
|
|
my $values = $1;
|
|
die "No values after ON DUPLICATE KEY UPDATE: $query" unless $values;
|
|
$struct->{clauses}->{on_duplicate} = $values;
|
|
PTDEBUG && _d('Clause: on duplicate key update', $values);
|
|
|
|
# This clause can be confused for JOIN ... ON in INSERT-SELECT queries,
|
|
# so we remove the ON DUPLICATE KEY UPDATE clause after extracting its
|
|
# values.
|
|
$query =~ s/\s+ON DUPLICATE KEY UPDATE.+//;
|
|
}
|
|
|
|
# Parse INTO clause. Literal "INTO" is optional.
|
|
if ( my @into = ($query =~ m/
|
|
(?=.*?(?:VALUE|SE(?:T|LECT))) # Avoid a backtracking explosion
|
|
(?:INTO\s+)? # INTO, optional
|
|
(.+?)\s+ # table ref
|
|
(\([^\)]+\)\s+)? # column list, optional
|
|
(VALUE.?|SET|SELECT)\s+ # start of next clause
|
|
/xgci)
|
|
) {
|
|
my $tbl = shift @into; # table ref
|
|
$struct->{clauses}->{into} = $tbl;
|
|
PTDEBUG && _d('Clause: into', $tbl);
|
|
|
|
my $cols = shift @into; # columns, maybe
|
|
if ( $cols ) {
|
|
$cols =~ s/[\(\)]//g;
|
|
$struct->{clauses}->{columns} = $cols;
|
|
PTDEBUG && _d('Clause: columns', $cols);
|
|
}
|
|
|
|
my $next_clause = lc(shift @into); # VALUES, SET or SELECT
|
|
die "INSERT/REPLACE without clause after table: $query"
|
|
unless $next_clause;
|
|
$next_clause = 'values' if $next_clause eq 'value';
|
|
my ($values) = ($query =~ m/\G(.+)/gci);
|
|
die "INSERT/REPLACE without values: $query" unless $values;
|
|
$struct->{clauses}->{$next_clause} = $values;
|
|
PTDEBUG && _d('Clause:', $next_clause, $values);
|
|
}
|
|
|
|
# Save any leftovers. If there are any, parsing missed something.
|
|
($struct->{unknown}) = ($query =~ m/\G(.+)/);
|
|
|
|
return $struct;
|
|
}
|
|
{
|
|
# Suppress warnings like "Name "SQLParser::parse_set" used only once:
|
|
# possible typo at SQLParser.pm line 480." caused by the fact that we
|
|
# don't call these aliases directly, they're called indirectly using
|
|
# $parse_func, hence Perl can't see their being called a compile time.
|
|
no warnings;
|
|
# INSERT and REPLACE are so similar that they are both parsed
|
|
# in parse_insert().
|
|
*parse_replace = \&parse_insert;
|
|
}
|
|
|
|
sub parse_select {
|
|
my ( $self, $query ) = @_;
|
|
|
|
# Keywords are expected to be at the start of the query, so these
|
|
# that appear at the end are handled separately. Afaik, SELECT is
|
|
# only statement with optional keywords at the end. Also, these
|
|
# appear to be the only keywords with spaces instead of _.
|
|
my @keywords;
|
|
my $final_keywords = qr/(FOR UPDATE|LOCK IN SHARE MODE)/i;
|
|
1 while $query =~ s/\s+$final_keywords/(push @keywords, $1), ''/gie;
|
|
|
|
my $keywords = qr/(
|
|
ALL
|
|
|DISTINCT
|
|
|DISTINCTROW
|
|
|HIGH_PRIORITY
|
|
|STRAIGHT_JOIN
|
|
|SQL_SMALL_RESULT
|
|
|SQL_BIG_RESULT
|
|
|SQL_BUFFER_RESULT
|
|
|SQL_CACHE
|
|
|SQL_NO_CACHE
|
|
|SQL_CALC_FOUND_ROWS
|
|
)/xi;
|
|
my $clauses = qr/(
|
|
FROM
|
|
|WHERE
|
|
|GROUP\sBY
|
|
|HAVING
|
|
|ORDER\sBY
|
|
|LIMIT
|
|
|PROCEDURE
|
|
|INTO OUTFILE
|
|
)/xi;
|
|
my $struct = $self->_parse_query($query, $keywords, 'columns', $clauses);
|
|
|
|
# Add final keywords, if any.
|
|
map { s/ /_/g; $struct->{keywords}->{lc $_} = 1; } @keywords;
|
|
|
|
return $struct;
|
|
}
|
|
|
|
sub parse_update {
|
|
my $keywords = qr/(LOW_PRIORITY|IGNORE)/i;
|
|
my $clauses = qr/(SET|WHERE|ORDER BY|LIMIT)/i;
|
|
return _parse_query(@_, $keywords, 'tables', $clauses);
|
|
|
|
}
|
|
|
|
sub parse_create {
|
|
my ($self, $query) = @_;
|
|
my ($obj, $name) = $query =~ m/
|
|
(\S+)\s+
|
|
(?:IF NOT EXISTS\s+)?
|
|
(\S+)
|
|
/xi;
|
|
return {
|
|
object => lc $obj,
|
|
name => $name,
|
|
unknown => undef,
|
|
};
|
|
}
|
|
|
|
# Sub: parse_from
|
|
# Parse a FROM clause, a.k.a. the table references. Does not handle
|
|
# nested joins. See http://dev.mysql.com/doc/refman/5.1/en/join.html
|
|
#
|
|
# Parameters:
|
|
# $from - FROM clause (with the word "FROM")
|
|
#
|
|
# Returns:
|
|
# Arrayref of hashrefs, one hashref for each table in the order that
|
|
# the tables appear, like:
|
|
# (start code)
|
|
# {
|
|
# name => 't2', -- table's real name
|
|
# alias => 'b', -- table's alias, if any
|
|
# explicit_alias => 1, -- if explicitly aliased with AS
|
|
# join => { -- if joined to another table, all but first
|
|
# -- table are because comma implies INNER JOIN
|
|
# to => 't1', -- table name on left side of join, if this is
|
|
# -- LEFT JOIN then this is the inner table, if
|
|
# -- RIGHT JOIN then this is outer table
|
|
# type => '', -- left, right, inner, outer, cross, natural
|
|
# condition => 'using', -- on or using, if applicable
|
|
# columns => ['id'], -- columns for USING condition, if applicable
|
|
# ansi => 1, -- true of ANSI JOIN, i.e. true if not implicit
|
|
# -- INNER JOIN due to following a comma
|
|
# },
|
|
# },
|
|
# {
|
|
# name => 't3',
|
|
# join => {
|
|
# to => 't2',
|
|
# type => 'left',
|
|
# condition => 'on', -- an ON condition is like a WHERE clause so
|
|
# where => [...] -- this arrayref of predicates appears, see
|
|
# -- <parse_where()> for its structure
|
|
# },
|
|
# },
|
|
# (end code)
|
|
sub parse_from {
|
|
my ( $self, $from ) = @_;
|
|
return unless $from;
|
|
PTDEBUG && _d('Parsing FROM', $from);
|
|
|
|
# Extract the column list from USING(col, ...) clauses else
|
|
# the inner commas will be captured by $comma_join.
|
|
my $using_cols;
|
|
($from, $using_cols) = $self->remove_using_columns($from);
|
|
|
|
my $funcs;
|
|
($from, $funcs) = $self->remove_functions($from);
|
|
|
|
# Table references in a FROM clause are separated either by commas
|
|
# (comma/theta join, implicit INNER join) or the JOIN keyword (ansi
|
|
# join). JOIN can be preceded by other keywords like LEFT, RIGHT,
|
|
# OUTER, etc. There must be spaces before and after JOIN and its
|
|
# keywords, but there does not have to be spaces before or after a
|
|
# comma. See http://dev.mysql.com/doc/refman/5.5/en/join.html
|
|
my $comma_join = qr/(?>\s*,\s*)/;
|
|
my $ansi_join = qr/(?>
|
|
\s+
|
|
(?:(?:INNER|CROSS|STRAIGHT_JOIN|LEFT|RIGHT|OUTER|NATURAL)\s+)*
|
|
JOIN
|
|
\s+
|
|
)/xi;
|
|
|
|
my @tbls; # all table refs, a hashref for each
|
|
my $tbl_ref; # current table ref hashref
|
|
my $join; # join info hahsref for current table ref
|
|
foreach my $thing ( split /($comma_join|$ansi_join)/io, $from ) {
|
|
# We shouldn't parse empty things.
|
|
die "Error parsing FROM clause" unless $thing;
|
|
|
|
# Strip leading and trailing spaces.
|
|
$thing =~ s/^\s+//;
|
|
$thing =~ s/\s+$//;
|
|
PTDEBUG && _d('Table thing:', $thing);
|
|
|
|
if ( $thing =~ m/\s+(?:ON|USING)\s+/i ) {
|
|
PTDEBUG && _d("JOIN condition");
|
|
# This join condition follows a JOIN (comma joins don't have
|
|
# conditions). It includes a table ref, ON|USING, and then
|
|
# the value to ON|USING.
|
|
my ($tbl_ref_txt, $join_condition_verb, $join_condition_value)
|
|
= $thing =~ m/^(.+?)\s+(ON|USING)\s+(.+)/i;
|
|
|
|
$tbl_ref = $self->parse_table_reference($tbl_ref_txt);
|
|
|
|
$join->{condition} = lc $join_condition_verb;
|
|
if ( $join->{condition} eq 'on' ) {
|
|
# The value for ON can be, as the MySQL manual says, is just
|
|
# like a WHERE clause.
|
|
$join->{where} = $self->parse_where($join_condition_value, $funcs);
|
|
}
|
|
else { # USING
|
|
# Although calling parse_columns() works, it's overkill.
|
|
# This is not a columns def as in "SELECT col1, col2", it's
|
|
# a simple csv list of column names without aliases, etc.
|
|
$join->{columns} = $self->_parse_csv(shift @$using_cols);
|
|
}
|
|
}
|
|
elsif ( $thing =~ m/(?:,|JOIN)/i ) {
|
|
# A comma or JOIN signals the end of the current table ref and
|
|
# the beginning of the next table ref. Save the current table ref.
|
|
if ( $join ) {
|
|
$tbl_ref->{join} = $join;
|
|
}
|
|
push @tbls, $tbl_ref;
|
|
PTDEBUG && _d("Complete table reference:", Dumper($tbl_ref));
|
|
|
|
# Reset vars for the next table ref.
|
|
$tbl_ref = undef;
|
|
$join = {};
|
|
|
|
# Next table ref becomes the current table ref. It's joined to
|
|
# the previous table ref either implicitly (comma join) or explicitly
|
|
# (ansi join).
|
|
$join->{to} = $tbls[-1]->{tbl};
|
|
if ( $thing eq ',' ) {
|
|
$join->{type} = 'inner';
|
|
$join->{ansi} = 0;
|
|
}
|
|
else { # ansi join
|
|
my $type = $thing =~ m/^(.+?)\s+JOIN$/i ? lc $1 : 'inner';
|
|
$join->{type} = $type;
|
|
$join->{ansi} = 1;
|
|
}
|
|
}
|
|
else {
|
|
# First table ref and comma-joined tables.
|
|
$tbl_ref = $self->parse_table_reference($thing);
|
|
PTDEBUG && _d('Table reference:', Dumper($tbl_ref));
|
|
}
|
|
}
|
|
|
|
# Save the last table ref. It's not completed in the loop above because
|
|
# there's no comma or JOIN after it.
|
|
if ( $tbl_ref ) {
|
|
if ( $join ) {
|
|
$tbl_ref->{join} = $join;
|
|
}
|
|
push @tbls, $tbl_ref;
|
|
PTDEBUG && _d("Complete table reference:", Dumper($tbl_ref));
|
|
}
|
|
|
|
return \@tbls;
|
|
}
|
|
|
|
# Parse a table ref like "tbl", "tbl alias" or "tbl AS alias", where
|
|
# tbl can be optionally "db." qualified. Also handles FORCE|USE|IGNORE
|
|
# INDEX hints. Does not handle "FOR JOIN" hint because "JOIN" here gets
|
|
# confused with the "JOIN" thing in parse_from().
|
|
sub parse_table_reference {
|
|
my ( $self, $tbl_ref ) = @_;
|
|
return unless $tbl_ref;
|
|
PTDEBUG && _d('Parsing table reference:', $tbl_ref);
|
|
my %tbl;
|
|
|
|
# First, check for an index hint. Remove and save it if present.
|
|
# This can't be included in the $table_ident regex because, for example,
|
|
# `tbl` FORCE INDEX (foo), makes FORCE look like an implicit alias.
|
|
if ( $tbl_ref =~ s/
|
|
\s+(
|
|
(?:FORCE|USE|IGNORE)\s
|
|
(?:INDEX|KEY)
|
|
\s*\([^\)]+\)\s*
|
|
)//xi)
|
|
{
|
|
$tbl{index_hint} = $1;
|
|
PTDEBUG && _d('Index hint:', $tbl{index_hint});
|
|
}
|
|
|
|
if ( $tbl_ref =~ m/$table_ident/ ) {
|
|
my ($db_tbl, $as, $alias) = ($1, $2, $3); # XXX
|
|
my $ident_struct = $self->parse_identifier('table', $db_tbl);
|
|
$alias =~ s/`//g if $alias;
|
|
@tbl{keys %$ident_struct} = values %$ident_struct;
|
|
$tbl{explicit_alias} = 1 if $as;
|
|
$tbl{alias} = $alias if $alias;
|
|
}
|
|
else {
|
|
die "Table ident match failed"; # shouldn't happen
|
|
}
|
|
|
|
return \%tbl;
|
|
}
|
|
{
|
|
no warnings; # Why? See same line above.
|
|
*parse_into = \&parse_from;
|
|
*parse_tables = \&parse_from;
|
|
}
|
|
|
|
# This is not your traditional parser, but it works for simple to rather
|
|
# complex cases, with a few noted and intentional limitations. First,
|
|
# the limitations:
|
|
#
|
|
# * probably doesn't handle every possible operator (see $op)
|
|
# * doesn't care about grouping with parentheses
|
|
# * not "fully" tested because the possibilities are infinite
|
|
#
|
|
# It works in four steps; let's take this WHERE clause as an example:
|
|
#
|
|
# i="x and y" or j in ("and", "or") and x is not null or a between 1 and 10 and sz="this 'and' foo"
|
|
#
|
|
# The first step splits the string on and|or, the only two keywords I'm
|
|
# aware of that join the separate predicates. This step doesn't care if
|
|
# and|or is really between two predicates or in a string or something else.
|
|
# The second step is done while the first step is being done: check predicate
|
|
# "fragments" (from step 1) for operators; save which ones have and don't
|
|
# have at least one operator. So the result of step 1 and 2 is:
|
|
#
|
|
# PREDICATE FRAGMENT OPERATOR
|
|
# ================================ ========
|
|
# i="x Y
|
|
# and y" N
|
|
# or j in (" Y
|
|
# and", " N
|
|
# or") N
|
|
# and x is not null Y
|
|
# or a between 1 Y
|
|
# and 10 N
|
|
# and sz="this ' Y
|
|
# and' foo" N
|
|
#
|
|
# The third step runs through the list of pred frags backwards and joins
|
|
# the current frag to the preceding frag if it does not have an operator.
|
|
# The result is:
|
|
#
|
|
# PREDICATE FRAGMENT OPERATOR
|
|
# ================================ ========
|
|
# i="x and y" Y
|
|
# N
|
|
# or j in ("and", "or") Y
|
|
# N
|
|
# N
|
|
# and x is not null Y
|
|
# or a between 1 and 10 Y
|
|
# N
|
|
# and sz="this 'and' foo" Y
|
|
# N
|
|
#
|
|
# The fourth step is similar but not shown: pred frags with unbalanced ' or "
|
|
# are joined to the preceding pred frag. This fixes cases where a pred frag
|
|
# has multiple and|or in a string value; e.g. "foo and bar or dog".
|
|
#
|
|
# After the pred frags are complete, the parts of these predicates are parsed
|
|
# and returned in an arrayref of hashrefs like:
|
|
#
|
|
# {
|
|
# predicate => 'and',
|
|
# column => 'id',
|
|
# operator => '>=',
|
|
# value => '42',
|
|
# }
|
|
#
|
|
# Invalid predicates, or valid ones that we can't parse, will cause
|
|
# the sub to die.
|
|
sub parse_where {
|
|
my ( $self, $where, $functions ) = @_;
|
|
return unless $where;
|
|
PTDEBUG && _d("Parsing WHERE", $where);
|
|
|
|
# Not all the operators listed at
|
|
# http://dev.mysql.com/doc/refman/5.1/en/non-typed-operators.html
|
|
# are supported. E.g. - (minus) is an op but does it ever show up
|
|
# in a where clause? "col-3=2" is valid (where col=5), but we're
|
|
# not interested in weird stuff like that.
|
|
my $op_symbol = qr/
|
|
(?:
|
|
<=(?:>)?
|
|
|>=
|
|
|<>
|
|
|!=
|
|
|<
|
|
|>
|
|
|=
|
|
)/xi;
|
|
my $op_verb = qr/
|
|
(?:
|
|
(?:(?:NOT\s)?LIKE)
|
|
|(?:IS(?:\sNOT\s)?)
|
|
|(?:(?:\sNOT\s)?BETWEEN)
|
|
|(?:(?:NOT\s)?IN)
|
|
)
|
|
/xi;
|
|
my $op_pat = qr/
|
|
(
|
|
(?>
|
|
(?:$op_symbol) # don't need spaces around the symbols, e.g.: col=1
|
|
|(?:\s+$op_verb) # must have space before verb op, e.g.: col LIKE ...
|
|
)
|
|
)/x;
|
|
|
|
# Step 1 and 2: split on and|or and look for operators.
|
|
my $offset = 0;
|
|
my $pred = "";
|
|
my @pred;
|
|
my @has_op;
|
|
while ( $where =~ m/\b(and|or)\b/gi ) {
|
|
my $pos = (pos $where) - (length $1); # pos at and|or, not after
|
|
|
|
$pred = substr $where, $offset, ($pos-$offset);
|
|
push @pred, $pred;
|
|
push @has_op, $pred =~ m/$op_pat/o ? 1 : 0;
|
|
|
|
$offset = $pos;
|
|
}
|
|
# Final predicate fragment: last and|or to end of string.
|
|
$pred = substr $where, $offset;
|
|
push @pred, $pred;
|
|
push @has_op, $pred =~ m/$op_pat/o ? 1 : 0;
|
|
PTDEBUG && _d("Predicate fragments:", Dumper(\@pred));
|
|
PTDEBUG && _d("Predicate frags with operators:", @has_op);
|
|
|
|
# Step 3: join pred frags without ops to preceding pred frag.
|
|
my $n = scalar @pred - 1;
|
|
for my $i ( 1..$n ) {
|
|
$i *= -1;
|
|
my $j = $i - 1; # preceding pred frag
|
|
|
|
# Two constants in a row, like "TRUE or FALSE", are a special case.
|
|
# The current pred ($i) will not have an op but in this case it's
|
|
# not a continuation of the preceding pred ($j) so we don't want to
|
|
# join them. And there's a special case within this special case:
|
|
# "BETWEEN 1 AND 10". _is_constant() strips leading AND or OR so
|
|
# 10 is going to look like an independent constant but really it's
|
|
# part of the BETWEEN op, so this whole special check is skipped
|
|
# if the preceding pred contains BETWEEN. Yes, parsing SQL is tricky.
|
|
next if $pred[$j] !~ m/\s+between\s+/i && $self->_is_constant($pred[$i]);
|
|
|
|
if ( !$has_op[$i] ) {
|
|
$pred[$j] .= $pred[$i];
|
|
$pred[$i] = undef;
|
|
}
|
|
}
|
|
PTDEBUG && _d("Predicate fragments joined:", Dumper(\@pred));
|
|
|
|
# Step 4: join pred frags with unbalanced ' or " to preceding pred frag.
|
|
for my $i ( 0..@pred ) {
|
|
$pred = $pred[$i];
|
|
next unless defined $pred;
|
|
my $n_single_quotes = ($pred =~ tr/'//);
|
|
my $n_double_quotes = ($pred =~ tr/"//);
|
|
if ( ($n_single_quotes % 2) || ($n_double_quotes % 2) ) {
|
|
$pred[$i] .= $pred[$i + 1];
|
|
$pred[$i + 1] = undef;
|
|
}
|
|
}
|
|
PTDEBUG && _d("Predicate fragments balanced:", Dumper(\@pred));
|
|
|
|
# Parse, clean up and save the complete predicates.
|
|
my @predicates;
|
|
foreach my $pred ( @pred ) {
|
|
next unless defined $pred;
|
|
$pred =~ s/^\s+//;
|
|
$pred =~ s/\s+$//;
|
|
my $conj;
|
|
if ( $pred =~ s/^(and|or)\s+//i ) {
|
|
$conj = lc $1;
|
|
}
|
|
my ($col, $op, $val) = $pred =~ m/^(.+?)$op_pat(.+)$/o;
|
|
if ( !$col || !$op ) {
|
|
if ( $self->_is_constant($pred) ) {
|
|
$val = lc $pred;
|
|
}
|
|
else {
|
|
die "Failed to parse WHERE condition: $pred";
|
|
}
|
|
}
|
|
|
|
# Remove whitespace and lowercase some keywords.
|
|
if ( $col ) {
|
|
$col =~ s/\s+$//;
|
|
$col =~ s/^\(+//; # no unquoted column name begins with (
|
|
}
|
|
if ( $op ) {
|
|
$op = lc $op;
|
|
$op =~ s/^\s+//;
|
|
$op =~ s/\s+$//;
|
|
}
|
|
$val =~ s/^\s+//;
|
|
|
|
# No unquoted value ends with ) except FUNCTION(...)
|
|
if ( ($op || '') !~ m/IN/i && $val !~ m/^\w+\([^\)]+\)$/ ) {
|
|
$val =~ s/\)+$//;
|
|
}
|
|
|
|
if ( $val =~ m/NULL|TRUE|FALSE/i ) {
|
|
$val = lc $val;
|
|
}
|
|
|
|
if ( $functions ) {
|
|
$col = shift @$functions if $col =~ m/__FUNC\d+__/;
|
|
$val = shift @$functions if $val =~ m/__FUNC\d+__/;
|
|
}
|
|
|
|
push @predicates, {
|
|
predicate => $conj,
|
|
left_arg => $col,
|
|
operator => $op,
|
|
right_arg => $val,
|
|
};
|
|
}
|
|
|
|
return \@predicates;
|
|
}
|
|
|
|
# Returns true if the value is a constant. Constants are TRUE, FALSE,
|
|
# and any signed number. A leading AND or OR keyword is removed first.
|
|
sub _is_constant {
|
|
my ( $self, $val ) = @_;
|
|
return 0 unless defined $val;
|
|
$val =~ s/^\s*(?:and|or)\s+//;
|
|
return
|
|
$val =~ m/^\s*(?:TRUE|FALSE)\s*$/i || $val =~ m/^\s*-?\d+\s*$/ ? 1 : 0;
|
|
}
|
|
|
|
sub parse_having {
|
|
my ( $self, $having ) = @_;
|
|
# TODO
|
|
return $having;
|
|
}
|
|
|
|
# GROUP BY {col_name | expr | position} [ASC | DESC], ... [WITH ROLLUP]
|
|
sub parse_group_by {
|
|
my ( $self, $group_by ) = @_;
|
|
return unless $group_by;
|
|
PTDEBUG && _d('Parsing GROUP BY', $group_by);
|
|
|
|
# Remove special "WITH ROLLUP" clause so we're left with a simple csv list.
|
|
my $with_rollup = $group_by =~ s/\s+WITH ROLLUP\s*//i;
|
|
|
|
# Parse the identifiers.
|
|
my $idents = $self->parse_identifiers( $self->_parse_csv($group_by) );
|
|
|
|
$idents->{with_rollup} = 1 if $with_rollup;
|
|
|
|
return $idents;
|
|
}
|
|
|
|
# [ORDER BY {col_name | expr | position} [ASC | DESC], ...]
|
|
sub parse_order_by {
|
|
my ( $self, $order_by ) = @_;
|
|
return unless $order_by;
|
|
PTDEBUG && _d('Parsing ORDER BY', $order_by);
|
|
my $idents = $self->parse_identifiers( $self->_parse_csv($order_by) );
|
|
return $idents;
|
|
}
|
|
|
|
# [LIMIT {[offset,] row_count | row_count OFFSET offset}]
|
|
sub parse_limit {
|
|
my ( $self, $limit ) = @_;
|
|
return unless $limit;
|
|
my $struct = {
|
|
row_count => undef,
|
|
};
|
|
if ( $limit =~ m/(\S+)\s+OFFSET\s+(\S+)/i ) {
|
|
$struct->{explicit_offset} = 1;
|
|
$struct->{row_count} = $1;
|
|
$struct->{offset} = $2;
|
|
}
|
|
else {
|
|
my ($offset, $cnt) = $limit =~ m/(?:(\S+),\s+)?(\S+)/i;
|
|
$struct->{row_count} = $cnt;
|
|
$struct->{offset} = $offset if defined $offset;
|
|
}
|
|
return $struct;
|
|
}
|
|
|
|
# Parses the list of values after, e.g., INSERT tbl VALUES (...), (...).
|
|
# Does not currently parse each set of values; it just splits the list.
|
|
sub parse_values {
|
|
my ( $self, $values ) = @_;
|
|
return unless $values;
|
|
$values =~ s/^\s*\(//;
|
|
$values =~ s/\s*\)//;
|
|
my $vals = $self->_parse_csv(
|
|
$values,
|
|
quoted_values => 1,
|
|
remove_quotes => 0,
|
|
);
|
|
return $vals;
|
|
}
|
|
|
|
sub parse_set {
|
|
my ( $self, $set ) = @_;
|
|
PTDEBUG && _d("Parse SET", $set);
|
|
return unless $set;
|
|
my $vals = $self->_parse_csv($set);
|
|
return unless $vals && @$vals;
|
|
|
|
my @set;
|
|
foreach my $col_val ( @$vals ) {
|
|
# Do not remove quotes around the val because quotes let us determine
|
|
# the value's type. E.g. tbl might be a table, but "tbl" is a string,
|
|
# and NOW() is the function, but 'NOW()' is a string.
|
|
my ($col, $val) = $col_val =~ m/^([^=]+)\s*=\s*(.+)/;
|
|
my $ident_struct = $self->parse_identifier('column', $col);
|
|
my $set_struct = {
|
|
%$ident_struct,
|
|
value => $val,
|
|
};
|
|
PTDEBUG && _d("SET:", Dumper($set_struct));
|
|
push @set, $set_struct;
|
|
}
|
|
return \@set;
|
|
}
|
|
|
|
# Split any comma-separated list of values, removing leading
|
|
# and trailing spaces.
|
|
sub _parse_csv {
|
|
my ( $self, $vals, %args ) = @_;
|
|
return unless $vals;
|
|
|
|
my @vals;
|
|
if ( $args{quoted_values} ) {
|
|
# If the vals are quoted, then they can contain commas, like:
|
|
# "hello, world!", 'batman'. If only we could use Text::CSV,
|
|
# then we wouldn't write yet another csv parser to handle this,
|
|
# but Percona Toolkit doesn't like package dependencies, so here's
|
|
# our light implementation of this classic problem.
|
|
my $quote_char = '';
|
|
VAL:
|
|
foreach my $val ( split(',', $vals) ) {
|
|
PTDEBUG && _d("Next value:", $val);
|
|
# If there's a quote char, then this val is the rest of a previously
|
|
# quoted and split value.
|
|
if ( $quote_char ) {
|
|
PTDEBUG && _d("Value is part of previous quoted value");
|
|
# split() removed the comma inside the quoted value,
|
|
# so add it back else "hello, world" is incorrectly
|
|
# returned as "hello world".
|
|
$vals[-1] .= ",$val";
|
|
|
|
# Quoted and split value is complete when a val ends with the
|
|
# same quote char that began the split value.
|
|
if ( $val =~ m/[^\\]*$quote_char$/ ) {
|
|
if ( $args{remove_quotes} ) {
|
|
$vals[-1] =~ s/^\s*$quote_char//;
|
|
$vals[-1] =~ s/$quote_char\s*$//;
|
|
}
|
|
PTDEBUG && _d("Previous quoted value is complete:", $vals[-1]);
|
|
$quote_char = '';
|
|
}
|
|
|
|
next VAL;
|
|
}
|
|
|
|
# Start of new value so strip leading spaces but not trailing
|
|
# spaces yet because if the next check determines that this is
|
|
# a quoted and split val, then trailing space is actually space
|
|
# inside the quoted val, so we want to preserve it.
|
|
$val =~ s/^\s+//;
|
|
|
|
# A value is quoted *and* split (because there's a comma in the
|
|
# quoted value) if the vale begins with a quote char and does not
|
|
# end with that char. E.g.: "foo but not "foo". The val "foo is
|
|
# the first part of the split value, e.g. "foo, bar".
|
|
if ( $val =~ m/^(['"])/ ) {
|
|
PTDEBUG && _d("Value is quoted");
|
|
$quote_char = $1; # XXX
|
|
if ( $val =~ m/.$quote_char$/ ) {
|
|
PTDEBUG && _d("Value is complete");
|
|
$quote_char = '';
|
|
if ( $args{remove_quotes} ) {
|
|
$vals[-1] =~ s/^\s*$quote_char//;
|
|
$vals[-1] =~ s/$quote_char\s*$//;
|
|
}
|
|
}
|
|
else {
|
|
PTDEBUG && _d("Quoted value is not complete");
|
|
}
|
|
}
|
|
else {
|
|
$val =~ s/\s+$//;
|
|
}
|
|
|
|
# Save complete value (e.g. foo or "foo" without the quotes),
|
|
# or save the first part of a quoted and split value; the rest
|
|
# of such a value will be joined back above.
|
|
PTDEBUG && _d("Saving value", ($quote_char ? "fragment" : ""));
|
|
push @vals, $val;
|
|
}
|
|
}
|
|
else {
|
|
@vals = map { s/^\s+//; s/\s+$//; $_ } split(',', $vals);
|
|
}
|
|
|
|
return \@vals;
|
|
}
|
|
{
|
|
no warnings; # Why? See same line above.
|
|
*parse_on_duplicate = \&_parse_csv;
|
|
}
|
|
|
|
sub parse_columns {
|
|
my ( $self, $cols ) = @_;
|
|
PTDEBUG && _d('Parsing columns list:', $cols);
|
|
|
|
my @cols;
|
|
pos $cols = 0;
|
|
while (pos $cols < length $cols) {
|
|
if ($cols =~ m/\G\s*$column_ident\s*(?>,|\Z)/gcxo) {
|
|
my ($db_tbl_col, $as, $alias) = ($1, $2, $3); # XXX
|
|
my $ident_struct = $self->parse_identifier('column', $db_tbl_col);
|
|
$alias =~ s/`//g if $alias;
|
|
my $col_struct = {
|
|
%$ident_struct,
|
|
($as ? (explicit_alias => 1) : ()),
|
|
($alias ? (alias => $alias) : ()),
|
|
};
|
|
push @cols, $col_struct;
|
|
}
|
|
else {
|
|
die "Column ident match failed"; # shouldn't happen
|
|
}
|
|
}
|
|
|
|
return \@cols;
|
|
}
|
|
|
|
# Remove subqueries from query, return modified query and list of subqueries.
|
|
# Each subquery is replaced with the special token __SQn__ where n is the
|
|
# subquery's ID. Subqueries are parsed and removed in to out, last to first;
|
|
# i.e. the last, inner-most subquery is ID 0 and the first, outermost
|
|
# subquery has the greatest ID. Each subquery ID corresponds to its index in
|
|
# the list of returned subquery hashrefs after the modified query. __SQ2__
|
|
# is subqueries[2]. Each hashref is like:
|
|
# * query Subquery text
|
|
# * context scalar, list or identifier
|
|
# * nested (optional) 1 if nested
|
|
# This sub does not handle UNION and it expects to that subqueries start
|
|
# with "(SELECT ". See SQLParser.t for examples.
|
|
sub remove_subqueries {
|
|
my ( $self, $query ) = @_;
|
|
|
|
# Find starting pos of all subqueries.
|
|
my @start_pos;
|
|
while ( $query =~ m/(\(SELECT )/gi ) {
|
|
my $pos = (pos $query) - (length $1);
|
|
push @start_pos, $pos;
|
|
}
|
|
|
|
# Starting with the inner-most, last subquery, find ending pos of
|
|
# all subqueries. This is done by counting open and close parentheses
|
|
# until all are closed. The last closing ) should close the ( that
|
|
# opened the subquery. No sane regex can help us here for cases like:
|
|
# (select max(id) from t where col in(1,2,3) and foo='(bar)').
|
|
@start_pos = reverse @start_pos;
|
|
my @end_pos;
|
|
for my $i ( 0..$#start_pos ) {
|
|
my $closed = 0;
|
|
pos $query = $start_pos[$i];
|
|
while ( $query =~ m/([\(\)])/cg ) {
|
|
my $c = $1;
|
|
$closed += ($c eq '(' ? 1 : -1);
|
|
last unless $closed;
|
|
}
|
|
push @end_pos, pos $query;
|
|
}
|
|
|
|
# Replace each subquery with a __SQn__ token.
|
|
my @subqueries;
|
|
my $len_adj = 0;
|
|
my $n = 0;
|
|
for my $i ( 0..$#start_pos ) {
|
|
PTDEBUG && _d('Query:', $query);
|
|
my $offset = $start_pos[$i];
|
|
my $len = $end_pos[$i] - $start_pos[$i] - $len_adj;
|
|
PTDEBUG && _d("Subquery $n start", $start_pos[$i],
|
|
'orig end', $end_pos[$i], 'adj', $len_adj, 'adj end',
|
|
$offset + $len, 'len', $len);
|
|
|
|
my $struct = {};
|
|
my $token = '__SQ' . $n . '__';
|
|
my $subquery = substr($query, $offset, $len, $token);
|
|
PTDEBUG && _d("Subquery $n:", $subquery);
|
|
|
|
# Adjust len for next outer subquery. This is required because the
|
|
# subqueries' start/end pos are found relative to one another, so
|
|
# when a subquery is replaced with its shorter __SQn__ token the end
|
|
# pos for the other subqueries decreases. The token is shorter than
|
|
# any valid subquery so the end pos should only decrease.
|
|
my $outer_start = $start_pos[$i + 1];
|
|
my $outer_end = $end_pos[$i + 1];
|
|
if ( $outer_start && ($outer_start < $start_pos[$i])
|
|
&& $outer_end && ($outer_end > $end_pos[$i]) ) {
|
|
PTDEBUG && _d("Subquery $n nested in next subquery");
|
|
$len_adj += $len - length $token;
|
|
$struct->{nested} = $i + 1;
|
|
}
|
|
else {
|
|
PTDEBUG && _d("Subquery $n not nested");
|
|
$len_adj = 0;
|
|
if ( $subqueries[-1] && $subqueries[-1]->{nested} ) {
|
|
PTDEBUG && _d("Outermost subquery");
|
|
}
|
|
}
|
|
|
|
# Get subquery context: scalar, list or identifier.
|
|
if ( $query =~ m/(?:=|>|<|>=|<=|<>|!=|<=>)\s*$token/ ) {
|
|
$struct->{context} = 'scalar';
|
|
}
|
|
elsif ( $query =~ m/\b(?:IN|ANY|SOME|ALL|EXISTS)\s*$token/i ) {
|
|
# Add ( ) around __SQn__ for things like "IN(__SQn__)"
|
|
# unless they're already there.
|
|
if ( $query !~ m/\($token\)/ ) {
|
|
$query =~ s/$token/\($token\)/;
|
|
$len_adj -= 2 if $struct->{nested};
|
|
}
|
|
$struct->{context} = 'list';
|
|
}
|
|
else {
|
|
# If the subquery is not preceded by an operator (=, >, etc.)
|
|
# or IN(), EXISTS(), etc. then it should be an identifier,
|
|
# either a derived table or column.
|
|
$struct->{context} = 'identifier';
|
|
}
|
|
PTDEBUG && _d("Subquery $n context:", $struct->{context});
|
|
|
|
# Remove ( ) around subquery so it can be parsed by a parse_TYPE sub.
|
|
$subquery =~ s/^\s*\(//;
|
|
$subquery =~ s/\s*\)\s*$//;
|
|
|
|
# Save subquery to struct after modifications above.
|
|
$struct->{query} = $subquery;
|
|
push @subqueries, $struct;
|
|
$n++;
|
|
}
|
|
|
|
return $query, @subqueries;
|
|
}
|
|
|
|
sub remove_using_columns {
|
|
my ($self, $from) = @_;
|
|
return unless $from;
|
|
PTDEBUG && _d('Removing cols from USING clauses');
|
|
my $using = qr/
|
|
\bUSING
|
|
\s*
|
|
\(
|
|
([^\)]+)
|
|
\)
|
|
/xi;
|
|
my @cols;
|
|
$from =~ s/$using/push @cols, $1; "USING ($#cols)"/eg;
|
|
PTDEBUG && _d('FROM:', $from, Dumper(\@cols));
|
|
return $from, \@cols;
|
|
}
|
|
|
|
sub replace_function {
|
|
my ($func, $funcs) = @_;
|
|
my ($func_name) = $func =~ m/^(\w+)/;
|
|
if ( !$ignore_function{uc $func_name} ) {
|
|
my $n = scalar @$funcs;
|
|
push @$funcs, $func;
|
|
return "__FUNC${n}__";
|
|
}
|
|
return $func;
|
|
}
|
|
|
|
sub remove_functions {
|
|
my ($self, $clause) = @_;
|
|
return unless $clause;
|
|
PTDEBUG && _d('Removing functions from clause:', $clause);
|
|
my @funcs;
|
|
$clause =~ s/$function_ident/replace_function($1, \@funcs)/eg;
|
|
PTDEBUG && _d('Function-stripped clause:', $clause, Dumper(\@funcs));
|
|
return $clause, \@funcs;
|
|
}
|
|
|
|
# Sub: parse_identifiers
|
|
# Parse an arrayref of identifiers into their parts. Identifiers can be
|
|
# column names (optionally qualified), expressions, or constants.
|
|
# GROUP BY and ORDER BY specify a list of identifiers.
|
|
#
|
|
# Parameters:
|
|
# $idents - Arrayref of indentifiers
|
|
#
|
|
# Returns:
|
|
# Arrayref of hashes with each identifier's parts, depending on what kind
|
|
# of identifier it is.
|
|
sub parse_identifiers {
|
|
my ( $self, $idents ) = @_;
|
|
return unless $idents;
|
|
PTDEBUG && _d("Parsing identifiers");
|
|
|
|
my @ident_parts;
|
|
foreach my $ident ( @$idents ) {
|
|
PTDEBUG && _d("Identifier:", $ident);
|
|
my $parts = {};
|
|
|
|
if ( $ident =~ s/\s+(ASC|DESC)\s*$//i ) {
|
|
$parts->{sort} = uc $1; # XXX
|
|
}
|
|
|
|
if ( $ident =~ m/^\d+$/ ) { # Position like 5
|
|
PTDEBUG && _d("Positional ident");
|
|
$parts->{position} = $ident;
|
|
}
|
|
elsif ( $ident =~ m/^\w+\(/ ) { # Function like MIN(col)
|
|
PTDEBUG && _d("Expression ident");
|
|
my ($func, $expr) = $ident =~ m/^(\w+)\(([^\)]*)\)/;
|
|
$parts->{function} = uc $func;
|
|
$parts->{expression} = $expr if $expr;
|
|
}
|
|
else { # Ref like (table.)column
|
|
PTDEBUG && _d("Table/column ident");
|
|
my ($tbl, $col) = $self->split_unquote($ident);
|
|
$parts->{table} = $tbl if $tbl;
|
|
$parts->{column} = $col;
|
|
}
|
|
push @ident_parts, $parts;
|
|
}
|
|
|
|
return \@ident_parts;
|
|
}
|
|
|
|
sub parse_identifier {
|
|
my ( $self, $type, $ident ) = @_;
|
|
return unless $type && $ident;
|
|
PTDEBUG && _d("Parsing", $type, "identifier:", $ident);
|
|
|
|
my ($func, $expr);
|
|
if ( $ident =~ m/^\w+\(/ ) { # Function like MIN(col)
|
|
($func, $expr) = $ident =~ m/^(\w+)\(([^\)]*)\)/;
|
|
PTDEBUG && _d('Function', $func, 'arg', $expr);
|
|
return { col => $ident } unless $expr; # NOW()
|
|
$ident = $expr; # col from MAX(col)
|
|
}
|
|
|
|
my %ident_struct;
|
|
my @ident_parts = map { s/`//g; $_; } split /[.]/, $ident;
|
|
if ( @ident_parts == 3 ) {
|
|
@ident_struct{qw(db tbl col)} = @ident_parts;
|
|
}
|
|
elsif ( @ident_parts == 2 ) {
|
|
my @parts_for_type = $type eq 'column' ? qw(tbl col)
|
|
: $type eq 'table' ? qw(db tbl)
|
|
: die "Invalid identifier type: $type";
|
|
@ident_struct{@parts_for_type} = @ident_parts;
|
|
}
|
|
elsif ( @ident_parts == 1 ) {
|
|
my $part = $type eq 'column' ? 'col' : 'tbl';
|
|
@ident_struct{($part)} = @ident_parts;
|
|
}
|
|
else {
|
|
die "Invalid number of parts in $type reference: $ident";
|
|
}
|
|
|
|
if ( $self->{Schema} ) {
|
|
if ( $type eq 'column' && (!$ident_struct{tbl} || !$ident_struct{db}) ) {
|
|
my $qcol = $self->{Schema}->find_column(%ident_struct);
|
|
if ( $qcol && @$qcol == 1 ) {
|
|
@ident_struct{qw(db tbl)} = @{$qcol->[0]}{qw(db tbl)};
|
|
}
|
|
}
|
|
elsif ( !$ident_struct{db} ) {
|
|
my $qtbl = $self->{Schema}->find_table(%ident_struct);
|
|
if ( $qtbl && @$qtbl == 1 ) {
|
|
$ident_struct{db} = $qtbl->[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( $func ) {
|
|
$ident_struct{func} = uc $func;
|
|
}
|
|
|
|
PTDEBUG && _d($type, "identifier struct:", Dumper(\%ident_struct));
|
|
return \%ident_struct;
|
|
}
|
|
|
|
# Sub: split_unquote
|
|
# Split and unquote a table name. The table name can be database-qualified
|
|
# or not, like `db`.`table`. The table name can be backtick-quoted or not.
|
|
#
|
|
# Parameters:
|
|
# $db_tbl - Table name
|
|
# $default_db - Default database name to return if $db_tbl is not
|
|
# database-qualified
|
|
#
|
|
# Returns:
|
|
# Array: unquoted database (possibly undef), unquoted table
|
|
sub split_unquote {
|
|
my ( $self, $db_tbl, $default_db ) = @_;
|
|
$db_tbl =~ s/`//g;
|
|
my ( $db, $tbl ) = split(/[.]/, $db_tbl);
|
|
if ( !$tbl ) {
|
|
$tbl = $db;
|
|
$db = $default_db;
|
|
}
|
|
return ($db, $tbl);
|
|
}
|
|
|
|
# Sub: is_identifier
|
|
# Determine if something is a schema object identifier.
|
|
# E.g.: `tbl` is an identifier, but "tbl" is a string and 1 is a number.
|
|
# See <http://dev.mysql.com/doc/refman/5.1/en/identifiers.html>
|
|
#
|
|
# Parameters:
|
|
# $thing - Name of something, including any quoting as it appears in a query.
|
|
#
|
|
# Returns:
|
|
# True of $thing is an identifier, else false.
|
|
sub is_identifier {
|
|
my ( $self, $thing ) = @_;
|
|
|
|
# Nothing is not an ident.
|
|
return 0 unless $thing;
|
|
|
|
# Tables, columns, FUNCTIONS(), etc. cannot be 'quoted' or "quoted"
|
|
# because that would make them strings, not idents.
|
|
return 0 if $thing =~ m/\s*['"]/;
|
|
|
|
# Numbers, ints or floats, are not identifiers.
|
|
return 0 if $thing =~ m/^\s*\d+(?:\.\d+)?\s*$/;
|
|
|
|
# Keywords are not identifiers.
|
|
return 0 if $thing =~ m/^\s*(?>
|
|
NULL
|
|
|DUAL
|
|
)\s*$/xi;
|
|
|
|
# The column ident really matches everything: db, db.tbl, db.tbl.col,
|
|
# function(), @@var, etc.
|
|
return 1 if $thing =~ m/^\s*$column_ident\s*$/;
|
|
|
|
# If the thing isn't quoted and doesn't match our ident pattern, then
|
|
# it's probably not an ident.
|
|
return 0;
|
|
}
|
|
|
|
sub set_Schema {
|
|
my ( $self, $sq ) = @_;
|
|
$self->{Schema} = $sq;
|
|
return;
|
|
}
|
|
|
|
sub _d {
|
|
my ($package, undef, $line) = caller 0;
|
|
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
|
|
map { defined $_ ? $_ : 'undef' }
|
|
@_;
|
|
print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
|
|
}
|
|
|
|
1;
|
|
}
|
|
# ###########################################################################
|
|
# End SQLParser package
|
|
# ###########################################################################
|