# This program is copyright 2010-2012 Percona Ireland Ltd. # Feedback and improvements are welcome. # # THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. # # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation, version 2; OR the Perl Artistic License. On UNIX and similar # systems, you can issue `man perlgpl' or `man perlartistic' to read these # licenses. # # You should have received a copy of the GNU General Public License along with # this program; if not, write to the Free Software Foundation, Inc., 59 Temple # Place, Suite 330, Boston, MA 02111-1307 USA. # ########################################################################### # SQLParser package # ########################################################################### { # Package: SQLParser # SQLParser parses common MySQL SQL statements into data structures. # This parser is MySQL-specific and intentionally meant to handle only # "common" cases. Although there are many limitations (like UNION, CASE, # etc.), many complex cases are handled that no other free, Perl SQL # parser at the time of writing can parse, notably subqueries in all their # places and varieties. # # This package has not been profiled and since it relies heavily on # mildly complex regex, so do not expect amazing performance. # # See SQLParser.t for examples of the various data structures. There are # many and they vary a lot depending on the statement parsed, so documentation # in this file is not exhaustive. # # This package differs from QueryParser because here we parse the entire SQL # statement (thus giving access to all its parts), whereas QueryParser extracts # just needed parts (and ignores all the rest). package SQLParser; use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); use constant PTDEBUG => $ENV{PTDEBUG} || 0; use Data::Dumper; $Data::Dumper::Indent = 1; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Quotekeys = 0; # Basic identifiers for database, table, column and function names. my $quoted_ident = qr/`[^`]+`/; my $unquoted_ident = qr/ \@{0,2} # optional @ or @@ for variables \w+ # the ident name (?:\([^\)]*\))? # optional function params /x; my $ident_alias = qr/ \s+ # space before alias (?:(AS)\s+)? # optional AS keyword ((?>$quoted_ident|$unquoted_ident)) # alias /xi; # A table is identified by 1 or 2 identifiers separated by a period # and optionally followed by an alias. See parse_table_reference() # for why an optional index hint is not included here. my $table_ident = qr/(?: ((?:(?>$quoted_ident|$unquoted_ident)\.?){1,2}) # table (?:$ident_alias)? # optional alias )/xo; # A column is identified by 1 to 3 identifiers separated by periods # and optionally followed by an alias. my $column_ident = qr/(?: ((?:(?>$quoted_ident|$unquoted_ident|\*)\.?){1,3}) # column (?:$ident_alias)? # optional alias )/xo; my $function_ident = qr/ \b ( \w+ # function name \( # opening parenthesis [^\)]+ # function args, if any \) # closing parenthesis ) /x; my %ignore_function = ( INDEX => 1, KEY => 1, ); # Sub: new # Create a SQLParser object. # # Parameters: # %args - Arguments # # Optional Arguments: # Schema - object. Can be set later by calling . # # Returns: # SQLParser object sub new { my ( $class, %args ) = @_; my $self = { %args, }; return bless $self, $class; } # Sub: parse # Parse a SQL statement. Only statements of $allowed_types are parsed. # This sub recurses to parse subqueries. # # Parameters: # $query - SQL statement # # Returns: # A complex hashref of the parsed SQL statement. All keys and almost all # values are lowercase for consistency. The struct is roughly: # (start code) # { # type => '', # one of $allowed_types # clauses => {}, # raw, unparsed text of clauses # => struct # parsed clause struct, e.g. from => [] # keywords => {}, # LOW_PRIORITY, DISTINCT, SQL_CACHE, etc. # functions => {}, # MAX(), SUM(), NOW(), etc. # select => {}, # SELECT struct for INSERT/REPLACE ... SELECT # subqueries => [], # pointers to subquery structs # } # (end code) # It varies, of course, depending on the query. If something is missing # it means the query doesn't have that part. E.g. INSERT has an INTO clause # but DELETE does not, and only DELETE and SELECT have FROM clauses. Each # clause struct is different; see their respective parse_CLAUSE subs. sub parse { my ( $self, $query ) = @_; return unless $query; # Only these types of statements are parsed. my $allowed_types = qr/(?: DELETE |INSERT |REPLACE |SELECT |UPDATE |CREATE )/xi; # Flatten and clean query. $query = $self->clean_query($query); # Remove first word, should be the statement type. The parse_TYPE subs # expect that this is already removed. my $type; if ( $query =~ s/^(\w+)\s+// ) { $type = lc $1; PTDEBUG && _d('Query type:', $type); die "Cannot parse " . uc($type) . " queries" unless $type =~ m/$allowed_types/i; } else { die "Query does not begin with a word"; # shouldn't happen } $query = $self->normalize_keyword_spaces($query); # If query has any subqueries, remove/save them and replace them. # They'll be parsed later, after the main outer query. my @subqueries; if ( $query =~ m/(\(SELECT )/i ) { PTDEBUG && _d('Removing subqueries'); @subqueries = $self->remove_subqueries($query); $query = shift @subqueries; } elsif ( $type eq 'create' && $query =~ m/\s+SELECT/ ) { PTDEBUG && _d('CREATE..SELECT'); ($subqueries[0]->{query}) = $query =~ m/\s+(SELECT .+)/; $query =~ s/\s+SELECT.+//; } # Parse raw text parts from query. The parse_TYPE subs only do half # the work: parsing raw text parts of clauses, tables, functions, etc. # Since these parts are invariant (e.g. a LIMIT clause is same for any # type of SQL statement) they are parsed later via other parse_CLAUSE # subs, instead of parsing them individually in each parse_TYPE sub. my $parse_func = "parse_$type"; my $struct = $self->$parse_func($query); if ( !$struct ) { PTDEBUG && _d($parse_func, 'failed to parse query'); return; } $struct->{type} = $type; $self->_parse_clauses($struct); # TODO: parse functions if ( @subqueries ) { PTDEBUG && _d('Parsing subqueries'); foreach my $subquery ( @subqueries ) { my $subquery_struct = $self->parse($subquery->{query}); @{$subquery_struct}{keys %$subquery} = values %$subquery; push @{$struct->{subqueries}}, $subquery_struct; } } PTDEBUG && _d('Query struct:', Dumper($struct)); return $struct; } # Sub: _parse_clauses # Parse raw text of clauses into data structures. This sub recurses # to parse the clauses of subqueries. The clauses are read from # and their data structures saved into the $struct parameter. # # Parameters: # $struct - Hashref from which clauses are read (%{$struct->{clauses}}) # and into which data structs are saved (e.g. $struct->{from}=...). sub _parse_clauses { my ( $self, $struct ) = @_; # Parse raw text of clauses and functions. foreach my $clause ( keys %{$struct->{clauses}} ) { # Rename/remove clauses with space in their names, like ORDER BY. if ( $clause =~ m/ / ) { (my $clause_no_space = $clause) =~ s/ /_/g; $struct->{clauses}->{$clause_no_space} = $struct->{clauses}->{$clause}; delete $struct->{clauses}->{$clause}; $clause = $clause_no_space; } my $parse_func = "parse_$clause"; $struct->{$clause} = $self->$parse_func($struct->{clauses}->{$clause}); if ( $clause eq 'select' ) { PTDEBUG && _d('Parsing subquery clauses'); $struct->{select}->{type} = 'select'; $self->_parse_clauses($struct->{select}); } } return; } # Sub: clean_query # Remove spaces, flatten, and normalize some patterns for easier parsing. # # Parameters: # $query - SQL statement # # Returns: # Cleaned $query sub clean_query { my ( $self, $query ) = @_; return unless $query; # Whitespace and comments. $query =~ s/^\s*--.*$//gm; # -- comments $query =~ s/\s+/ /g; # extra spaces/flatten $query =~ s!/\*.*?\*/!!g; # /* comments */ $query =~ s/^\s+//; # leading spaces $query =~ s/\s+$//; # trailing spaces return $query; } # Sub: normalize_keyword_spaces # Normalize spaces around certain SQL keywords. Spaces are added and # removed around certain SQL keywords to make parsing easier. # # Parameters: # $query - SQL statement # # Returns: # Normalized $query sub normalize_keyword_spaces { my ( $self, $query ) = @_; # Add spaces between important tokens to help the parse_* subs. $query =~ s/\b(VALUE(?:S)?)\(/$1 (/i; $query =~ s/\bON\(/on (/gi; $query =~ s/\bUSING\(/using (/gi; # Start of (SELECT subquery). $query =~ s/\(\s+SELECT\s+/(SELECT /gi; return $query; } # Sub: _parse_query # This sub is called by the parse_TYPE subs except parse_insert. # It does two things: remove, save the given keywords, all of which # should appear at the beginning of the query; and, save (but not # remove) the given clauses. The query should start with the values # for the first clause because the query's first word was removed # in parse(). So for "SELECT cols FROM ...", the query given here # is "cols FROM ..." where "cols" belongs to the first clause "columns". # Then the query is walked clause-by-clause, saving each. # # Parameters: # $query - SQL statement with first word (SELECT, INSERT, etc.) removed # $keywords - Compiled regex of keywords that can appear in $query # $first_clause - First clause word to expect in $query # $clauses - Compiled regex of clause words that can appear in $query # # Returns: # Hashref with raw text of clauses sub _parse_query { my ( $self, $query, $keywords, $first_clause, $clauses ) = @_; return unless $query; my $struct = {}; # Save, remove keywords. 1 while $query =~ s/$keywords\s+/$struct->{keywords}->{lc $1}=1, ''/gie; # Go clausing. my @clause = grep { defined $_ } ($query =~ m/\G(.+?)(?:$clauses\s+|\Z)/gci); my $clause = $first_clause, my $value = shift @clause; $struct->{clauses}->{$clause} = $value; PTDEBUG && _d('Clause:', $clause, $value); # All other clauses. while ( @clause ) { $clause = shift @clause; $value = shift @clause; $struct->{clauses}->{lc $clause} = $value; PTDEBUG && _d('Clause:', $clause, $value); } ($struct->{unknown}) = ($query =~ m/\G(.+)/); return $struct; } sub parse_delete { my ( $self, $query ) = @_; if ( $query =~ s/FROM\s+//i ) { my $keywords = qr/(LOW_PRIORITY|QUICK|IGNORE)/i; my $clauses = qr/(FROM|WHERE|ORDER BY|LIMIT)/i; return $self->_parse_query($query, $keywords, 'from', $clauses); } else { die "DELETE without FROM: $query"; } } sub parse_insert { my ( $self, $query ) = @_; return unless $query; my $struct = {}; # Save, remove keywords. my $keywords = qr/(LOW_PRIORITY|DELAYED|HIGH_PRIORITY|IGNORE)/i; 1 while $query =~ s/$keywords\s+/$struct->{keywords}->{lc $1}=1, ''/gie; if ( $query =~ m/ON DUPLICATE KEY UPDATE (.+)/i ) { my $values = $1; die "No values after ON DUPLICATE KEY UPDATE: $query" unless $values; $struct->{clauses}->{on_duplicate} = $values; PTDEBUG && _d('Clause: on duplicate key update', $values); # This clause can be confused for JOIN ... ON in INSERT-SELECT queries, # so we remove the ON DUPLICATE KEY UPDATE clause after extracting its # values. $query =~ s/\s+ON DUPLICATE KEY UPDATE.+//; } # Parse INTO clause. Literal "INTO" is optional. if ( my @into = ($query =~ m/ (?=.*?(?:VALUE|SE(?:T|LECT))) # Avoid a backtracking explosion (?:INTO\s+)? # INTO, optional (.+?)\s+ # table ref (\([^\)]+\)\s+)? # column list, optional (VALUE.?|SET|SELECT)\s+ # start of next clause /xgci) ) { my $tbl = shift @into; # table ref $struct->{clauses}->{into} = $tbl; PTDEBUG && _d('Clause: into', $tbl); my $cols = shift @into; # columns, maybe if ( $cols ) { $cols =~ s/[\(\)]//g; $struct->{clauses}->{columns} = $cols; PTDEBUG && _d('Clause: columns', $cols); } my $next_clause = lc(shift @into); # VALUES, SET or SELECT die "INSERT/REPLACE without clause after table: $query" unless $next_clause; $next_clause = 'values' if $next_clause eq 'value'; my ($values) = ($query =~ m/\G(.+)/gci); die "INSERT/REPLACE without values: $query" unless $values; $struct->{clauses}->{$next_clause} = $values; PTDEBUG && _d('Clause:', $next_clause, $values); } # Save any leftovers. If there are any, parsing missed something. ($struct->{unknown}) = ($query =~ m/\G(.+)/); return $struct; } { # Suppress warnings like "Name "SQLParser::parse_set" used only once: # possible typo at SQLParser.pm line 480." caused by the fact that we # don't call these aliases directly, they're called indirectly using # $parse_func, hence Perl can't see their being called a compile time. no warnings; # INSERT and REPLACE are so similar that they are both parsed # in parse_insert(). *parse_replace = \&parse_insert; } sub parse_select { my ( $self, $query ) = @_; # Keywords are expected to be at the start of the query, so these # that appear at the end are handled separately. Afaik, SELECT is # only statement with optional keywords at the end. Also, these # appear to be the only keywords with spaces instead of _. my @keywords; my $final_keywords = qr/(FOR UPDATE|LOCK IN SHARE MODE)/i; 1 while $query =~ s/\s+$final_keywords/(push @keywords, $1), ''/gie; my $keywords = qr/( ALL |DISTINCT |DISTINCTROW |HIGH_PRIORITY |STRAIGHT_JOIN |SQL_SMALL_RESULT |SQL_BIG_RESULT |SQL_BUFFER_RESULT |SQL_CACHE |SQL_NO_CACHE |SQL_CALC_FOUND_ROWS )/xi; my $clauses = qr/( FROM |WHERE |GROUP\sBY |HAVING |ORDER\sBY |LIMIT |PROCEDURE |INTO OUTFILE )/xi; my $struct = $self->_parse_query($query, $keywords, 'columns', $clauses); # Add final keywords, if any. map { s/ /_/g; $struct->{keywords}->{lc $_} = 1; } @keywords; return $struct; } sub parse_update { my $keywords = qr/(LOW_PRIORITY|IGNORE)/i; my $clauses = qr/(SET|WHERE|ORDER BY|LIMIT)/i; return _parse_query(@_, $keywords, 'tables', $clauses); } sub parse_create { my ($self, $query) = @_; my ($obj, $name) = $query =~ m/ (\S+)\s+ (?:IF NOT EXISTS\s+)? (\S+) /xi; return { object => lc $obj, name => $name, unknown => undef, }; } # Sub: parse_from # Parse a FROM clause, a.k.a. the table references. Does not handle # nested joins. See http://dev.mysql.com/doc/refman/5.1/en/join.html # # Parameters: # $from - FROM clause (with the word "FROM") # # Returns: # Arrayref of hashrefs, one hashref for each table in the order that # the tables appear, like: # (start code) # { # name => 't2', -- table's real name # alias => 'b', -- table's alias, if any # explicit_alias => 1, -- if explicitly aliased with AS # join => { -- if joined to another table, all but first # -- table are because comma implies INNER JOIN # to => 't1', -- table name on left side of join, if this is # -- LEFT JOIN then this is the inner table, if # -- RIGHT JOIN then this is outer table # type => '', -- left, right, inner, outer, cross, natural # condition => 'using', -- on or using, if applicable # columns => ['id'], -- columns for USING condition, if applicable # ansi => 1, -- true of ANSI JOIN, i.e. true if not implicit # -- INNER JOIN due to following a comma # }, # }, # { # name => 't3', # join => { # to => 't2', # type => 'left', # condition => 'on', -- an ON condition is like a WHERE clause so # where => [...] -- this arrayref of predicates appears, see # -- for its structure # }, # }, # (end code) sub parse_from { my ( $self, $from ) = @_; return unless $from; PTDEBUG && _d('Parsing FROM', $from); # Extract the column list from USING(col, ...) clauses else # the inner commas will be captured by $comma_join. my $using_cols; ($from, $using_cols) = $self->remove_using_columns($from); my $funcs; ($from, $funcs) = $self->remove_functions($from); # Table references in a FROM clause are separated either by commas # (comma/theta join, implicit INNER join) or the JOIN keyword (ansi # join). JOIN can be preceded by other keywords like LEFT, RIGHT, # OUTER, etc. There must be spaces before and after JOIN and its # keywords, but there does not have to be spaces before or after a # comma. See http://dev.mysql.com/doc/refman/5.5/en/join.html my $comma_join = qr/(?>\s*,\s*)/; my $ansi_join = qr/(?> \s+ (?:(?:INNER|CROSS|STRAIGHT_JOIN|LEFT|RIGHT|OUTER|NATURAL)\s+)* JOIN \s+ )/xi; my @tbls; # all table refs, a hashref for each my $tbl_ref; # current table ref hashref my $join; # join info hahsref for current table ref foreach my $thing ( split /($comma_join|$ansi_join)/io, $from ) { # We shouldn't parse empty things. die "Error parsing FROM clause" unless $thing; # Strip leading and trailing spaces. $thing =~ s/^\s+//; $thing =~ s/\s+$//; PTDEBUG && _d('Table thing:', $thing); if ( $thing =~ m/\s+(?:ON|USING)\s+/i ) { PTDEBUG && _d("JOIN condition"); # This join condition follows a JOIN (comma joins don't have # conditions). It includes a table ref, ON|USING, and then # the value to ON|USING. my ($tbl_ref_txt, $join_condition_verb, $join_condition_value) = $thing =~ m/^(.+?)\s+(ON|USING)\s+(.+)/i; $tbl_ref = $self->parse_table_reference($tbl_ref_txt); $join->{condition} = lc $join_condition_verb; if ( $join->{condition} eq 'on' ) { # The value for ON can be, as the MySQL manual says, is just # like a WHERE clause. $join->{where} = $self->parse_where($join_condition_value, $funcs); } else { # USING # Although calling parse_columns() works, it's overkill. # This is not a columns def as in "SELECT col1, col2", it's # a simple csv list of column names without aliases, etc. $join->{columns} = $self->_parse_csv(shift @$using_cols); } } elsif ( $thing =~ m/(?:,|JOIN)/i ) { # A comma or JOIN signals the end of the current table ref and # the beginning of the next table ref. Save the current table ref. if ( $join ) { $tbl_ref->{join} = $join; } push @tbls, $tbl_ref; PTDEBUG && _d("Complete table reference:", Dumper($tbl_ref)); # Reset vars for the next table ref. $tbl_ref = undef; $join = {}; # Next table ref becomes the current table ref. It's joined to # the previous table ref either implicitly (comma join) or explicitly # (ansi join). $join->{to} = $tbls[-1]->{tbl}; if ( $thing eq ',' ) { $join->{type} = 'inner'; $join->{ansi} = 0; } else { # ansi join my $type = $thing =~ m/^(.+?)\s+JOIN$/i ? lc $1 : 'inner'; $join->{type} = $type; $join->{ansi} = 1; } } else { # First table ref and comma-joined tables. $tbl_ref = $self->parse_table_reference($thing); PTDEBUG && _d('Table reference:', Dumper($tbl_ref)); } } # Save the last table ref. It's not completed in the loop above because # there's no comma or JOIN after it. if ( $tbl_ref ) { if ( $join ) { $tbl_ref->{join} = $join; } push @tbls, $tbl_ref; PTDEBUG && _d("Complete table reference:", Dumper($tbl_ref)); } return \@tbls; } # Parse a table ref like "tbl", "tbl alias" or "tbl AS alias", where # tbl can be optionally "db." qualified. Also handles FORCE|USE|IGNORE # INDEX hints. Does not handle "FOR JOIN" hint because "JOIN" here gets # confused with the "JOIN" thing in parse_from(). sub parse_table_reference { my ( $self, $tbl_ref ) = @_; return unless $tbl_ref; PTDEBUG && _d('Parsing table reference:', $tbl_ref); my %tbl; # First, check for an index hint. Remove and save it if present. # This can't be included in the $table_ident regex because, for example, # `tbl` FORCE INDEX (foo), makes FORCE look like an implicit alias. if ( $tbl_ref =~ s/ \s+( (?:FORCE|USE|IGNORE)\s (?:INDEX|KEY) \s*\([^\)]+\)\s* )//xi) { $tbl{index_hint} = $1; PTDEBUG && _d('Index hint:', $tbl{index_hint}); } if ( $tbl_ref =~ m/$table_ident/ ) { my ($db_tbl, $as, $alias) = ($1, $2, $3); # XXX my $ident_struct = $self->parse_identifier('table', $db_tbl); $alias =~ s/`//g if $alias; @tbl{keys %$ident_struct} = values %$ident_struct; $tbl{explicit_alias} = 1 if $as; $tbl{alias} = $alias if $alias; } else { die "Table ident match failed"; # shouldn't happen } return \%tbl; } { no warnings; # Why? See same line above. *parse_into = \&parse_from; *parse_tables = \&parse_from; } # This is not your traditional parser, but it works for simple to rather # complex cases, with a few noted and intentional limitations. First, # the limitations: # # * probably doesn't handle every possible operator (see $op) # * doesn't care about grouping with parentheses # * not "fully" tested because the possibilities are infinite # # It works in four steps; let's take this WHERE clause as an example: # # i="x and y" or j in ("and", "or") and x is not null or a between 1 and 10 and sz="this 'and' foo" # # The first step splits the string on and|or, the only two keywords I'm # aware of that join the separate predicates. This step doesn't care if # and|or is really between two predicates or in a string or something else. # The second step is done while the first step is being done: check predicate # "fragments" (from step 1) for operators; save which ones have and don't # have at least one operator. So the result of step 1 and 2 is: # # PREDICATE FRAGMENT OPERATOR # ================================ ======== # i="x Y # and y" N # or j in (" Y # and", " N # or") N # and x is not null Y # or a between 1 Y # and 10 N # and sz="this ' Y # and' foo" N # # The third step runs through the list of pred frags backwards and joins # the current frag to the preceding frag if it does not have an operator. # The result is: # # PREDICATE FRAGMENT OPERATOR # ================================ ======== # i="x and y" Y # N # or j in ("and", "or") Y # N # N # and x is not null Y # or a between 1 and 10 Y # N # and sz="this 'and' foo" Y # N # # The fourth step is similar but not shown: pred frags with unbalanced ' or " # are joined to the preceding pred frag. This fixes cases where a pred frag # has multiple and|or in a string value; e.g. "foo and bar or dog". # # After the pred frags are complete, the parts of these predicates are parsed # and returned in an arrayref of hashrefs like: # # { # predicate => 'and', # column => 'id', # operator => '>=', # value => '42', # } # # Invalid predicates, or valid ones that we can't parse, will cause # the sub to die. sub parse_where { my ( $self, $where, $functions ) = @_; return unless $where; PTDEBUG && _d("Parsing WHERE", $where); # Not all the operators listed at # http://dev.mysql.com/doc/refman/5.1/en/non-typed-operators.html # are supported. E.g. - (minus) is an op but does it ever show up # in a where clause? "col-3=2" is valid (where col=5), but we're # not interested in weird stuff like that. my $op_symbol = qr/ (?: <=(?:>)? |>= |<> |!= |< |> |= )/xi; my $op_verb = qr/ (?: (?:(?:NOT\s)?LIKE) |(?:IS(?:\sNOT\s)?) |(?:(?:\sNOT\s)?BETWEEN) |(?:(?:NOT\s)?IN) ) /xi; my $op_pat = qr/ ( (?> (?:$op_symbol) # don't need spaces around the symbols, e.g.: col=1 |(?:\s+$op_verb) # must have space before verb op, e.g.: col LIKE ... ) )/x; # Step 1 and 2: split on and|or and look for operators. my $offset = 0; my $pred = ""; my @pred; my @has_op; while ( $where =~ m/\b(and|or)\b/gi ) { my $pos = (pos $where) - (length $1); # pos at and|or, not after $pred = substr $where, $offset, ($pos-$offset); push @pred, $pred; push @has_op, $pred =~ m/$op_pat/o ? 1 : 0; $offset = $pos; } # Final predicate fragment: last and|or to end of string. $pred = substr $where, $offset; push @pred, $pred; push @has_op, $pred =~ m/$op_pat/o ? 1 : 0; PTDEBUG && _d("Predicate fragments:", Dumper(\@pred)); PTDEBUG && _d("Predicate frags with operators:", @has_op); # Step 3: join pred frags without ops to preceding pred frag. my $n = scalar @pred - 1; for my $i ( 1..$n ) { $i *= -1; my $j = $i - 1; # preceding pred frag # Two constants in a row, like "TRUE or FALSE", are a special case. # The current pred ($i) will not have an op but in this case it's # not a continuation of the preceding pred ($j) so we don't want to # join them. And there's a special case within this special case: # "BETWEEN 1 AND 10". _is_constant() strips leading AND or OR so # 10 is going to look like an independent constant but really it's # part of the BETWEEN op, so this whole special check is skipped # if the preceding pred contains BETWEEN. Yes, parsing SQL is tricky. next if $pred[$j] !~ m/\s+between\s+/i && $self->_is_constant($pred[$i]); if ( !$has_op[$i] ) { $pred[$j] .= $pred[$i]; $pred[$i] = undef; } } PTDEBUG && _d("Predicate fragments joined:", Dumper(\@pred)); # Step 4: join pred frags with unbalanced ' or " to preceding pred frag. for my $i ( 0..@pred ) { $pred = $pred[$i]; next unless defined $pred; my $n_single_quotes = ($pred =~ tr/'//); my $n_double_quotes = ($pred =~ tr/"//); if ( ($n_single_quotes % 2) || ($n_double_quotes % 2) ) { $pred[$i] .= $pred[$i + 1]; $pred[$i + 1] = undef; } } PTDEBUG && _d("Predicate fragments balanced:", Dumper(\@pred)); # Parse, clean up and save the complete predicates. my @predicates; foreach my $pred ( @pred ) { next unless defined $pred; $pred =~ s/^\s+//; $pred =~ s/\s+$//; my $conj; if ( $pred =~ s/^(and|or)\s+//i ) { $conj = lc $1; } my ($col, $op, $val) = $pred =~ m/^(.+?)$op_pat(.+)$/o; if ( !$col || !$op ) { if ( $self->_is_constant($pred) ) { $val = lc $pred; } else { die "Failed to parse WHERE condition: $pred"; } } # Remove whitespace and lowercase some keywords. if ( $col ) { $col =~ s/\s+$//; $col =~ s/^\(+//; # no unquoted column name begins with ( } if ( $op ) { $op = lc $op; $op =~ s/^\s+//; $op =~ s/\s+$//; } $val =~ s/^\s+//; # No unquoted value ends with ) except FUNCTION(...) if ( ($op || '') !~ m/IN/i && $val !~ m/^\w+\([^\)]+\)$/ ) { $val =~ s/\)+$//; } if ( $val =~ m/NULL|TRUE|FALSE/i ) { $val = lc $val; } if ( $functions ) { $col = shift @$functions if $col =~ m/__FUNC\d+__/; $val = shift @$functions if $val =~ m/__FUNC\d+__/; } push @predicates, { predicate => $conj, left_arg => $col, operator => $op, right_arg => $val, }; } return \@predicates; } # Returns true if the value is a constant. Constants are TRUE, FALSE, # and any signed number. A leading AND or OR keyword is removed first. sub _is_constant { my ( $self, $val ) = @_; return 0 unless defined $val; $val =~ s/^\s*(?:and|or)\s+//; return $val =~ m/^\s*(?:TRUE|FALSE)\s*$/i || $val =~ m/^\s*-?\d+\s*$/ ? 1 : 0; } sub parse_having { my ( $self, $having ) = @_; # TODO return $having; } # GROUP BY {col_name | expr | position} [ASC | DESC], ... [WITH ROLLUP] sub parse_group_by { my ( $self, $group_by ) = @_; return unless $group_by; PTDEBUG && _d('Parsing GROUP BY', $group_by); # Remove special "WITH ROLLUP" clause so we're left with a simple csv list. my $with_rollup = $group_by =~ s/\s+WITH ROLLUP\s*//i; # Parse the identifiers. my $idents = $self->parse_identifiers( $self->_parse_csv($group_by) ); $idents->{with_rollup} = 1 if $with_rollup; return $idents; } # [ORDER BY {col_name | expr | position} [ASC | DESC], ...] sub parse_order_by { my ( $self, $order_by ) = @_; return unless $order_by; PTDEBUG && _d('Parsing ORDER BY', $order_by); my $idents = $self->parse_identifiers( $self->_parse_csv($order_by) ); return $idents; } # [LIMIT {[offset,] row_count | row_count OFFSET offset}] sub parse_limit { my ( $self, $limit ) = @_; return unless $limit; my $struct = { row_count => undef, }; if ( $limit =~ m/(\S+)\s+OFFSET\s+(\S+)/i ) { $struct->{explicit_offset} = 1; $struct->{row_count} = $1; $struct->{offset} = $2; } else { my ($offset, $cnt) = $limit =~ m/(?:(\S+),\s+)?(\S+)/i; $struct->{row_count} = $cnt; $struct->{offset} = $offset if defined $offset; } return $struct; } # Parses the list of values after, e.g., INSERT tbl VALUES (...), (...). # Does not currently parse each set of values; it just splits the list. sub parse_values { my ( $self, $values ) = @_; return unless $values; $values =~ s/^\s*\(//; $values =~ s/\s*\)//; my $vals = $self->_parse_csv( $values, quoted_values => 1, remove_quotes => 0, ); return $vals; } sub parse_set { my ( $self, $set ) = @_; PTDEBUG && _d("Parse SET", $set); return unless $set; my $vals = $self->_parse_csv($set); return unless $vals && @$vals; my @set; foreach my $col_val ( @$vals ) { # Do not remove quotes around the val because quotes let us determine # the value's type. E.g. tbl might be a table, but "tbl" is a string, # and NOW() is the function, but 'NOW()' is a string. my ($col, $val) = $col_val =~ m/^([^=]+)\s*=\s*(.+)/; my $ident_struct = $self->parse_identifier('column', $col); my $set_struct = { %$ident_struct, value => $val, }; PTDEBUG && _d("SET:", Dumper($set_struct)); push @set, $set_struct; } return \@set; } # Split any comma-separated list of values, removing leading # and trailing spaces. sub _parse_csv { my ( $self, $vals, %args ) = @_; return unless $vals; my @vals; if ( $args{quoted_values} ) { # If the vals are quoted, then they can contain commas, like: # "hello, world!", 'batman'. If only we could use Text::CSV, # then we wouldn't write yet another csv parser to handle this, # but Percona Toolkit doesn't like package dependencies, so here's # our light implementation of this classic problem. my $quote_char = ''; VAL: foreach my $val ( split(',', $vals) ) { PTDEBUG && _d("Next value:", $val); # If there's a quote char, then this val is the rest of a previously # quoted and split value. if ( $quote_char ) { PTDEBUG && _d("Value is part of previous quoted value"); # split() removed the comma inside the quoted value, # so add it back else "hello, world" is incorrectly # returned as "hello world". $vals[-1] .= ",$val"; # Quoted and split value is complete when a val ends with the # same quote char that began the split value. if ( $val =~ m/[^\\]*$quote_char$/ ) { if ( $args{remove_quotes} ) { $vals[-1] =~ s/^\s*$quote_char//; $vals[-1] =~ s/$quote_char\s*$//; } PTDEBUG && _d("Previous quoted value is complete:", $vals[-1]); $quote_char = ''; } next VAL; } # Start of new value so strip leading spaces but not trailing # spaces yet because if the next check determines that this is # a quoted and split val, then trailing space is actually space # inside the quoted val, so we want to preserve it. $val =~ s/^\s+//; # A value is quoted *and* split (because there's a comma in the # quoted value) if the vale begins with a quote char and does not # end with that char. E.g.: "foo but not "foo". The val "foo is # the first part of the split value, e.g. "foo, bar". if ( $val =~ m/^(['"])/ ) { PTDEBUG && _d("Value is quoted"); $quote_char = $1; # XXX if ( $val =~ m/.$quote_char$/ ) { PTDEBUG && _d("Value is complete"); $quote_char = ''; if ( $args{remove_quotes} ) { $vals[-1] =~ s/^\s*$quote_char//; $vals[-1] =~ s/$quote_char\s*$//; } } else { PTDEBUG && _d("Quoted value is not complete"); } } else { $val =~ s/\s+$//; } # Save complete value (e.g. foo or "foo" without the quotes), # or save the first part of a quoted and split value; the rest # of such a value will be joined back above. PTDEBUG && _d("Saving value", ($quote_char ? "fragment" : "")); push @vals, $val; } } else { @vals = map { s/^\s+//; s/\s+$//; $_ } split(',', $vals); } return \@vals; } { no warnings; # Why? See same line above. *parse_on_duplicate = \&_parse_csv; } sub parse_columns { my ( $self, $cols ) = @_; PTDEBUG && _d('Parsing columns list:', $cols); my @cols; pos $cols = 0; while (pos $cols < length $cols) { if ($cols =~ m/\G\s*$column_ident\s*(?>,|\Z)/gcxo) { my ($db_tbl_col, $as, $alias) = ($1, $2, $3); # XXX my $ident_struct = $self->parse_identifier('column', $db_tbl_col); $alias =~ s/`//g if $alias; my $col_struct = { %$ident_struct, ($as ? (explicit_alias => 1) : ()), ($alias ? (alias => $alias) : ()), }; push @cols, $col_struct; } else { die "Column ident match failed"; # shouldn't happen } } return \@cols; } # Remove subqueries from query, return modified query and list of subqueries. # Each subquery is replaced with the special token __SQn__ where n is the # subquery's ID. Subqueries are parsed and removed in to out, last to first; # i.e. the last, inner-most subquery is ID 0 and the first, outermost # subquery has the greatest ID. Each subquery ID corresponds to its index in # the list of returned subquery hashrefs after the modified query. __SQ2__ # is subqueries[2]. Each hashref is like: # * query Subquery text # * context scalar, list or identifier # * nested (optional) 1 if nested # This sub does not handle UNION and it expects to that subqueries start # with "(SELECT ". See SQLParser.t for examples. sub remove_subqueries { my ( $self, $query ) = @_; # Find starting pos of all subqueries. my @start_pos; while ( $query =~ m/(\(SELECT )/gi ) { my $pos = (pos $query) - (length $1); push @start_pos, $pos; } # Starting with the inner-most, last subquery, find ending pos of # all subqueries. This is done by counting open and close parentheses # until all are closed. The last closing ) should close the ( that # opened the subquery. No sane regex can help us here for cases like: # (select max(id) from t where col in(1,2,3) and foo='(bar)'). @start_pos = reverse @start_pos; my @end_pos; for my $i ( 0..$#start_pos ) { my $closed = 0; pos $query = $start_pos[$i]; while ( $query =~ m/([\(\)])/cg ) { my $c = $1; $closed += ($c eq '(' ? 1 : -1); last unless $closed; } push @end_pos, pos $query; } # Replace each subquery with a __SQn__ token. my @subqueries; my $len_adj = 0; my $n = 0; for my $i ( 0..$#start_pos ) { PTDEBUG && _d('Query:', $query); my $offset = $start_pos[$i]; my $len = $end_pos[$i] - $start_pos[$i] - $len_adj; PTDEBUG && _d("Subquery $n start", $start_pos[$i], 'orig end', $end_pos[$i], 'adj', $len_adj, 'adj end', $offset + $len, 'len', $len); my $struct = {}; my $token = '__SQ' . $n . '__'; my $subquery = substr($query, $offset, $len, $token); PTDEBUG && _d("Subquery $n:", $subquery); # Adjust len for next outer subquery. This is required because the # subqueries' start/end pos are found relative to one another, so # when a subquery is replaced with its shorter __SQn__ token the end # pos for the other subqueries decreases. The token is shorter than # any valid subquery so the end pos should only decrease. my $outer_start = $start_pos[$i + 1]; my $outer_end = $end_pos[$i + 1]; if ( $outer_start && ($outer_start < $start_pos[$i]) && $outer_end && ($outer_end > $end_pos[$i]) ) { PTDEBUG && _d("Subquery $n nested in next subquery"); $len_adj += $len - length $token; $struct->{nested} = $i + 1; } else { PTDEBUG && _d("Subquery $n not nested"); $len_adj = 0; if ( $subqueries[-1] && $subqueries[-1]->{nested} ) { PTDEBUG && _d("Outermost subquery"); } } # Get subquery context: scalar, list or identifier. if ( $query =~ m/(?:=|>|<|>=|<=|<>|!=|<=>)\s*$token/ ) { $struct->{context} = 'scalar'; } elsif ( $query =~ m/\b(?:IN|ANY|SOME|ALL|EXISTS)\s*$token/i ) { # Add ( ) around __SQn__ for things like "IN(__SQn__)" # unless they're already there. if ( $query !~ m/\($token\)/ ) { $query =~ s/$token/\($token\)/; $len_adj -= 2 if $struct->{nested}; } $struct->{context} = 'list'; } else { # If the subquery is not preceded by an operator (=, >, etc.) # or IN(), EXISTS(), etc. then it should be an identifier, # either a derived table or column. $struct->{context} = 'identifier'; } PTDEBUG && _d("Subquery $n context:", $struct->{context}); # Remove ( ) around subquery so it can be parsed by a parse_TYPE sub. $subquery =~ s/^\s*\(//; $subquery =~ s/\s*\)\s*$//; # Save subquery to struct after modifications above. $struct->{query} = $subquery; push @subqueries, $struct; $n++; } return $query, @subqueries; } sub remove_using_columns { my ($self, $from) = @_; return unless $from; PTDEBUG && _d('Removing cols from USING clauses'); my $using = qr/ \bUSING \s* \( ([^\)]+) \) /xi; my @cols; $from =~ s/$using/push @cols, $1; "USING ($#cols)"/eg; PTDEBUG && _d('FROM:', $from, Dumper(\@cols)); return $from, \@cols; } sub replace_function { my ($func, $funcs) = @_; my ($func_name) = $func =~ m/^(\w+)/; if ( !$ignore_function{uc $func_name} ) { my $n = scalar @$funcs; push @$funcs, $func; return "__FUNC${n}__"; } return $func; } sub remove_functions { my ($self, $clause) = @_; return unless $clause; PTDEBUG && _d('Removing functions from clause:', $clause); my @funcs; $clause =~ s/$function_ident/replace_function($1, \@funcs)/eg; PTDEBUG && _d('Function-stripped clause:', $clause, Dumper(\@funcs)); return $clause, \@funcs; } # Sub: parse_identifiers # Parse an arrayref of identifiers into their parts. Identifiers can be # column names (optionally qualified), expressions, or constants. # GROUP BY and ORDER BY specify a list of identifiers. # # Parameters: # $idents - Arrayref of indentifiers # # Returns: # Arrayref of hashes with each identifier's parts, depending on what kind # of identifier it is. sub parse_identifiers { my ( $self, $idents ) = @_; return unless $idents; PTDEBUG && _d("Parsing identifiers"); my @ident_parts; foreach my $ident ( @$idents ) { PTDEBUG && _d("Identifier:", $ident); my $parts = {}; if ( $ident =~ s/\s+(ASC|DESC)\s*$//i ) { $parts->{sort} = uc $1; # XXX } if ( $ident =~ m/^\d+$/ ) { # Position like 5 PTDEBUG && _d("Positional ident"); $parts->{position} = $ident; } elsif ( $ident =~ m/^\w+\(/ ) { # Function like MIN(col) PTDEBUG && _d("Expression ident"); my ($func, $expr) = $ident =~ m/^(\w+)\(([^\)]*)\)/; $parts->{function} = uc $func; $parts->{expression} = $expr if $expr; } else { # Ref like (table.)column PTDEBUG && _d("Table/column ident"); my ($tbl, $col) = $self->split_unquote($ident); $parts->{table} = $tbl if $tbl; $parts->{column} = $col; } push @ident_parts, $parts; } return \@ident_parts; } sub parse_identifier { my ( $self, $type, $ident ) = @_; return unless $type && $ident; PTDEBUG && _d("Parsing", $type, "identifier:", $ident); my ($func, $expr); if ( $ident =~ m/^\w+\(/ ) { # Function like MIN(col) ($func, $expr) = $ident =~ m/^(\w+)\(([^\)]*)\)/; PTDEBUG && _d('Function', $func, 'arg', $expr); return { col => $ident } unless $expr; # NOW() $ident = $expr; # col from MAX(col) } my %ident_struct; my @ident_parts = map { s/`//g; $_; } split /[.]/, $ident; if ( @ident_parts == 3 ) { @ident_struct{qw(db tbl col)} = @ident_parts; } elsif ( @ident_parts == 2 ) { my @parts_for_type = $type eq 'column' ? qw(tbl col) : $type eq 'table' ? qw(db tbl) : die "Invalid identifier type: $type"; @ident_struct{@parts_for_type} = @ident_parts; } elsif ( @ident_parts == 1 ) { my $part = $type eq 'column' ? 'col' : 'tbl'; @ident_struct{($part)} = @ident_parts; } else { die "Invalid number of parts in $type reference: $ident"; } if ( $self->{Schema} ) { if ( $type eq 'column' && (!$ident_struct{tbl} || !$ident_struct{db}) ) { my $qcol = $self->{Schema}->find_column(%ident_struct); if ( $qcol && @$qcol == 1 ) { @ident_struct{qw(db tbl)} = @{$qcol->[0]}{qw(db tbl)}; } } elsif ( !$ident_struct{db} ) { my $qtbl = $self->{Schema}->find_table(%ident_struct); if ( $qtbl && @$qtbl == 1 ) { $ident_struct{db} = $qtbl->[0]; } } } if ( $func ) { $ident_struct{func} = uc $func; } PTDEBUG && _d($type, "identifier struct:", Dumper(\%ident_struct)); return \%ident_struct; } # Sub: split_unquote # Split and unquote a table name. The table name can be database-qualified # or not, like `db`.`table`. The table name can be backtick-quoted or not. # # Parameters: # $db_tbl - Table name # $default_db - Default database name to return if $db_tbl is not # database-qualified # # Returns: # Array: unquoted database (possibly undef), unquoted table sub split_unquote { my ( $self, $db_tbl, $default_db ) = @_; $db_tbl =~ s/`//g; my ( $db, $tbl ) = split(/[.]/, $db_tbl); if ( !$tbl ) { $tbl = $db; $db = $default_db; } return ($db, $tbl); } # Sub: is_identifier # Determine if something is a schema object identifier. # E.g.: `tbl` is an identifier, but "tbl" is a string and 1 is a number. # See # # Parameters: # $thing - Name of something, including any quoting as it appears in a query. # # Returns: # True of $thing is an identifier, else false. sub is_identifier { my ( $self, $thing ) = @_; # Nothing is not an ident. return 0 unless $thing; # Tables, columns, FUNCTIONS(), etc. cannot be 'quoted' or "quoted" # because that would make them strings, not idents. return 0 if $thing =~ m/\s*['"]/; # Numbers, ints or floats, are not identifiers. return 0 if $thing =~ m/^\s*\d+(?:\.\d+)?\s*$/; # Keywords are not identifiers. return 0 if $thing =~ m/^\s*(?> NULL |DUAL )\s*$/xi; # The column ident really matches everything: db, db.tbl, db.tbl.col, # function(), @@var, etc. return 1 if $thing =~ m/^\s*$column_ident\s*$/; # If the thing isn't quoted and doesn't match our ident pattern, then # it's probably not an ident. return 0; } sub set_Schema { my ( $self, $sq ) = @_; $self->{Schema} = $sq; return; } sub _d { my ($package, undef, $line) = caller 0; @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } map { defined $_ ? $_ : 'undef' } @_; print STDERR "# $package:$line $PID ", join(' ', @_), "\n"; } 1; } # ########################################################################### # End SQLParser package # ###########################################################################