Include serialize_list() and deserialize_list() to roundtrip through a DB.

2025-12-12 02:00:49 +08:00 · 2011-12-28 16:44:49 -03:00
parent 56375da166
commit 88e82d665d
2 changed files with 135 additions and 1 deletions
--- a/lib/Quoter.pm
+++ b/lib/Quoter.pm
@@ -144,6 +144,68 @@ sub join_quote {
   return $db ? "$db.$tbl" : $tbl;
 }
 # Nothing much going on here. Return the list passed in,
 # with the elements passed through quotemeta, and the results
 # concatenated with ','
 sub serialize_list {
   my @args = @_;
   if ( @args && $args[-1] eq '' ) {
      # If the last element is an empty string, it conflicts
      # with the assumptions of the somewhat lax regex below,
      # which always leaves an empty element in the end.
      # We could fix the regex, but it's a lot of extra
      # complexity for little gain, or we could add a
      # special-case here. Just by tagging another empty
      # string, we get the desired result.
      push @args, '';
   }
   return join ',', map { quotemeta } @args;
 }
 sub deserialize_list {
   my ( $string )      = @_;
   my @escaped_parts   = $string =~ /
         \G             # Start of string, or end of previous match.
         (              # Each of these is an element in the original list.
            [^\\,]*     # Anything not a backslash or a comma
            (?:         # When we get here, we found one of the above.
               \\.      # A backslash followed by something means we can continue
               [^\\,]*  # Same as above.
            )*          # Repeat zero of more times.
         )
         (?:,|\z)       # Comma dividing elements, or absolute end of the string.
      /sxg;
   pop @escaped_parts; # Last element will always be empty. Flaw in the regex.
                       # But easier to fix this way. Faster, too.
   my @unescaped_parts = map {
         # Undo the quotemeta().
         my $part = $_;
         # Here be weirdness. Unfortunately quotemeta() is broken, and exposes
         # the internal representation of scalars. Namely, the latin-1 range,
         # \128-\377 (\p{Latin1} in newer Perls) is all escaped in downgraded
         # strings, but left alone in UTF-8 strings. Thus, this.
            # TODO: quotemeta() might change in 5.16 to mean
            # qr/(?=\p{ASCII})\W|\p{Pattern_Syntax}/
            # And also fix this whole weird behavior under
            # use feature 'unicode_strings' --  If/once that's
            # implemented, this will have to change.
         my $char_class = utf8::is_utf8($part)  # If it's a UTF-8 string,
                        ? qr/(?=\p{ASCII})\W/   # We only care about non-word
                                                # characters in the ASCII range
                        : qr/(?=\p{ASCII})\W|[\x{80}-\x{FF}]/; # Otherwise,
                                                # same as above, but also
                                                # unescape the latin-1 range.
         $part =~ s/\\($char_class)/$1/g;
                  # As a somewhat uplifting note, all of the above is more
                  # or less fixed in newer Perls! quotemeta() is still
                  # broken, but regexen can deal with it more naturally.
         $part;
      } @escaped_parts;
   return @unescaped_parts;
 }
 1;
 }
 # ###########################################################################
--- a/t/lib/Quoter.t
+++ b/t/lib/Quoter.t
@@ -9,7 +9,7 @@ BEGIN {
 use strict;
 use warnings FATAL => 'all';
 use English qw(-no_match_vars);
-use Test::More tests => 31;
+use Test::More tests => 63;
 use Quoter;
 use PerconaTest;
@@ -99,4 +99,76 @@ is( $q->join_quote('`db`', '`tbl`'), '`db`.`tbl`', 'join_merge(`db`, `tbl`)' );
 is( $q->join_quote(undef, '`tbl`'), '`tbl`', 'join_merge(undef, `tbl`)'  );
 is( $q->join_quote('`db`', '`foo`.`tbl`'), '`foo`.`tbl`', 'join_merge(`db`, `foo`.`tbl`)' );
 my @serialize_tests = (
   [ 'a', 'b', ],
   [ 'a,', 'b', ],
   [ "a,\\\nc\nas", 'b', ],
   [ 'a\\\,a', 'c', ],
   [ 'a\\\\,a', 'c', ],
   [ 'a\\\\\,aa', 'c', ],
   [ 'a\\\\\\,aa', 'c', ],
   [ 'a\\\,a,a', 'c,d,e,d,', ],
   [ "\\\,\x{e8},a", '!!!!__!*`,`\\', ], # Latin-1
   [ "\x{30cb}\\\,\x{e8},a", '!!!!__!*`,`\\', ], # UTF-8
   [ ",,,,,,,,,,,,,,", ",", ],
   [ "\\,\\,\\,\\,\\,\\,\\,\\,\\,\\,\\,,,,\\", ":(", ],
   [ "asdfa", "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\,a", ],
   [ 1, 2 ],
   [ 7, 9 ],
   [ '', '', '', ],
 );
 for my $test ( @serialize_tests ) {
   my $ser = Quoter::serialize_list( @$test );
   is_deeply(
      [Quoter::deserialize_list($ser)],
      $test,
      "serialize then deserialize works"
   );
 }
 use DSNParser;
 use Sandbox;
 my $dp = new DSNParser(opts=>$dsn_opts);
 my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
 my $dbh = $sb->get_dbh_for('master');
 SKIP: {
   skip 'Cannot connect to sandbox master', 1 unless $dbh;
   $dbh->do('CREATE DATABASE IF NOT EXISTS serialize_test');
   $dbh->do('DROP TABLE IF EXISTS serialize_test.serialize');
   $dbh->do('CREATE TABLE serialize_test.serialize (id INT, foo TEXT)');
   my $sth    = $dbh->prepare(
      "INSERT INTO serialize_test.serialize (id, foo) VALUES (?, ?)"
   );
   my $selsth = $dbh->prepare(
      "SELECT foo FROM serialize_test.serialize WHERE id=? LIMIT 1"
   );
   for my $test_index ( 0..$#serialize_tests ) {
      my $ser = Quoter::serialize_list( @{$serialize_tests[$test_index]} );
      # Bit of a hack, but we want to test both of Perl's internal encodings
      # for correctness.
      local $dbh->{'mysql_enable_utf8'} = 1 if utf8::is_utf8($ser);
      $sth->execute($test_index, $ser);
      $selsth->execute($test_index);
      is_deeply(
         [Quoter::deserialize_list($selsth->fetchrow_array())],
         $serialize_tests[$test_index],
         "serialize then deserialize through the DB works"
      );
   }
   $sth->finish();
   $selsth->finish();
   $dbh->do("DROP DATABASE serialize_test");
   $dbh->disconnect();
 };
 exit;