diff --git a/bin/pt-query-digest b/bin/pt-query-digest index cc2c7cfe..43cc22cb 100755 --- a/bin/pt-query-digest +++ b/bin/pt-query-digest @@ -11745,6 +11745,9 @@ sub add { push @{$self->{procs}}, $process; push @{$self->{names}}, $name; + if ( my $n = $args{retry_on_error} ) { + $self->{retries}->{$name} = $n; + } if ( $self->{instrument} ) { $self->{instrumentation}->{$name} = { time => 0, calls => 0 }; } @@ -11809,10 +11812,28 @@ sub execute { } }; if ( $EVAL_ERROR ) { - warn "Pipeline process $procno (" - . ($self->{names}->[$procno] || "") - . ") caused an error: $EVAL_ERROR"; - die $EVAL_ERROR unless $self->{continue_on_error}; + my $name = $self->{names}->[$procno] || ""; + my $msg = "Pipeline process " . ($procno + 1) + . " ($name) caused an error: " + . $EVAL_ERROR; + if ( defined $self->{retries}->{$name} ) { + my $n = $self->{retries}->{$name}; + if ( $n ) { + warn $msg . "Will retry pipeline process $procno ($name) " + . "$n more " . ($n > 1 ? "times" : "time") . ".\n"; + $self->{retries}->{$name}--; + } + else { + die $msg . "Terminating pipeline because process $procno " + . "($name) caused too many errors.\n"; + } + } + elsif ( !$self->{continue_on_error} ) { + die $msg; + } + else { + warn $msg; + } } } @@ -12562,8 +12583,14 @@ sub main { { # iteration $pipeline->add( - name => 'iteration', - process => sub { + # This is a critical proc: if we die here, we probably need + # to stop, else an infinite loop can develop: + # https://bugs.launchpad.net/percona-toolkit/+bug/888114 + # We'll retry twice in case the problem is just one bad + # query class, or something like that. + retry_on_error => 2, + name => 'iteration', + process => sub { my ( $args ) = @_; # Start the (next) iteration. @@ -13239,6 +13266,11 @@ sub print_reports { ); if ( $o->get('report') ) { + # XXX There's a bug here: --expected-range '','' will cause + # Use of uninitialized value in numeric lt (<) + # This bug is intentionally left unfixed at the moment because + # we exploit it to test a more serious bug: an infinite loop: + # https://bugs.launchpad.net/percona-toolkit/+bug/888114 my $expected_range = $o->get('expected-range'); my $explain_why = $expected_range && ( @$worst < $expected_range->[0] diff --git a/lib/Pipeline.pm b/lib/Pipeline.pm index 1ebd01a5..9a890716 100644 --- a/lib/Pipeline.pm +++ b/lib/Pipeline.pm @@ -71,6 +71,9 @@ sub add { push @{$self->{procs}}, $process; push @{$self->{names}}, $name; + if ( my $n = $args{retry_on_error} ) { + $self->{retries}->{$name} = $n; + } if ( $self->{instrument} ) { $self->{instrumentation}->{$name} = { time => 0, calls => 0 }; } @@ -156,10 +159,28 @@ sub execute { } }; if ( $EVAL_ERROR ) { - warn "Pipeline process $procno (" - . ($self->{names}->[$procno] || "") - . ") caused an error: $EVAL_ERROR"; - die $EVAL_ERROR unless $self->{continue_on_error}; + my $name = $self->{names}->[$procno] || ""; + my $msg = "Pipeline process " . ($procno + 1) + . " ($name) caused an error: " + . $EVAL_ERROR; + if ( defined $self->{retries}->{$name} ) { + my $n = $self->{retries}->{$name}; + if ( $n ) { + warn $msg . "Will retry pipeline process $procno ($name) " + . "$n more " . ($n > 1 ? "times" : "time") . ".\n"; + $self->{retries}->{$name}--; + } + else { + die $msg . "Terminating pipeline because process $procno " + . "($name) caused too many errors.\n"; + } + } + elsif ( !$self->{continue_on_error} ) { + die $msg; + } + else { + warn $msg; + } } } diff --git a/t/lib/Pipeline.t b/t/lib/Pipeline.t index 720e40ab..6cb5f0c3 100644 --- a/t/lib/Pipeline.t +++ b/t/lib/Pipeline.t @@ -9,7 +9,7 @@ BEGIN { use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); -use Test::More tests => 10; +use Test::More tests => 11; use Time::HiRes qw(usleep); @@ -206,11 +206,13 @@ is( $oktorun = 1; $pipeline = new Pipeline(continue_on_error=>1); -my @die = qw(1 0); +my @called = (); +my @die = qw(1 0); $pipeline->add( name => 'proc1', process => sub { my ( $args ) = @_; + push @called, 'proc1'; die "I'm an error" if shift @die; return $args; }, @@ -218,7 +220,7 @@ $pipeline->add( $pipeline->add( name => 'proc2', process => sub { - print "proc2"; + push @called, 'proc2'; $oktorun = 0; return; }, @@ -229,12 +231,48 @@ $output = output( stderr => 1, ); -like( - $output, - qr/0 \(proc1\) caused an error.+proc2/s, +is_deeply( + \@called, + [qw(proc1 proc1 proc2)], "Continues on error" ); +# Override global for critical procs that must kill the pipeline if they die. +$oktorun = 1; +$pipeline = new Pipeline(continue_on_error=>1); + +@called = (); +$pipeline->add( + name => 'proc1', + process => sub { + my ( $args ) = @_; + push @called, 'proc1'; + $oktorun = 0 if @called > 8; + return $args; + }, +); +$pipeline->add( + retry_on_error => 2, + name => 'proc2', + process => sub { + push @called, 'proc2'; + die; + }, +); + +$output = output( + sub { $pipeline->execute(%args) }, + stderr => 1, +); + +is_deeply( + \@called, + [qw(proc1 proc2), # first attempt + qw(proc1 proc2), # retry 1/2 + qw(proc1 proc2)], # retry 2/2 + "Retry proc" +) or diag($output); + # ############################################################################# # Done. # ############################################################################# diff --git a/t/pt-query-digest/continue_on_error.t b/t/pt-query-digest/continue_on_error.t index 2590c3bb..dd7ad714 100644 --- a/t/pt-query-digest/continue_on_error.t +++ b/t/pt-query-digest/continue_on_error.t @@ -9,9 +9,12 @@ BEGIN { use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); -use Test::More tests => 2; +use Test::More tests => 3; use PerconaTest; +shift @INC; # our unshift (above) +shift @INC; # PerconaTest's unshift +require "$trunk/bin/pt-query-digest"; my $output; @@ -30,6 +33,25 @@ like( 'Continues on error by default' ); +# ############################################################################# +# Infinite loop in pt-query-digest if a report crashe +# https://bugs.launchpad.net/percona-toolkit/+bug/888114 +# ############################################################################# + +# This bug is due to the fact that --continue-on-error is on by default. +# To reproduce the problem, we must intentionally crash pt-query-digest +# in the right place, which means we're using another bug:a +$output = output( + sub { pt_query_digest::main("$trunk/t/lib/samples/slowlogs/slow002.txt", + "--expected-range", "'',''") }, + stderr => 1, +); + +like( + $output, + qr/Argument \S+ isn't numeric/, + "Report crashed, but no infinite loop (bug 888114)" +); # ############################################################################# # Done. diff --git a/t/pt-query-digest/mirror.t b/t/pt-query-digest/mirror.t index 1b021c5b..23d851c5 100644 --- a/t/pt-query-digest/mirror.t +++ b/t/pt-query-digest/mirror.t @@ -36,9 +36,6 @@ my $cmd; my $pid_file = "/tmp/pt-query-digest-mirror-test.pid"; diag(`rm $pid_file 2>/dev/null`); -my $pid_file = '/tmp/pt-query-digest.test.pid'; -`rm -rf $pid_file >/dev/null`; - # ########################################################################## # Tests for swapping --processlist and --execute # ##########################################################################