diff --git a/bin/pt-stalk b/bin/pt-stalk index 93c79cc5..5a360b7d 100755 --- a/bin/pt-stalk +++ b/bin/pt-stalk @@ -1210,6 +1210,37 @@ stalk() { purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME" fi done + + # Before exiting, the last collector may still be running. + # Wait for it to finish in case the tool is part of a script, + # or part of a test, so the caller has access to the collected + # data when the tool exists. collect() waits an additional + # --run-time seconds for itself to complete, which means we + # have to wait for 2 * run-time like it plus some overhead else + # we may get in sync with the collector and kill it a microsecond + # before it kills itself, thus 3 * run-time. + # https://bugs.launchpad.net/percona-toolkit/+bug/1070434 + if [ "$(jobs)" ]; then + local sleep_time=$((OPT_RUN_TIME * 3)) + log "Waiting up to $sleep_time seconds for collectors to finish..." + local slept=0 + while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do + sleep 1 + slept=$((slept + 1)) + done + + if [ "$(jobs)" ]; then + for pid in $(jobs -p); do + # This isn't an warning (we don't want exit status 1) because + # the system may be running slowly so it's just "natural" that + # a collector may get stuck or run really slowly. + log "Killing collector $pid" + kill $pid >/dev/null 2>&1 + done + else + log "All collectors have finished" + fi + fi } # ########################################################################### diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index ed905eb0..05aacd09 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -134,43 +134,55 @@ diag(`rm $dest/* 2>/dev/null`); my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'"); my $threshold = $uptime + 2; -$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); +$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -sleep 3; +PerconaTest::wait_until(sub { !-f $pid_file }); -$output = `cat $dest/*-trigger`; +$output = `cat $dest/*-trigger 2>/dev/null`; like( $output, qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/, "Collect triggered" +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, ); # There is some nondeterminism here. Sometimes it'll run for 2 samples because # the samples may not be precisely 1 second apart. -chomp($output = `cat $dest/*-df | grep -c '^TS'`); +chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`); ok( $output >= 1 && $output <= 3, "Collect ran for --run-time" -) or diag("Run time: $output"); +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, +); ok( PerconaTest::not_running("pt-stalk --iterations 1"), "pt-stalk is not running" ); -$output = `cat $dest/*-trigger`; +$output = `cat $dest/*-trigger 2>/dev/null`; like( $output, qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/, "Trigger file logs how pt-stalk was ran" ); -chomp($output = `cat $log_file | grep 'Collector PID'`); +chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`); like( $output, qr/Collector PID \d+/, "Collector PID logged" -); +) or diag('output', $output, 'log file', `cat $log_file 2>/dev/null`); # ########################################################################### # Triggered but --no-collect. @@ -184,7 +196,7 @@ $threshold = $uptime + 2; $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -sleep 2; +PerconaTest::wait_until(sub { !-f $pid_file }); $output = `cat $log_file`; like(