diff --git a/bin/pt-stalk b/bin/pt-stalk index 93c79cc5..b8a99e9c 100755 --- a/bin/pt-stalk +++ b/bin/pt-stalk @@ -1210,6 +1210,37 @@ stalk() { purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME" fi done + + # Before exiting, the last collector may still be running. + # Wait for it to finish in case the tool is part of a script, + # or part of a test, so the caller has access to the collected + # data when the tool exists. collect() waits an additional + # --run-time seconds for itself to complete, which means we + # have to wait for 2 * run-time like it plus some overhead else + # we may get in sync with the collector and kill it a microsecond + # before it kills itself, thus 3 * run-time. + # https://bugs.launchpad.net/percona-toolkit/+bug/1070434 + if [ "$(jobs)" ]; then + local sleep_time=$((OPT_RUN_TIME * 3)) + log "Waiting up to $sleep_time seconds for collectors to finish..." + local slept=0 + while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do + sleep 1 + slept=$((slept + 1)) + done + + if [ "$(jobs)" ]; then + for pid in $(jobs -p); do + # This isn't an warning (we don't want exit status 1) because + # the system may be running slowly so it's just "natural" that + # a collector may get stuck or run really slowly. + log "Killing collector $pid" + kill $pid >/dev/null 2>&1 + done + else + log "All collectors have finished" + fi + fi } # ########################################################################### @@ -1754,6 +1785,18 @@ than L<"--sleep">. It is usually not necessary to change this; if the default 30 seconds hasn't gathered enough diagnostic data, running longer is not likely to do so. In fact, in many cases a shorter collection period is appropriate. +This value is used two other times. After collecting, the collect subprocess +will wait another L<"--run-time"> seconds for its commands to finish. Some +commands can take awhile if the system is running very slowly (which can +likely be the case given that a collection was triggered). Since empty files +are deleted, the extra wait gives commands time to finish and write their +data. The value is potentially used again just before the tool exits to wait +again for any collect subprocesses to finish. In most cases this won't +happen because of the aforementioned extra wait. If it happens, the tool +will log "Waiting up to N seconds for collectors to finish..." where N is +three times L<"--run-time">. In both cases, after waiting, the tool kills +all of its subprocesses. + =item --sleep type: int; default: 300 diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index ed905eb0..30048726 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -86,20 +86,21 @@ ok( "Creates --dest (collect) dir" ); -chomp($pid = `cat $pid_file`); +chomp($pid = `cat $pid_file 2>/dev/null`); $retval = system("kill -0 $pid"); is( $retval >> 0, 0, - "pt-stalk is running ($pid)" + "pt-stalk is running" ); -$output = `cat $log_file`; +PerconaTest::wait_for_sh("grep -q 'Check results' $log_file >/dev/null"); +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/, "Check results logged" -); +) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`); $retval = system("kill $pid 2>/dev/null"); is( @@ -108,19 +109,19 @@ is( "Killed pt-stalk" ); -sleep 1; +PerconaTest::wait_until(sub { !-f $pid_file }); ok( ! -f $pid_file, "Removes PID file" ); -$output = `cat $log_file`; +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Caught signal, exiting/, "Caught signal logged" -); +) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`); # ########################################################################### # Test collect. @@ -134,42 +135,61 @@ diag(`rm $dest/* 2>/dev/null`); my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'"); my $threshold = $uptime + 2; -$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); +$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -sleep 3; +PerconaTest::wait_until(sub { !-f $pid_file }); -$output = `cat $dest/*-trigger`; +$output = `cat $dest/*-trigger 2>/dev/null`; like( $output, qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/, "Collect triggered" +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, ); # There is some nondeterminism here. Sometimes it'll run for 2 samples because # the samples may not be precisely 1 second apart. -chomp($output = `cat $dest/*-df | grep -c '^TS'`); +chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`); ok( $output >= 1 && $output <= 3, "Collect ran for --run-time" -) or diag("Run time: $output"); +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, +); ok( PerconaTest::not_running("pt-stalk --iterations 1"), "pt-stalk is not running" ); -$output = `cat $dest/*-trigger`; +$output = `cat $dest/*-trigger 2>/dev/null`; like( $output, qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/, "Trigger file logs how pt-stalk was ran" ); -chomp($output = `cat $log_file | grep 'Collector PID'`); +chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`); like( $output, qr/Collector PID \d+/, "Collector PID logged" +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, ); # ########################################################################### @@ -184,9 +204,9 @@ $threshold = $uptime + 2; $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -sleep 2; +PerconaTest::wait_until(sub { !-f $pid_file }); -$output = `cat $log_file`; +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Collect triggered/, @@ -196,7 +216,7 @@ like( ok( ! -f "$dest/*", "No files collected" -); +) or diag(`ls -l $dest/ 2>/dev/null`); ok( PerconaTest::not_running("pt-stalk --no-collect"), @@ -221,7 +241,7 @@ is( "Killed pt-stalk" ); -$output = `cat $log_file`; +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Check results: Aborted_connects=|variable=Aborted_connects/, @@ -238,23 +258,32 @@ diag(`rm $pid_file 2>/dev/null`); diag(`rm $log_file 2>/dev/null`); diag(`rm $dest/* 2>/dev/null`); -$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk -- --defaults-file=$cnf >$log_file 2>&1"); +$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -PerconaTest::wait_for_files("$dest/nostalk-trigger"); -$output = `cat $dest/nostalk-trigger`; +PerconaTest::wait_until(sub { !-f $pid_file }); + +$output = `cat $dest/nostalk-trigger 2>/dev/null`; like( $output, qr/Not stalking/, "Not stalking, collect triggered" +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, ); -PerconaTest::wait_for_files("$dest/nostalk-hostname"); -PerconaTest::wait_for_sh("test \$(grep -c '^TS' $dest/nostalk-df) -ge 2"); -chomp($output = `grep -c '^TS' $dest/nostalk-df`); +chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`); is( $output, 2, "Not stalking, collect ran for --run-time" +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, ); my $vmstat = `which vmstat 2>/dev/null`; @@ -269,14 +298,24 @@ SKIP: { }; is( - `cat $dest/nostalk-hostname`, + `cat $dest/nostalk-hostname 2>/dev/null`, `hostname`, "Not stalking, collect gathered data" +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, ); ok( PerconaTest::not_running("pt-stalk --no-stalk"), "Not stalking, pt-stalk is not running" +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, ); # ############################################################################