From 8ef6ee451f754495760c1c5afb4b6c0069483455 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 23 Oct 2012 14:42:41 -0600 Subject: [PATCH 1/6] Wait for collectors before exiting. Use wait_until instead of sleep in pt-stalk.t, and add lots of diag info when tests fail. --- bin/pt-stalk | 31 +++++++++++++++++++++++++++++++ t/pt-stalk/pt-stalk.t | 30 +++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/bin/pt-stalk b/bin/pt-stalk index 93c79cc5..5a360b7d 100755 --- a/bin/pt-stalk +++ b/bin/pt-stalk @@ -1210,6 +1210,37 @@ stalk() { purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME" fi done + + # Before exiting, the last collector may still be running. + # Wait for it to finish in case the tool is part of a script, + # or part of a test, so the caller has access to the collected + # data when the tool exists. collect() waits an additional + # --run-time seconds for itself to complete, which means we + # have to wait for 2 * run-time like it plus some overhead else + # we may get in sync with the collector and kill it a microsecond + # before it kills itself, thus 3 * run-time. + # https://bugs.launchpad.net/percona-toolkit/+bug/1070434 + if [ "$(jobs)" ]; then + local sleep_time=$((OPT_RUN_TIME * 3)) + log "Waiting up to $sleep_time seconds for collectors to finish..." + local slept=0 + while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do + sleep 1 + slept=$((slept + 1)) + done + + if [ "$(jobs)" ]; then + for pid in $(jobs -p); do + # This isn't an warning (we don't want exit status 1) because + # the system may be running slowly so it's just "natural" that + # a collector may get stuck or run really slowly. + log "Killing collector $pid" + kill $pid >/dev/null 2>&1 + done + else + log "All collectors have finished" + fi + fi } # ########################################################################### diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index ed905eb0..05aacd09 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -134,43 +134,55 @@ diag(`rm $dest/* 2>/dev/null`); my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'"); my $threshold = $uptime + 2; -$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); +$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -sleep 3; +PerconaTest::wait_until(sub { !-f $pid_file }); -$output = `cat $dest/*-trigger`; +$output = `cat $dest/*-trigger 2>/dev/null`; like( $output, qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/, "Collect triggered" +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, ); # There is some nondeterminism here. Sometimes it'll run for 2 samples because # the samples may not be precisely 1 second apart. -chomp($output = `cat $dest/*-df | grep -c '^TS'`); +chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`); ok( $output >= 1 && $output <= 3, "Collect ran for --run-time" -) or diag("Run time: $output"); +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, +); ok( PerconaTest::not_running("pt-stalk --iterations 1"), "pt-stalk is not running" ); -$output = `cat $dest/*-trigger`; +$output = `cat $dest/*-trigger 2>/dev/null`; like( $output, qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/, "Trigger file logs how pt-stalk was ran" ); -chomp($output = `cat $log_file | grep 'Collector PID'`); +chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`); like( $output, qr/Collector PID \d+/, "Collector PID logged" -); +) or diag('output', $output, 'log file', `cat $log_file 2>/dev/null`); # ########################################################################### # Triggered but --no-collect. @@ -184,7 +196,7 @@ $threshold = $uptime + 2; $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -sleep 2; +PerconaTest::wait_until(sub { !-f $pid_file }); $output = `cat $log_file`; like( From 36ca6bbc234f01eb2649300631af564656288595 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 23 Oct 2012 16:54:45 -0600 Subject: [PATCH 2/6] Make "not stalking" tests more reliable. --- t/pt-stalk/pt-stalk.t | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index 05aacd09..3e5ac8da 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -198,7 +198,7 @@ $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest PerconaTest::wait_until(sub { !-f $pid_file }); -$output = `cat $log_file`; +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Collect triggered/, @@ -250,19 +250,18 @@ diag(`rm $pid_file 2>/dev/null`); diag(`rm $log_file 2>/dev/null`); diag(`rm $dest/* 2>/dev/null`); -$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk -- --defaults-file=$cnf >$log_file 2>&1"); +$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); -PerconaTest::wait_for_files("$dest/nostalk-trigger"); -$output = `cat $dest/nostalk-trigger`; +PerconaTest::wait_until(sub { !-f $pid_file }); + +$output = `cat $dest/nostalk-trigger 2>/dev/null`; like( $output, qr/Not stalking/, "Not stalking, collect triggered" ); -PerconaTest::wait_for_files("$dest/nostalk-hostname"); -PerconaTest::wait_for_sh("test \$(grep -c '^TS' $dest/nostalk-df) -ge 2"); -chomp($output = `grep -c '^TS' $dest/nostalk-df`); +chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`); is( $output, 2, @@ -281,7 +280,7 @@ SKIP: { }; is( - `cat $dest/nostalk-hostname`, + `cat $dest/nostalk-hostname 2>/dev/null`, `hostname`, "Not stalking, collect gathered data" ); From 78e1f95257e8bb91e8d1b382b75759b348f0f198 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 23 Oct 2012 17:20:05 -0600 Subject: [PATCH 3/6] Really wait for expected data in files. --- t/pt-stalk/pt-stalk.t | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index 3e5ac8da..166fb7ab 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -86,20 +86,21 @@ ok( "Creates --dest (collect) dir" ); -chomp($pid = `cat $pid_file`); +chomp($pid = `cat $pid_file 2>/dev/null`); $retval = system("kill -0 $pid"); is( $retval >> 0, 0, - "pt-stalk is running ($pid)" + "pt-stalk is running" ); -$output = `cat $log_file`; +PerconaTest::wait_for_sh("grep -q 'Check results' $log_file >/dev/null"); +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/, "Check results logged" -); +) or diag(`cat $log_file 2>/dev/null`); $retval = system("kill $pid 2>/dev/null"); is( @@ -108,19 +109,19 @@ is( "Killed pt-stalk" ); -sleep 1; +PerconaTest::wait_until(sub { !-f $pid_file }); ok( ! -f $pid_file, "Removes PID file" ); -$output = `cat $log_file`; +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Caught signal, exiting/, "Caught signal logged" -); +) or diag(`cat $log_file 2>/dev/null`); # ########################################################################### # Test collect. From d97912ef1c084c228bab3c1db4c619cec4687e08 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 23 Oct 2012 17:31:05 -0600 Subject: [PATCH 4/6] Even more waiting for _really_ slow Ubuntu 12 box. --- t/pt-stalk/pt-stalk.t | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index 166fb7ab..8f18155a 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -260,14 +260,14 @@ like( $output, qr/Not stalking/, "Not stalking, collect triggered" -); +) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`); is( $output, 2, "Not stalking, collect ran for --run-time" -); +) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); my $vmstat = `which vmstat 2>/dev/null`; SKIP: { @@ -284,12 +284,12 @@ is( `cat $dest/nostalk-hostname 2>/dev/null`, `hostname`, "Not stalking, collect gathered data" -); +) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); ok( PerconaTest::not_running("pt-stalk --no-stalk"), "Not stalking, pt-stalk is not running" -); +) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); # ############################################################################ # bad "find" usage in purge_samples gives From c077b1eb3d57754d8024cf1c850992cbd37d325f Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 23 Oct 2012 17:51:59 -0600 Subject: [PATCH 5/6] Include collector output in diag. --- t/pt-stalk/pt-stalk.t | 61 +++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/t/pt-stalk/pt-stalk.t b/t/pt-stalk/pt-stalk.t index 8f18155a..30048726 100644 --- a/t/pt-stalk/pt-stalk.t +++ b/t/pt-stalk/pt-stalk.t @@ -100,7 +100,7 @@ like( $output, qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/, "Check results logged" -) or diag(`cat $log_file 2>/dev/null`); +) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`); $retval = system("kill $pid 2>/dev/null"); is( @@ -121,7 +121,7 @@ like( $output, qr/Caught signal, exiting/, "Caught signal logged" -) or diag(`cat $log_file 2>/dev/null`); +) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`); # ########################################################################### # Test collect. @@ -146,10 +146,11 @@ like( "Collect triggered" ) or diag( - 'output', $output, - 'log file', `cat $log_file 2>/dev/null`, - 'dest', `ls -l $dest/ 2>/dev/null`, - 'df', `cat $dest/*-df 2>/dev/null`, + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, ); # There is some nondeterminism here. Sometimes it'll run for 2 samples because @@ -160,10 +161,11 @@ ok( "Collect ran for --run-time" ) or diag( - 'output', $output, - 'log file', `cat $log_file 2>/dev/null`, - 'dest', `ls -l $dest/ 2>/dev/null`, - 'df', `cat $dest/*-df 2>/dev/null`, + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, + 'dest', `ls -l $dest/ 2>/dev/null`, + 'df', `cat $dest/*-df 2>/dev/null`, ); ok( @@ -183,7 +185,12 @@ like( $output, qr/Collector PID \d+/, "Collector PID logged" -) or diag('output', $output, 'log file', `cat $log_file 2>/dev/null`); +) +or diag( + 'output', $output, + 'log file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, +); # ########################################################################### # Triggered but --no-collect. @@ -209,7 +216,7 @@ like( ok( ! -f "$dest/*", "No files collected" -); +) or diag(`ls -l $dest/ 2>/dev/null`); ok( PerconaTest::not_running("pt-stalk --no-collect"), @@ -234,7 +241,7 @@ is( "Killed pt-stalk" ); -$output = `cat $log_file`; +$output = `cat $log_file 2>/dev/null`; like( $output, qr/Check results: Aborted_connects=|variable=Aborted_connects/, @@ -260,14 +267,24 @@ like( $output, qr/Not stalking/, "Not stalking, collect triggered" -) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, +); chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`); is( $output, 2, "Not stalking, collect ran for --run-time" -) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, +); my $vmstat = `which vmstat 2>/dev/null`; SKIP: { @@ -284,12 +301,22 @@ is( `cat $dest/nostalk-hostname 2>/dev/null`, `hostname`, "Not stalking, collect gathered data" -) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, +); ok( PerconaTest::not_running("pt-stalk --no-stalk"), "Not stalking, pt-stalk is not running" -) or diag(`cat $log_file 2>/dev/null`, `ls $dest/ 2>/dev/null`); +) +or diag( + 'dest', `ls -l $dest/ 2>/dev/null`, + 'log_file', `cat $log_file 2>/dev/null`, + 'collector', `cat $dest/*-output 2>/dev/null`, +); # ############################################################################ # bad "find" usage in purge_samples gives From f6527695b810cf4f1570d64717362588ead15a67 Mon Sep 17 00:00:00 2001 From: Daniel Nichter Date: Tue, 23 Oct 2012 18:21:23 -0600 Subject: [PATCH 6/6] Document extra --run-time-related waits. --- bin/pt-stalk | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/pt-stalk b/bin/pt-stalk index 5a360b7d..b8a99e9c 100755 --- a/bin/pt-stalk +++ b/bin/pt-stalk @@ -1785,6 +1785,18 @@ than L<"--sleep">. It is usually not necessary to change this; if the default 30 seconds hasn't gathered enough diagnostic data, running longer is not likely to do so. In fact, in many cases a shorter collection period is appropriate. +This value is used two other times. After collecting, the collect subprocess +will wait another L<"--run-time"> seconds for its commands to finish. Some +commands can take awhile if the system is running very slowly (which can +likely be the case given that a collection was triggered). Since empty files +are deleted, the extra wait gives commands time to finish and write their +data. The value is potentially used again just before the tool exits to wait +again for any collect subprocesses to finish. In most cases this won't +happen because of the aforementioned extra wait. If it happens, the tool +will log "Waiting up to N seconds for collectors to finish..." where N is +three times L<"--run-time">. In both cases, after waiting, the tool kills +all of its subprocesses. + =item --sleep type: int; default: 300