Merge pt-stalk-iter-1-bug-1070434.

This commit is contained in:
Daniel Nichter
2012-10-23 18:25:37 -06:00
2 changed files with 107 additions and 25 deletions

View File

@@ -1210,6 +1210,37 @@ stalk() {
purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME"
fi
done
# Before exiting, the last collector may still be running.
# Wait for it to finish in case the tool is part of a script,
# or part of a test, so the caller has access to the collected
# data when the tool exists. collect() waits an additional
# --run-time seconds for itself to complete, which means we
# have to wait for 2 * run-time like it plus some overhead else
# we may get in sync with the collector and kill it a microsecond
# before it kills itself, thus 3 * run-time.
# https://bugs.launchpad.net/percona-toolkit/+bug/1070434
if [ "$(jobs)" ]; then
local sleep_time=$((OPT_RUN_TIME * 3))
log "Waiting up to $sleep_time seconds for collectors to finish..."
local slept=0
while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do
sleep 1
slept=$((slept + 1))
done
if [ "$(jobs)" ]; then
for pid in $(jobs -p); do
# This isn't an warning (we don't want exit status 1) because
# the system may be running slowly so it's just "natural" that
# a collector may get stuck or run really slowly.
log "Killing collector $pid"
kill $pid >/dev/null 2>&1
done
else
log "All collectors have finished"
fi
fi
}
# ###########################################################################
@@ -1754,6 +1785,18 @@ than L<"--sleep">. It is usually not necessary to change this; if the default 30
seconds hasn't gathered enough diagnostic data, running longer is not likely to
do so. In fact, in many cases a shorter collection period is appropriate.
This value is used two other times. After collecting, the collect subprocess
will wait another L<"--run-time"> seconds for its commands to finish. Some
commands can take awhile if the system is running very slowly (which can
likely be the case given that a collection was triggered). Since empty files
are deleted, the extra wait gives commands time to finish and write their
data. The value is potentially used again just before the tool exits to wait
again for any collect subprocesses to finish. In most cases this won't
happen because of the aforementioned extra wait. If it happens, the tool
will log "Waiting up to N seconds for collectors to finish..." where N is
three times L<"--run-time">. In both cases, after waiting, the tool kills
all of its subprocesses.
=item --sleep
type: int; default: 300

View File

@@ -86,20 +86,21 @@ ok(
"Creates --dest (collect) dir"
);
chomp($pid = `cat $pid_file`);
chomp($pid = `cat $pid_file 2>/dev/null`);
$retval = system("kill -0 $pid");
is(
$retval >> 0,
0,
"pt-stalk is running ($pid)"
"pt-stalk is running"
);
$output = `cat $log_file`;
PerconaTest::wait_for_sh("grep -q 'Check results' $log_file >/dev/null");
$output = `cat $log_file 2>/dev/null`;
like(
$output,
qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/,
"Check results logged"
);
) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`);
$retval = system("kill $pid 2>/dev/null");
is(
@@ -108,19 +109,19 @@ is(
"Killed pt-stalk"
);
sleep 1;
PerconaTest::wait_until(sub { !-f $pid_file });
ok(
! -f $pid_file,
"Removes PID file"
);
$output = `cat $log_file`;
$output = `cat $log_file 2>/dev/null`;
like(
$output,
qr/Caught signal, exiting/,
"Caught signal logged"
);
) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`);
# ###########################################################################
# Test collect.
@@ -134,42 +135,61 @@ diag(`rm $dest/* 2>/dev/null`);
my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'");
my $threshold = $uptime + 2;
$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
sleep 3;
PerconaTest::wait_until(sub { !-f $pid_file });
$output = `cat $dest/*-trigger`;
$output = `cat $dest/*-trigger 2>/dev/null`;
like(
$output,
qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/,
"Collect triggered"
)
or diag(
'output', $output,
'log file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
'dest', `ls -l $dest/ 2>/dev/null`,
'df', `cat $dest/*-df 2>/dev/null`,
);
# There is some nondeterminism here. Sometimes it'll run for 2 samples because
# the samples may not be precisely 1 second apart.
chomp($output = `cat $dest/*-df | grep -c '^TS'`);
chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`);
ok(
$output >= 1 && $output <= 3,
"Collect ran for --run-time"
) or diag("Run time: $output");
)
or diag(
'output', $output,
'log file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
'dest', `ls -l $dest/ 2>/dev/null`,
'df', `cat $dest/*-df 2>/dev/null`,
);
ok(
PerconaTest::not_running("pt-stalk --iterations 1"),
"pt-stalk is not running"
);
$output = `cat $dest/*-trigger`;
$output = `cat $dest/*-trigger 2>/dev/null`;
like(
$output,
qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/,
"Trigger file logs how pt-stalk was ran"
);
chomp($output = `cat $log_file | grep 'Collector PID'`);
chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`);
like(
$output,
qr/Collector PID \d+/,
"Collector PID logged"
)
or diag(
'output', $output,
'log file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
);
# ###########################################################################
@@ -184,9 +204,9 @@ $threshold = $uptime + 2;
$retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
sleep 2;
PerconaTest::wait_until(sub { !-f $pid_file });
$output = `cat $log_file`;
$output = `cat $log_file 2>/dev/null`;
like(
$output,
qr/Collect triggered/,
@@ -196,7 +216,7 @@ like(
ok(
! -f "$dest/*",
"No files collected"
);
) or diag(`ls -l $dest/ 2>/dev/null`);
ok(
PerconaTest::not_running("pt-stalk --no-collect"),
@@ -221,7 +241,7 @@ is(
"Killed pt-stalk"
);
$output = `cat $log_file`;
$output = `cat $log_file 2>/dev/null`;
like(
$output,
qr/Check results: Aborted_connects=|variable=Aborted_connects/,
@@ -238,23 +258,32 @@ diag(`rm $pid_file 2>/dev/null`);
diag(`rm $log_file 2>/dev/null`);
diag(`rm $dest/* 2>/dev/null`);
$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk -- --defaults-file=$cnf >$log_file 2>&1");
$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
PerconaTest::wait_for_files("$dest/nostalk-trigger");
$output = `cat $dest/nostalk-trigger`;
PerconaTest::wait_until(sub { !-f $pid_file });
$output = `cat $dest/nostalk-trigger 2>/dev/null`;
like(
$output,
qr/Not stalking/,
"Not stalking, collect triggered"
)
or diag(
'dest', `ls -l $dest/ 2>/dev/null`,
'log_file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
);
PerconaTest::wait_for_files("$dest/nostalk-hostname");
PerconaTest::wait_for_sh("test \$(grep -c '^TS' $dest/nostalk-df) -ge 2");
chomp($output = `grep -c '^TS' $dest/nostalk-df`);
chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`);
is(
$output,
2,
"Not stalking, collect ran for --run-time"
)
or diag(
'dest', `ls -l $dest/ 2>/dev/null`,
'log_file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
);
my $vmstat = `which vmstat 2>/dev/null`;
@@ -269,14 +298,24 @@ SKIP: {
};
is(
`cat $dest/nostalk-hostname`,
`cat $dest/nostalk-hostname 2>/dev/null`,
`hostname`,
"Not stalking, collect gathered data"
)
or diag(
'dest', `ls -l $dest/ 2>/dev/null`,
'log_file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
);
ok(
PerconaTest::not_running("pt-stalk --no-stalk"),
"Not stalking, pt-stalk is not running"
)
or diag(
'dest', `ls -l $dest/ 2>/dev/null`,
'log_file', `cat $log_file 2>/dev/null`,
'collector', `cat $dest/*-output 2>/dev/null`,
);
# ############################################################################