mirror of
https://github.com/percona/percona-toolkit.git
synced 2025-12-24 02:01:42 +08:00
Merge pt-stalk-iter-1-bug-1070434.
This commit is contained in:
43
bin/pt-stalk
43
bin/pt-stalk
@@ -1210,6 +1210,37 @@ stalk() {
|
||||
purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME"
|
||||
fi
|
||||
done
|
||||
|
||||
# Before exiting, the last collector may still be running.
|
||||
# Wait for it to finish in case the tool is part of a script,
|
||||
# or part of a test, so the caller has access to the collected
|
||||
# data when the tool exists. collect() waits an additional
|
||||
# --run-time seconds for itself to complete, which means we
|
||||
# have to wait for 2 * run-time like it plus some overhead else
|
||||
# we may get in sync with the collector and kill it a microsecond
|
||||
# before it kills itself, thus 3 * run-time.
|
||||
# https://bugs.launchpad.net/percona-toolkit/+bug/1070434
|
||||
if [ "$(jobs)" ]; then
|
||||
local sleep_time=$((OPT_RUN_TIME * 3))
|
||||
log "Waiting up to $sleep_time seconds for collectors to finish..."
|
||||
local slept=0
|
||||
while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do
|
||||
sleep 1
|
||||
slept=$((slept + 1))
|
||||
done
|
||||
|
||||
if [ "$(jobs)" ]; then
|
||||
for pid in $(jobs -p); do
|
||||
# This isn't an warning (we don't want exit status 1) because
|
||||
# the system may be running slowly so it's just "natural" that
|
||||
# a collector may get stuck or run really slowly.
|
||||
log "Killing collector $pid"
|
||||
kill $pid >/dev/null 2>&1
|
||||
done
|
||||
else
|
||||
log "All collectors have finished"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# ###########################################################################
|
||||
@@ -1754,6 +1785,18 @@ than L<"--sleep">. It is usually not necessary to change this; if the default 30
|
||||
seconds hasn't gathered enough diagnostic data, running longer is not likely to
|
||||
do so. In fact, in many cases a shorter collection period is appropriate.
|
||||
|
||||
This value is used two other times. After collecting, the collect subprocess
|
||||
will wait another L<"--run-time"> seconds for its commands to finish. Some
|
||||
commands can take awhile if the system is running very slowly (which can
|
||||
likely be the case given that a collection was triggered). Since empty files
|
||||
are deleted, the extra wait gives commands time to finish and write their
|
||||
data. The value is potentially used again just before the tool exits to wait
|
||||
again for any collect subprocesses to finish. In most cases this won't
|
||||
happen because of the aforementioned extra wait. If it happens, the tool
|
||||
will log "Waiting up to N seconds for collectors to finish..." where N is
|
||||
three times L<"--run-time">. In both cases, after waiting, the tool kills
|
||||
all of its subprocesses.
|
||||
|
||||
=item --sleep
|
||||
|
||||
type: int; default: 300
|
||||
|
||||
@@ -86,20 +86,21 @@ ok(
|
||||
"Creates --dest (collect) dir"
|
||||
);
|
||||
|
||||
chomp($pid = `cat $pid_file`);
|
||||
chomp($pid = `cat $pid_file 2>/dev/null`);
|
||||
$retval = system("kill -0 $pid");
|
||||
is(
|
||||
$retval >> 0,
|
||||
0,
|
||||
"pt-stalk is running ($pid)"
|
||||
"pt-stalk is running"
|
||||
);
|
||||
|
||||
$output = `cat $log_file`;
|
||||
PerconaTest::wait_for_sh("grep -q 'Check results' $log_file >/dev/null");
|
||||
$output = `cat $log_file 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/,
|
||||
"Check results logged"
|
||||
);
|
||||
) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`);
|
||||
|
||||
$retval = system("kill $pid 2>/dev/null");
|
||||
is(
|
||||
@@ -108,19 +109,19 @@ is(
|
||||
"Killed pt-stalk"
|
||||
);
|
||||
|
||||
sleep 1;
|
||||
PerconaTest::wait_until(sub { !-f $pid_file });
|
||||
|
||||
ok(
|
||||
! -f $pid_file,
|
||||
"Removes PID file"
|
||||
);
|
||||
|
||||
$output = `cat $log_file`;
|
||||
$output = `cat $log_file 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/Caught signal, exiting/,
|
||||
"Caught signal logged"
|
||||
);
|
||||
) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`);
|
||||
|
||||
# ###########################################################################
|
||||
# Test collect.
|
||||
@@ -134,42 +135,61 @@ diag(`rm $dest/* 2>/dev/null`);
|
||||
my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'");
|
||||
my $threshold = $uptime + 2;
|
||||
|
||||
$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
|
||||
sleep 3;
|
||||
PerconaTest::wait_until(sub { !-f $pid_file });
|
||||
|
||||
$output = `cat $dest/*-trigger`;
|
||||
$output = `cat $dest/*-trigger 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/,
|
||||
"Collect triggered"
|
||||
)
|
||||
or diag(
|
||||
'output', $output,
|
||||
'log file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
'dest', `ls -l $dest/ 2>/dev/null`,
|
||||
'df', `cat $dest/*-df 2>/dev/null`,
|
||||
);
|
||||
|
||||
# There is some nondeterminism here. Sometimes it'll run for 2 samples because
|
||||
# the samples may not be precisely 1 second apart.
|
||||
chomp($output = `cat $dest/*-df | grep -c '^TS'`);
|
||||
chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`);
|
||||
ok(
|
||||
$output >= 1 && $output <= 3,
|
||||
"Collect ran for --run-time"
|
||||
) or diag("Run time: $output");
|
||||
)
|
||||
or diag(
|
||||
'output', $output,
|
||||
'log file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
'dest', `ls -l $dest/ 2>/dev/null`,
|
||||
'df', `cat $dest/*-df 2>/dev/null`,
|
||||
);
|
||||
|
||||
ok(
|
||||
PerconaTest::not_running("pt-stalk --iterations 1"),
|
||||
"pt-stalk is not running"
|
||||
);
|
||||
|
||||
$output = `cat $dest/*-trigger`;
|
||||
$output = `cat $dest/*-trigger 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/,
|
||||
"Trigger file logs how pt-stalk was ran"
|
||||
);
|
||||
|
||||
chomp($output = `cat $log_file | grep 'Collector PID'`);
|
||||
chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`);
|
||||
like(
|
||||
$output,
|
||||
qr/Collector PID \d+/,
|
||||
"Collector PID logged"
|
||||
)
|
||||
or diag(
|
||||
'output', $output,
|
||||
'log file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
);
|
||||
|
||||
# ###########################################################################
|
||||
@@ -184,9 +204,9 @@ $threshold = $uptime + 2;
|
||||
|
||||
$retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
|
||||
sleep 2;
|
||||
PerconaTest::wait_until(sub { !-f $pid_file });
|
||||
|
||||
$output = `cat $log_file`;
|
||||
$output = `cat $log_file 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/Collect triggered/,
|
||||
@@ -196,7 +216,7 @@ like(
|
||||
ok(
|
||||
! -f "$dest/*",
|
||||
"No files collected"
|
||||
);
|
||||
) or diag(`ls -l $dest/ 2>/dev/null`);
|
||||
|
||||
ok(
|
||||
PerconaTest::not_running("pt-stalk --no-collect"),
|
||||
@@ -221,7 +241,7 @@ is(
|
||||
"Killed pt-stalk"
|
||||
);
|
||||
|
||||
$output = `cat $log_file`;
|
||||
$output = `cat $log_file 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/Check results: Aborted_connects=|variable=Aborted_connects/,
|
||||
@@ -238,23 +258,32 @@ diag(`rm $pid_file 2>/dev/null`);
|
||||
diag(`rm $log_file 2>/dev/null`);
|
||||
diag(`rm $dest/* 2>/dev/null`);
|
||||
|
||||
$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
|
||||
PerconaTest::wait_for_files("$dest/nostalk-trigger");
|
||||
$output = `cat $dest/nostalk-trigger`;
|
||||
PerconaTest::wait_until(sub { !-f $pid_file });
|
||||
|
||||
$output = `cat $dest/nostalk-trigger 2>/dev/null`;
|
||||
like(
|
||||
$output,
|
||||
qr/Not stalking/,
|
||||
"Not stalking, collect triggered"
|
||||
)
|
||||
or diag(
|
||||
'dest', `ls -l $dest/ 2>/dev/null`,
|
||||
'log_file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
);
|
||||
|
||||
PerconaTest::wait_for_files("$dest/nostalk-hostname");
|
||||
PerconaTest::wait_for_sh("test \$(grep -c '^TS' $dest/nostalk-df) -ge 2");
|
||||
chomp($output = `grep -c '^TS' $dest/nostalk-df`);
|
||||
chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`);
|
||||
is(
|
||||
$output,
|
||||
2,
|
||||
"Not stalking, collect ran for --run-time"
|
||||
)
|
||||
or diag(
|
||||
'dest', `ls -l $dest/ 2>/dev/null`,
|
||||
'log_file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
);
|
||||
|
||||
my $vmstat = `which vmstat 2>/dev/null`;
|
||||
@@ -269,14 +298,24 @@ SKIP: {
|
||||
};
|
||||
|
||||
is(
|
||||
`cat $dest/nostalk-hostname`,
|
||||
`cat $dest/nostalk-hostname 2>/dev/null`,
|
||||
`hostname`,
|
||||
"Not stalking, collect gathered data"
|
||||
)
|
||||
or diag(
|
||||
'dest', `ls -l $dest/ 2>/dev/null`,
|
||||
'log_file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
);
|
||||
|
||||
ok(
|
||||
PerconaTest::not_running("pt-stalk --no-stalk"),
|
||||
"Not stalking, pt-stalk is not running"
|
||||
)
|
||||
or diag(
|
||||
'dest', `ls -l $dest/ 2>/dev/null`,
|
||||
'log_file', `cat $log_file 2>/dev/null`,
|
||||
'collector', `cat $dest/*-output 2>/dev/null`,
|
||||
);
|
||||
|
||||
# ############################################################################
|
||||
|
||||
Reference in New Issue
Block a user