mirror of
https://github.com/percona/percona-toolkit.git
synced 2025-09-18 17:58:55 +00:00
Merge lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665.
This commit is contained in:
351
bin/pt-stalk
351
bin/pt-stalk
@@ -54,6 +54,58 @@ _d () {
|
||||
# End log_warn_die package
|
||||
# ###########################################################################
|
||||
|
||||
# ###########################################################################
|
||||
# subshell package
|
||||
# This package is a copy without comments from the original. The original
|
||||
# with comments and its test file can be found in the Bazaar repository at,
|
||||
# lib/bash/subshell.sh
|
||||
# t/lib/bash/subshell.sh
|
||||
# See https://launchpad.net/percona-toolkit for more information.
|
||||
# ###########################################################################
|
||||
|
||||
|
||||
set -u
|
||||
|
||||
# Wait for this shell's background jobs to finish, polling once per second.
# Returns as soon as no job PID answers "kill -0", or after max_wait
# one-second sleeps, whichever comes first.  Does nothing (and logs
# nothing) when the shell reports no jobs.
#
# Arguments:
#   $1 - max_wait: maximum number of seconds to wait
wait_for_subshells() {
   local max_wait=$1
   if [ "$(jobs)" ]; then
      log "Waiting up to $max_wait seconds for subprocesses to finish..."
      local slept=0
      # Keep the loop variable local so it cannot clobber a caller's $pid.
      local pid
      while [ -n "$(jobs)" ]; do
         local subprocess_still_running=""
         # "jobs" may still list finished jobs, so probe each PID directly.
         for pid in $(jobs -p); do
            if kill -0 $pid >/dev/null 2>&1; then
               subprocess_still_running=1
            fi
         done
         if [ "$subprocess_still_running" ]; then
            sleep 1
            slept=$((slept + 1))
            [ $slept -ge $max_wait ] && break
         else
            break
         fi
      done
   fi
}
|
||||
|
||||
# Send the default kill signal to every background job of this shell that
# is still alive, logging each PID killed.  If the shell reports no jobs
# at all, log that all subprocesses have finished instead.
kill_all_subshells() {
   if [ "$(jobs)" ]; then
      # Keep the loop variable local so it cannot clobber a caller's $pid.
      local pid
      for pid in $(jobs -p); do
         # Only signal (and log) PIDs that are still running.
         if kill -0 $pid >/dev/null 2>&1; then
            log "Killing subprocess $pid"
            kill $pid >/dev/null 2>&1
         fi
      done
   else
      log "All subprocesses have finished"
   fi
}
|
||||
|
||||
# ###########################################################################
|
||||
# End subshell package
|
||||
# ###########################################################################
|
||||
|
||||
# ###########################################################################
|
||||
# parse_options package
|
||||
# This package is a copy without comments from the original. The original
|
||||
@@ -871,16 +923,8 @@ collect() {
|
||||
|
||||
hostname > "$d/$p-hostname"
|
||||
|
||||
local slept=0
|
||||
while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do
|
||||
sleep 1
|
||||
slept=$((slept + 1))
|
||||
done
|
||||
|
||||
for pid in $(jobs -p); do
|
||||
kill $pid >/dev/null 2>&1
|
||||
done
|
||||
|
||||
wait_for_subshells $OPT_RUN_TIME
|
||||
kill_all_subshells
|
||||
for file in "$d/$p-"*; do
|
||||
if [ -z "$(grep -v '^TS ' --max-count 1 "$file")" ]; then
|
||||
log "Removing empty file $file";
|
||||
@@ -1079,7 +1123,7 @@ sleep_ok() {
|
||||
local seconds="$1"
|
||||
local msg="${2:-""}"
|
||||
if oktorun; then
|
||||
[ "$msg" ] && info "$msg"
|
||||
[ "$msg" ] && log "$msg"
|
||||
sleep $seconds
|
||||
fi
|
||||
}
|
||||
@@ -1244,27 +1288,8 @@ stalk() {
|
||||
# we may get in sync with the collector and kill it a microsecond
|
||||
# before it kills itself, thus 3 * run-time.
|
||||
# https://bugs.launchpad.net/percona-toolkit/+bug/1070434
|
||||
if [ "$(jobs)" ]; then
|
||||
local sleep_time=$((OPT_RUN_TIME * 3))
|
||||
log "Waiting up to $sleep_time seconds for collectors to finish..."
|
||||
local slept=0
|
||||
while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do
|
||||
sleep 1
|
||||
slept=$((slept + 1))
|
||||
done
|
||||
|
||||
if [ "$(jobs)" ]; then
|
||||
for pid in $(jobs -p); do
|
||||
# This isn't an warning (we don't want exit status 1) because
|
||||
# the system may be running slowly so it's just "natural" that
|
||||
# a collector may get stuck or run really slowly.
|
||||
log "Killing collector $pid"
|
||||
kill $pid >/dev/null 2>&1
|
||||
done
|
||||
else
|
||||
log "All collectors have finished"
|
||||
fi
|
||||
fi
|
||||
wait_for_subshells $((OPT_RUN_TIME * 3))
|
||||
kill_all_subshells
|
||||
}
|
||||
|
||||
# ###########################################################################
|
||||
@@ -1333,10 +1358,8 @@ if [ "${0##*/}" = "$TOOL" ] \
|
||||
|
||||
if [ -z "$OPT_STALK" -a "$OPT_COLLECT" ]; then
|
||||
# Not stalking; do immediate collect once.
|
||||
OPT_ITERATIONS=1
|
||||
OPT_CYCLES=0
|
||||
OPT_SLEEP=0
|
||||
OPT_INTERVAL=0
|
||||
echo "[iter=$OPT_ITERATIONS] [cycle=$OPT_CYCLES] [sleep=$OPT_SLEEP] [interval=$OPT_INTERVAL]"
|
||||
fi
|
||||
|
||||
usage_or_errors "$0"
|
||||
@@ -1412,17 +1435,17 @@ fi
|
||||
|
||||
=head1 NAME
|
||||
|
||||
pt-stalk - Gather forensic data about MySQL when a problem occurs.
|
||||
pt-stalk - Collect forensic data about MySQL when problems occur.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Usage: pt-stalk [OPTIONS] [-- MYSQL OPTIONS]
|
||||
|
||||
pt-stalk watches for a trigger condition to become true, and then collects data
|
||||
to help in diagnosing problems. It is designed to run as a daemon with root
|
||||
pt-stalk waits for a trigger condition to occur, then collects data
|
||||
to help diagnose problems. The tool is designed to run as a daemon with root
|
||||
privileges, so that you can diagnose intermittent problems that you cannot
|
||||
observe directly. You can also use it to execute a custom command, or to gather
|
||||
the data on demand without waiting for the trigger to happen.
|
||||
observe directly. You can also use it to execute a custom command, or to
|
||||
collect data on demand without waiting for the trigger to occur.
|
||||
|
||||
=head1 RISKS
|
||||
|
||||
@@ -1453,16 +1476,20 @@ chance to see the system when it happens. How do you solve intermittent MySQL
|
||||
problems when you can't observe them? That's why pt-stalk exists. In addition to
|
||||
using it when there's a known problem on your servers, it is a good idea to run
|
||||
pt-stalk all the time, even when you think nothing is wrong. You will
|
||||
appreciate the data it gathers when a problem occurs, because problems such as
|
||||
MySQL lockups or spikes of activity typically leave no evidence to use in root
|
||||
appreciate the data it collects when a problem occurs, because problems such as
|
||||
MySQL lockups or spikes in activity typically leave no evidence to use in root
|
||||
cause analysis.
|
||||
|
||||
This tool does two things: it watches a server (typically MySQL) for a trigger
|
||||
to occur, and it gathers diagnostic data. To use it effectively, you need to
|
||||
define a good trigger condition. A good trigger is sensitive enough to fire
|
||||
reliably when a problem occurs, so that you don't miss a chance to solve
|
||||
problems. On the other hand, a good trigger isn't prone to false positives, so
|
||||
you don't gather information when the server is functioning normally.
|
||||
pt-stalk does two things: it watches a MySQL server and waits for a trigger
|
||||
condition to occur, and it collects diagnostic data when that trigger occurs.
|
||||
To avoid false positives caused by short-lived problems, the trigger condition
|
||||
must be true at least L<"--cycles"> times before a L<"--collect"> is triggered.
|
||||
|
||||
To use pt-stalk effectively, you need to define a good trigger. A good trigger
|
||||
is sensitive enough to fire reliably when a problem occurs, so that you don't
|
||||
miss a chance to solve problems. On the other hand, a good trigger isn't
|
||||
prone to false positives, so you don't gather information when the server
|
||||
is functioning normally.
|
||||
|
||||
The most reliable triggers for MySQL tend to be the number of connections to the
|
||||
server, and the number of queries running concurrently. These are available in
|
||||
@@ -1472,55 +1499,76 @@ Threads_running usually is. Your job, as the tool's user, is to define an
|
||||
appropriate trigger condition for the tool. Choose carefully, because the
|
||||
quality of your results will depend on the trigger you choose.
|
||||
|
||||
You can define the trigger with the L<"--function">, L<"--variable">, and
|
||||
L<"--threshold"> options, among others. Please read the documentation for
|
||||
--function to learn how to do this.
|
||||
You define the trigger with the L<"--function">, L<"--variable">,
|
||||
L<"--threshold">, and L<"--cycles"> options. The default values
|
||||
for these options define a reasonable trigger, but you should adjust
|
||||
or change them to suit your particular system and needs.
|
||||
|
||||
The pt-stalk tool, by default, simply watches MySQL repeatedly until the trigger
|
||||
becomes true. It then gathers diagnostics for a while, and sleeps afterwards for
|
||||
some time to prevent repeatedly gathering data if the condition remains true.
|
||||
In crude pseudocode, omitting some subtleties,
|
||||
By default, pt-stalk watches MySQL forever until the trigger occurs,
|
||||
then it collects diagnostic data for a while, and sleeps afterwards to avoid
|
||||
repeatedly collecting data if the trigger remains true. The general order of
|
||||
operations is:
|
||||
|
||||
while true; do
|
||||
if --variable from --function is greater than --threshold; then
|
||||
observations++
|
||||
if observations is greater than --cycles; then
|
||||
capture diagnostics for --run-time seconds
|
||||
exit if --iterations is exceeded
|
||||
sleep for --sleep seconds
|
||||
done
|
||||
done
|
||||
clean up data that's older than --retention-time
|
||||
sleep for --interval seconds
|
||||
done
|
||||
while true; do
|
||||
if --variable from --function > --threshold; then
|
||||
cycles_true++
|
||||
if cycles_true >= --cycles; then
|
||||
--notify-by-email
|
||||
if --collect; then
|
||||
if --disk-bytes-free and --disk-pct-free ok; then
|
||||
(--collect for --run-time seconds) &
|
||||
fi
|
||||
rm files in --dest older than --retention-time
|
||||
fi
|
||||
iter++
|
||||
cycles_true=0
|
||||
fi
|
||||
if iter < --iterations; then
|
||||
sleep --sleep seconds
|
||||
else
|
||||
break
|
||||
fi
|
||||
else
|
||||
if iter < --iterations; then
|
||||
sleep --interval seconds
|
||||
else
|
||||
break
|
||||
fi
|
||||
fi
|
||||
done
|
||||
rm old --dest files older than --retention-time
|
||||
if --collect processes are still running; then
|
||||
wait up to --run-time * 3 seconds
|
||||
kill any remaining --collect processes
|
||||
fi
|
||||
|
||||
The diagnostic data is written to files whose names begin with a timestamp, so
|
||||
you can distinguish samples from each other in case the tool collects data
|
||||
multiple times. The pt-sift tool is designed to help you browse and analyze the
|
||||
resulting samples of data.
|
||||
multiple times. The pt-sift tool is designed to help you browse and analyze
|
||||
the resulting data samples.
|
||||
|
||||
Although this sounds simple enough, in practice there are a number of
|
||||
subtleties, such as detecting when the disk is beginning to fill up so that the
|
||||
tool doesn't cause the server to run out of disk space. This tool handles these
|
||||
types of potential problems, so it's a good idea to use this tool instead of
|
||||
writing something from scratch and possibly experiencing some of the hazards
|
||||
this tool is designed to prevent.
|
||||
this tool is designed to avoid.
|
||||
|
||||
=head1 CONFIGURING
|
||||
|
||||
You can use standard Percona Toolkit configuration files to set commandline
|
||||
You can use standard Percona Toolkit configuration files to set command line
|
||||
options.
|
||||
|
||||
You will probably want to run the tool as a daemon and customize at least the
|
||||
diagnostic threshold. Here's a sample configuration file for triggering when
|
||||
L<"--threshold">. Here's a sample configuration file for triggering when
|
||||
there are more than 20 queries running at once:
|
||||
|
||||
daemonize
|
||||
threshold=20
|
||||
|
||||
If you're not running the tool as it's designed (as a root user, daemonized)
|
||||
then you'll need to set several options, such as L<"--dest">, to locations that
|
||||
are writable by non-root users.
|
||||
If you don't run the tool as root, then you will need to specify several options,
|
||||
such as L<"--pid">, L<"--log">, and L<"--dest">, else the tool will probably
|
||||
fail to start.
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
@@ -1530,8 +1578,8 @@ are writable by non-root users.
|
||||
|
||||
default: yes; negatable: yes
|
||||
|
||||
Collect system information. You can negate this option to make the tool watch
|
||||
the system but not actually gather any diagnostic data.
|
||||
Collect diagnostic data when the trigger occurs. Specify C<--no-collect>
|
||||
to make the tool watch the system but not collect data.
|
||||
|
||||
See also L<"--stalk">.
|
||||
|
||||
@@ -1581,9 +1629,8 @@ first option on the command line.
|
||||
|
||||
type: int; default: 5
|
||||
|
||||
The number of times the trigger condition must be true before collecting data.
|
||||
This helps prevent false positives, and makes the trigger condition less likely
|
||||
to fire when the problem recovers quickly.
|
||||
How many times L<"--variable"> must be greater than L<"--threshold"> before triggering L<"--collect">. This helps prevent false positives, and makes
|
||||
the trigger condition less likely to fire when the problem recovers quickly.
|
||||
|
||||
=item --daemonize
|
||||
|
||||
@@ -1594,14 +1641,15 @@ its output as specified in --log.
|
||||
|
||||
type: string; default: /var/lib/pt-stalk
|
||||
|
||||
Where to store the diagnostic data. Each time the tool collects data, it writes
|
||||
to a new set of files, which are named with the current system timestamp.
|
||||
Where to save diagnostic data from L<"--collect">. Each time the tool
|
||||
collects data, it writes to a new set of files, which are named with the
|
||||
current system timestamp.
|
||||
|
||||
=item --disk-bytes-free
|
||||
|
||||
type: size; default: 100M
|
||||
|
||||
Don't collect data if the disk has less than this much free space.
|
||||
Do not L<"--collect"> if the disk has less than this much free space.
|
||||
This prevents the tool from filling up the disk with diagnostic data.
|
||||
|
||||
If the L<"--dest"> directory contains a previously captured sample of data,
|
||||
@@ -1618,7 +1666,7 @@ Valid size value suffixes are k, M, G, and T.
|
||||
|
||||
type: int; default: 5
|
||||
|
||||
Don't collect data if the disk has less than this percent free space.
|
||||
Do not L<"--collect"> if the disk has less than this percent free space.
|
||||
This prevents the tool from filling up the disk with diagnostic data.
|
||||
|
||||
This option works similarly to L<"--disk-bytes-free"> but specifies a
|
||||
@@ -1630,57 +1678,57 @@ margins are satisfied.
|
||||
|
||||
type: string; default: status
|
||||
|
||||
Specifies what to watch for a diagnostic trigger. The default value watches
|
||||
SHOW GLOBAL STATUS, but you can also watch SHOW PROCESSLIST or supply a plugin
|
||||
file with your own custom code. This function supplies the value of
|
||||
What to watch for the trigger. The default value watches
|
||||
C<SHOW GLOBAL STATUS>, but you can also watch C<SHOW PROCESSLIST> and specify
|
||||
a file with your own custom code. This function supplies the value of
|
||||
L<"--variable">, which is then compared against L<"--threshold"> to see if the
|
||||
trigger condition is met. Additional options may be required as well; see
|
||||
below. Possible values:
|
||||
trigger condition is met. Additional options may be required as
|
||||
well; see below. Possible values are:
|
||||
|
||||
=over
|
||||
|
||||
=item * status
|
||||
|
||||
This value specifies that the source of data for the diagnostic trigger is SHOW
|
||||
GLOBAL STATUS. The value of L<"--variable"> then defines which status counter
|
||||
is the trigger.
|
||||
Watch C<SHOW GLOBAL STATUS> for the trigger. The value of
|
||||
L<"--variable"> then defines which status counter is the trigger.
|
||||
|
||||
=item * processlist
|
||||
|
||||
This value specifies that the data for the diagnostic trigger comes from SHOW
|
||||
FULL PROCESSLIST. The trigger value is the count of processes whose
|
||||
L<"--variable"> column matches the L<"--match"> option. For example, to trigger
|
||||
when more than 10 processes are in the "statistics" state, use the following
|
||||
options:
|
||||
Watch C<SHOW FULL PROCESSLIST> for the trigger. The trigger
|
||||
value is the count of processes whose L<"--variable"> column matches the
|
||||
L<"--match"> option. For example, to trigger L<"--collect"> when more than
|
||||
10 processes are in the "statistics" state, specify:
|
||||
|
||||
--function processlist --variable State \
|
||||
--match statistics --threshold 10
|
||||
--function processlist \
|
||||
--variable State \
|
||||
--match statistics \
|
||||
--threshold 10
|
||||
|
||||
=back
|
||||
|
||||
In addition, you can specify a file that contains your custom trigger function,
|
||||
written in Unix shell script. This can be a wrapper that executes anything you
|
||||
wish. If the argument to --function is a file, then it takes precedence over
|
||||
builtin functions, so if there is a file in the working directory named "status"
|
||||
or "processlist" then the tool will use that file as a plugin, even though those
|
||||
are otherwise recognized as reserved words for this option.
|
||||
In addition, you can specify a file that contains your custom trigger
|
||||
function, written in Unix shell script. This can be a wrapper that executes
|
||||
anything you wish. If the argument to L<"--function"> is a file, then it
|
||||
takes precedence over built-in functions, so if there is a file in the working
|
||||
directory named "status" or "processlist" then the tool will use that file
|
||||
even though those are valid built-in values.
|
||||
|
||||
The plugin file works by providing a function called C<trg_plugin>, and the tool
|
||||
simply sources the file and executes the function. For example, the function
|
||||
might look like the following:
|
||||
The file works by providing a function called C<trg_plugin>, and the tool
|
||||
simply sources the file and executes the function. For example, the file
|
||||
might contain:
|
||||
|
||||
trg_plugin() {
|
||||
mysql $EXT_ARGV -e "SHOW ENGINE INNODB STATUS" \
|
||||
| grep -c "has waited at"
|
||||
}
|
||||
|
||||
This snippet will count the number of mutex waits inside of InnoDB. It
|
||||
This snippet will count the number of mutex waits inside InnoDB. It
|
||||
illustrates the general principle: the function must output a number, which is
|
||||
then compared to the threshold as usual. The $EXT_ARGV variable contains the
|
||||
MySQL options mentioned in the L<"SYNOPSIS"> above.
|
||||
then compared to L<"--threshold"> as usual. The C<$EXT_ARGV> variable
|
||||
contains the MySQL options mentioned in the L<"SYNOPSIS"> above.
|
||||
|
||||
The plugin should not alter the tool's existing global variables. Prefix any
|
||||
plugin-specific global variables with "PLUGIN_" or make them local.
|
||||
The file should not alter the tool's existing global variables. Prefix any
|
||||
file-specific global variables with "PLUGIN_" or make them local.
|
||||
|
||||
=item --help
|
||||
|
||||
@@ -1690,15 +1738,17 @@ Print help and exit.
|
||||
|
||||
type: int; default: 1
|
||||
|
||||
Interval between checks for the diagnostic trigger.
|
||||
How often to check if the trigger is true, in seconds.
|
||||
|
||||
=item --iterations
|
||||
|
||||
type: int
|
||||
|
||||
Exit after collecting diagnostics this many times. By default, the tool
|
||||
will continue to watch the server forever, but this is useful for scenarios
|
||||
where you want to capture once and then exit, for example.
|
||||
How many times to L<"--collect"> diagnostic data. By default, the tool
|
||||
runs forever and collects data every time the trigger occurs.
|
||||
Specify L<"--iterations"> to collect data a limited number of times.
|
||||
This option is also useful with C<--no-stalk> to collect data once and
|
||||
exit, for example.
|
||||
|
||||
=item --log
|
||||
|
||||
@@ -1710,14 +1760,14 @@ Print all output to this file when daemonized.
|
||||
|
||||
type: string
|
||||
|
||||
The pattern to use when watching SHOW PROCESSLIST. See the documentation for
|
||||
L<"--function"> for details.
|
||||
The pattern to use when watching SHOW PROCESSLIST. See L<"--function">
|
||||
for details.
|
||||
|
||||
=item --notify-by-email
|
||||
|
||||
type: string
|
||||
|
||||
Send mail to this list of addresses when data is collected.
|
||||
Send an email to these addresses for every L<"--collect">.
|
||||
|
||||
=item --pid
|
||||
|
||||
@@ -1746,8 +1796,8 @@ Called before stalking.
|
||||
|
||||
=item before_collect
|
||||
|
||||
Called when the stalk condition is triggered, before running a collector
|
||||
process as a backgrounded subshell.
|
||||
Called when the trigger occurs, before running a L<"--collect">
|
||||
subprocess in the background.
|
||||
|
||||
=item after_collect
|
||||
|
||||
@@ -1771,10 +1821,10 @@ this hook is only called if L<"--iterations"> is specified.
|
||||
|
||||
=back
|
||||
|
||||
For example, a very simple plugin that touches a file when a collector
|
||||
process is triggered:
|
||||
For example, a very simple plugin that touches a file when L<"--collect">
|
||||
is triggered:
|
||||
|
||||
before_colllect() {
|
||||
before_collect() {
|
||||
touch /tmp/foo
|
||||
}
|
||||
|
||||
@@ -1797,9 +1847,9 @@ be set to indicate why the tool was stopped.
|
||||
|
||||
type: string
|
||||
|
||||
The filename prefix for diagnostic samples. By default, samples have a timestamp
|
||||
prefix based on the current local time, such as 2011_12_06_14_02_02, which is
|
||||
December 6, 2011 at 14:02:02.
|
||||
The filename prefix for diagnostic samples. By default, all files created
|
||||
by the same L<"--collect"> instance have a timestamp prefix based on the current
|
||||
local time, like C<2011_12_06_14_02_02>, which is December 6, 2011 at 14:02:02.
|
||||
|
||||
=item --retention-time
|
||||
|
||||
@@ -1812,10 +1862,12 @@ purged.
|
||||
|
||||
type: int; default: 30
|
||||
|
||||
How long the tool will collect data when it triggers. This should not be longer
|
||||
than L<"--sleep">. It is usually not necessary to change this; if the default 30
|
||||
seconds hasn't gathered enough diagnostic data, running longer is not likely to
|
||||
do so. In fact, in many cases a shorter collection period is appropriate.
|
||||
How long to L<"--collect"> diagnostic data when the trigger occurs.
|
||||
The value is in seconds and should not be longer than L<"--sleep">. It is
|
||||
usually not necessary to change this; if the default 30 seconds doesn't
|
||||
collect enough data, running longer is not likely to help because the system
|
||||
or MySQL server is probably too busy to respond. In fact, in many cases a
|
||||
shorter collection period is appropriate.
|
||||
|
||||
This value is used two other times. After collecting, the collect subprocess
|
||||
will wait another L<"--run-time"> seconds for its commands to finish. Some
|
||||
@@ -1825,7 +1877,7 @@ are deleted, the extra wait gives commands time to finish and write their
|
||||
data. The value is potentially used again just before the tool exits to wait
|
||||
again for any collect subprocesses to finish. In most cases this won't
|
||||
happen because of the aforementioned extra wait. If it happens, the tool
|
||||
will log "Waiting up to N seconds for collectors to finish..." where N is
|
||||
will log "Waiting up to N seconds for subprocesses to finish..." where N is
|
||||
three times L<"--run-time">. In both cases, after waiting, the tool kills
|
||||
all of its subprocesses.
|
||||
|
||||
@@ -1833,8 +1885,8 @@ all of its subprocesses.
|
||||
|
||||
type: int; default: 300
|
||||
|
||||
How long to sleep after collecting data. This prevents the tool from triggering
|
||||
continuously, which might be a problem if the collection process is intrusive.
|
||||
How long to sleep after L<"--collect">. This prevents the tool
|
||||
from triggering continuously, which might be a problem if the collection process is intrusive.
|
||||
It also prevents filling up the disk or gathering too much data to analyze
|
||||
reasonably.
|
||||
|
||||
@@ -1842,14 +1894,16 @@ reasonably.
|
||||
|
||||
default: yes; negatable: yes
|
||||
|
||||
Watch the server and wait for the trigger to occur. You can negate this option
|
||||
to make the tool immediately gather any diagnostic data once and exit. This is
|
||||
useful if a problem is already happening, but pt-stalk is not running, so
|
||||
you only want to collect diagnostic data.
|
||||
Watch the server and wait for the trigger to occur. Specify C<--no-stalk>
|
||||
to collect diagnostic data immediately, that is, without waiting for the
|
||||
trigger to occur. You probably also want to specify values for
|
||||
L<"--interval">, L<"--iterations">, and L<"--sleep">. For example, to
|
||||
immediately collect data for 1 minute then exit, specify:
|
||||
|
||||
If this option is negate, L<"--daemonize">, L<"--log">, L<"--pid">, and other
|
||||
stalking-related options have no effect; the tool simply collects diagnostic
|
||||
data and exits. Safeguard options, like L<"--disk-bytes-free"> and
|
||||
--no-stalk --run-time 60 --iterations 1
|
||||
|
||||
L<"--cycles">, L<"--daemonize">, L<"--log"> and L<"--pid"> have no effect
|
||||
with C<--no-stalk>. Safeguard options, like L<"--disk-bytes-free"> and
|
||||
L<"--disk-pct-free">, are still respected.
|
||||
|
||||
See also L<"--collect">.
|
||||
@@ -1858,14 +1912,18 @@ See also L<"--collect">.
|
||||
|
||||
type: int; default: 25
|
||||
|
||||
The threshold at which the diagnostic trigger should fire. See L<"--function">
|
||||
for details.
|
||||
The maximum acceptable value for L<"--variable">. L<"--collect"> is
|
||||
triggered when the value of L<"--variable"> is greater than L<"--threshold">
|
||||
for L<"--cycles"> many times. Currently, there is no way to define a lower
|
||||
threshold to check for a L<"--variable"> value that is too low.
|
||||
|
||||
See also L<"--function">.
|
||||
|
||||
=item --variable
|
||||
|
||||
type: string; default: Threads_running
|
||||
|
||||
The variable to compare against the threshold. See L<"--function"> for details.
|
||||
The variable to compare against L<"--threshold">. See also L<"--function">.
|
||||
|
||||
=item --verbose
|
||||
|
||||
@@ -1995,7 +2053,8 @@ Replace C<TOOL> with the name of any tool.
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
Baron Schwartz, Justin Swanhart, Fernando Ipar, and Daniel Nichter
|
||||
Baron Schwartz, Justin Swanhart, Fernando Ipar, Daniel Nichter,
|
||||
and Brian Fraser.
|
||||
|
||||
=head1 ABOUT PERCONA TOOLKIT
|
||||
|
||||
|
@@ -22,7 +22,7 @@
|
||||
# collect collects system information.
|
||||
|
||||
# XXX
|
||||
# THIS LIB REQUIRES log_warn_die.sh, safeguards.sh, and alt_cmds.sh!
|
||||
# THIS LIB REQUIRES log_warn_die, safeguards, alt_cmds, and subshell!
|
||||
# XXX
|
||||
|
||||
set -u
|
||||
@@ -289,16 +289,8 @@ collect() {
|
||||
# it may leave an empty file. But first wait another --run-time
|
||||
# seconds for any slow process to finish:
|
||||
# https://bugs.launchpad.net/percona-toolkit/+bug/1047701
|
||||
local slept=0
|
||||
while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do
|
||||
sleep 1
|
||||
slept=$((slept + 1))
|
||||
done
|
||||
|
||||
for pid in $(jobs -p); do
|
||||
kill $pid >/dev/null 2>&1
|
||||
done
|
||||
|
||||
wait_for_subshells $OPT_RUN_TIME
|
||||
kill_all_subshells
|
||||
for file in "$d/$p-"*; do
|
||||
# If there's not at least 1 line that's not a TS,
|
||||
# then the file is empty.
|
||||
|
66
lib/bash/subshell.sh
Normal file
66
lib/bash/subshell.sh
Normal file
@@ -0,0 +1,66 @@
|
||||
# This program is copyright 2013 Percona Ireland Ltd.
|
||||
# Feedback and improvements are welcome.
|
||||
#
|
||||
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
|
||||
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free Software
|
||||
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
|
||||
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
|
||||
# licenses.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along with
|
||||
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
|
||||
# Place, Suite 330, Boston, MA 02111-1307 USA.
|
||||
# ###########################################################################
|
||||
# subshell package
|
||||
# ###########################################################################
|
||||
|
||||
# Package: subshell
|
||||
|
||||
set -u
|
||||
|
||||
# Wait for this shell's background jobs to finish, polling once per second.
# Returns as soon as no job PID answers "kill -0", or after max_wait
# one-second sleeps, whichever comes first.  Does nothing (and logs
# nothing) when the shell reports no jobs.
#
# Arguments:
#   $1 - max_wait: maximum number of seconds to wait
wait_for_subshells() {
   local max_wait=$1
   if [ "$(jobs)" ]; then
      log "Waiting up to $max_wait seconds for subprocesses to finish..."
      local slept=0
      # Keep the loop variable local so it cannot clobber a caller's $pid.
      local pid
      while [ -n "$(jobs)" ]; do
         local subprocess_still_running=""
         # "jobs" may still list finished jobs, so probe each PID directly.
         for pid in $(jobs -p); do
            if kill -0 $pid >/dev/null 2>&1; then
               subprocess_still_running=1
            fi
         done
         if [ "$subprocess_still_running" ]; then
            sleep 1
            slept=$((slept + 1))
            [ $slept -ge $max_wait ] && break
         else
            break
         fi
      done
   fi
}
|
||||
|
||||
# Send the default kill signal to every background job of this shell that
# is still alive, logging each PID killed.  If the shell reports no jobs
# at all, log that all subprocesses have finished instead.
kill_all_subshells() {
   if [ "$(jobs)" ]; then
      # Keep the loop variable local so it cannot clobber a caller's $pid.
      local pid
      for pid in $(jobs -p); do
         # Only signal (and log) PIDs that are still running.
         if kill -0 $pid >/dev/null 2>&1; then
            # This isn't a warning (we don't want exit status 1) because
            # the system may be running slowly so it's just "natural" that
            # a collector may get stuck or run really slowly.
            log "Killing subprocess $pid"
            kill $pid >/dev/null 2>&1
         fi
      done
   else
      log "All subprocesses have finished"
   fi
}
|
||||
|
||||
# ###########################################################################
|
||||
# End subshell package
|
||||
# ###########################################################################
|
@@ -10,6 +10,7 @@ TOOL="pt-stalk"
|
||||
mkdir "$PT_TMPDIR/collect" 2>/dev/null
|
||||
|
||||
source "$LIB_DIR/log_warn_die.sh"
|
||||
source "$LIB_DIR/subshell.sh"
|
||||
source "$LIB_DIR/parse_options.sh"
|
||||
source "$LIB_DIR/safeguards.sh"
|
||||
source "$LIB_DIR/alt_cmds.sh"
|
||||
|
@@ -317,7 +317,11 @@ diag(`cp $ENV{HOME}/.pt-stalk.conf.original $ENV{HOME}/.pt-stalk.conf 2>/dev/nul
|
||||
|
||||
cleanup();
|
||||
|
||||
$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
# As of 2.2, --no-stalk means just that: don't stalk, just collect, so
|
||||
# we have to specify --iterations=1 else the tool will continue to run,
|
||||
# whereas in 2.1 --no-stalk implied/forced "collect once and exit".
|
||||
|
||||
$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file --iterations 1 -- --defaults-file=$cnf >$log_file 2>&1");
|
||||
|
||||
PerconaTest::wait_until(sub { !-f $pid_file });
|
||||
|
||||
|
Reference in New Issue
Block a user