Merge lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665.

2025-09-18 17:58:55 +00:00 · 2013-03-04 18:28:37 -07:00
parent 45813e082d 84adf451de
commit ba315c3353
5 changed files with 280 additions and 158 deletions
--- a/bin/pt-stalk
+++ b/bin/pt-stalk
@@ -54,6 +54,58 @@ _d () {
 # End log_warn_die package
 # ###########################################################################

+# ###########################################################################
+# subshell package
+# This package is a copy without comments from the original.  The original
+# with comments and its test file can be found in the Bazaar repository at,
+#   lib/bash/subshell.sh
+#   t/lib/bash/subshell.sh
+# See https://launchpad.net/percona-toolkit for more information.
+# ###########################################################################
+
+
+set -u
+
+wait_for_subshells() {
+   local max_wait=$1
+   if [ "$(jobs)" ]; then
+      log "Waiting up to $max_wait seconds for subprocesses to finish..."
+      local slept=0 pid=""
+      while [ -n "$(jobs)" ]; do
+         local subprocess_still_running=""
+         for pid in $(jobs -p); do
+            if kill -0 "$pid" >/dev/null 2>&1; then
+               subprocess_still_running=1
+            fi
+         done
+         if [ "$subprocess_still_running" ]; then
+            sleep 1
+            slept=$((slept + 1))
+            [ "$slept" -ge "$max_wait" ] && break
+         else
+            break
+         fi
+      done
+   fi
+}
+
+kill_all_subshells() {
+   local pid=""
+   if [ "$(jobs)" ]; then
+      for pid in $(jobs -p); do
+         kill -0 "$pid" >/dev/null 2>&1 || continue
+         log "Killing subprocess $pid"
+         kill "$pid" >/dev/null 2>&1
+      done
+   else
+      log "All subprocesses have finished"
+   fi
+}
+
+# ###########################################################################
+# End subshell package
+# ###########################################################################
+
 # ###########################################################################
 # parse_options package
 # This package is a copy without comments from the original.  The original
@@ -871,16 +923,8 @@ collect() {

   hostname > "$d/$p-hostname"

-   local slept=0
-   while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do
-      sleep 1
-      slept=$((slept + 1))
-   done
-
-   for pid in $(jobs -p); do
-      kill $pid >/dev/null 2>&1
-   done
-
+   wait_for_subshells $OPT_RUN_TIME
+   kill_all_subshells
   for file in "$d/$p-"*; do
      if [ -z "$(grep -v '^TS ' --max-count 1 "$file")" ]; then
         log "Removing empty file $file";
@@ -1079,7 +1123,7 @@ sleep_ok() {
   local seconds="$1"
   local msg="${2:-""}"
   if oktorun; then
-      [ "$msg" ] && info "$msg"
+      [ "$msg" ] && log "$msg"
      sleep $seconds
   fi
 }
@@ -1244,27 +1288,8 @@ stalk() {
   # we may get in sync with the collector and kill it a microsecond
   # before it kills itself, thus 3 * run-time.
   # https://bugs.launchpad.net/percona-toolkit/+bug/1070434
-   if [ "$(jobs)" ]; then
-      local sleep_time=$((OPT_RUN_TIME * 3))
-      log "Waiting up to $sleep_time seconds for collectors to finish..."
-      local slept=0
-      while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do
-         sleep 1
-         slept=$((slept + 1))
-      done
-
-      if [ "$(jobs)" ]; then
-         for pid in $(jobs -p); do
-            # This isn't an warning (we don't want exit status 1) because
-            # the system may be running slowly so it's just "natural" that
-            # a collector may get stuck or run really slowly.
-            log "Killing collector $pid"
-            kill $pid >/dev/null 2>&1
-         done
-      else
-         log "All collectors have finished"
-      fi
-   fi
+   wait_for_subshells $((OPT_RUN_TIME * 3))
+   kill_all_subshells
 }

 # ###########################################################################
@@ -1333,10 +1358,8 @@ if    [ "${0##*/}" = "$TOOL" ] \

   if [ -z "$OPT_STALK" -a "$OPT_COLLECT" ]; then
      # Not stalking; do immediate collect once.
-      OPT_ITERATIONS=1
      OPT_CYCLES=0
-      OPT_SLEEP=0
-      OPT_INTERVAL=0
+      echo "[iter=$OPT_ITERATIONS] [cycle=$OPT_CYCLES] [sleep=$OPT_SLEEP] [interval=$OPT_INTERVAL]"
   fi

   usage_or_errors "$0"
@@ -1412,17 +1435,17 @@ fi

 =head1 NAME

-pt-stalk - Gather forensic data about MySQL when a problem occurs.
+pt-stalk - Collect forensic data about MySQL when problems occur.

 =head1 SYNOPSIS

 Usage: pt-stalk [OPTIONS] [-- MYSQL OPTIONS]

-pt-stalk watches for a trigger condition to become true, and then collects data
-to help in diagnosing problems. It is designed to run as a daemon with root
+pt-stalk waits for a trigger condition to occur, then collects data
+to help diagnose problems.  The tool is designed to run as a daemon with root
 privileges, so that you can diagnose intermittent problems that you cannot
-observe directly. You can also use it to execute a custom command, or to gather
-the data on demand without waiting for the trigger to happen.
+observe directly.  You can also use it to execute a custom command, or to
+collect data on demand without waiting for the trigger to occur.

 =head1 RISKS

@@ -1453,16 +1476,20 @@ chance to see the system when it happens. How do you solve intermittent MySQL
 problems when you can't observe them? That's why pt-stalk exists. In addition to
 using it when there's a known problem on your servers, it is a good idea to run
 pt-stalk all the time, even when you think nothing is wrong.  You will
-appreciate the data it gathers when a problem occurs, because problems such as
-MySQL lockups or spikes of activity typically leave no evidence to use in root
+appreciate the data it collects when a problem occurs, because problems such as
+MySQL lockups or spikes in activity typically leave no evidence to use in root
 cause analysis.

-This tool does two things: it watches a server (typically MySQL) for a trigger
-to occur, and it gathers diagnostic data.  To use it effectively, you need to
-define a good trigger condition. A good trigger is sensitive enough to fire
-reliably when a problem occurs, so that you don't miss a chance to solve
-problems. On the other hand, a good trigger isn't prone to false positives, so
-you don't gather information when the server is functioning normally.
+pt-stalk does two things: it watches a MySQL server and waits for a trigger
+condition to occur, and it collects diagnostic data when that trigger occurs.
+To avoid false-positives caused by short-lived problems, the trigger condition
+must be true at least L<"--cycles"> times before a L<"--collect"> is triggered.
+
+To use pt-stalk effectively, you need to define a good trigger.  A good trigger
+is sensitive enough to fire reliably when a problem occurs, so that you don't
+miss a chance to solve problems.  On the other hand, a good trigger isn't
+prone to false positives, so you don't gather information when the server
+is functioning normally.

 The most reliable triggers for MySQL tend to be the number of connections to the
 server, and the number of queries running concurrently. These are available in
@@ -1472,55 +1499,76 @@ Threads_running usually is.  Your job, as the tool's user, is to define an
 appropriate trigger condition for the tool.  Choose carefully, because the
 quality of your results will depend on the trigger you choose.

-You can define the trigger with the L<"--function">, L<"--variable">, and
-L<"--threshold"> options, among others.  Please read the documentation for
--function to learn how to do this.
+You define the trigger with the L<"--function">, L<"--variable">, 
+L<"--threshold">, and L<"--cycles"> options.  The default values
+for these options define a reasonable trigger, but you should adjust
+or change them to suit your particular system and needs.

-The pt-stalk tool, by default, simply watches MySQL repeatedly until the trigger
-becomes true. It then gathers diagnostics for a while, and sleeps afterwards for
-some time to prevent repeatedly gathering data if the condition remains true.
-In crude pseudocode, omitting some subtleties,
+By default, pt-stalk watches MySQL forever until the trigger occurs,
+then it collects diagnostic data for a while, and sleeps afterwards to avoid
+repeatedly collecting data if the trigger remains true.  The general order of
+operations is:

-  while true; do
-    if --variable from --function is greater than --threshold; then
-      observations++
-      if observations is greater than --cycles; then
-        capture diagnostics for --run-time seconds
-        exit if --iterations is exceeded
-        sleep for --sleep seconds
-      done
-    done
-    clean up data that's older than --retention-time
-    sleep for --interval seconds
-  done
+   while true; do
+      if --variable from --function > --threshold; then
+         cycles_true++
+         if cycles_true >= --cycles; then
+            --notify-by-email
+            if --collect; then
+               if --disk-bytes-free and --disk-pct-free ok; then
+                  (--collect for --run-time seconds) &
+               fi
+               rm files in --dest older than --retention-time
+            fi
+            iter++
+            cycles_true=0
+         fi
+         if iter < --iterations; then
+            sleep --sleep seconds
+         else
+            break
+         fi
+      else
+         if iter < --iterations; then
+            sleep --interval seconds
+         else
+            break
+         fi
+      fi
+   done
+   rm old --dest files older than --retention-time
+   if --collect processes are still running; then
+      wait up to --run-time * 3 seconds
+      kill any remaining --collect processes 
+   fi

 The diagnostic data is written to files whose names begin with a timestamp, so
 you can distinguish samples from each other in case the tool collects data
-multiple times.  The pt-sift tool is designed to help you browse and analyze the
-resulting samples of data.
+multiple times.  The pt-sift tool is designed to help you browse and analyze
+the resulting data samples.

 Although this sounds simple enough, in practice there are a number of
 subtleties, such as detecting when the disk is beginning to fill up so that the
 tool doesn't cause the server to run out of disk space.  This tool handles these
 types of potential problems, so it's a good idea to use this tool instead of
 writing something from scratch and possibly experiencing some of the hazards
-this tool is designed to prevent.
+this tool is designed to avoid.

 =head1 CONFIGURING

-You can use standard Percona Toolkit configuration files to set commandline
+You can use standard Percona Toolkit configuration files to set command line
 options.

 You will probably want to run the tool as a daemon and customize at least the
-diagnostic threshold.  Here's a sample configuration file for triggering when
+L<"--threshold">.  Here's a sample configuration file for triggering when
 there are more than 20 queries running at once:

  daemonize
  threshold=20

-If you're not running the tool as it's designed (as a root user, daemonized)
-then you'll need to set several options, such as L<"--dest">, to locations that
-are writable by non-root users.
+If you don't run the tool as root, then you will need to specify several options,
+such as L<"--pid">, L<"--log">, and L<"--dest">, else the tool will probably
+fail to start.

 =head1 OPTIONS

@@ -1530,8 +1578,8 @@ are writable by non-root users.

 default: yes; negatable: yes

-Collect system information.  You can negate this option to make the tool watch
-the system but not actually gather any diagnostic data.
+Collect diagnostic data when the trigger occurs.  Specify C<--no-collect>
+to make the tool watch the system but not collect data.

 See also L<"--stalk">.

@@ -1581,9 +1629,8 @@ first option on the command line.

 type: int; default: 5

-The number of times the trigger condition must be true before collecting data.
-This helps prevent false positives, and makes the trigger condition less likely
-to fire when the problem recovers quickly.
+How many times L<"--variable"> must be greater than L<"--threshold"> before triggering L<"--collect">.  This helps prevent false positives, and makes
+the trigger condition less likely to fire when the problem recovers quickly.

 =item --daemonize

@@ -1594,14 +1641,15 @@ its output as specified in --log.

 type: string; default: /var/lib/pt-stalk

-Where to store the diagnostic data.  Each time the tool collects data, it writes
-to a new set of files, which are named with the current system timestamp.
+Where to save diagnostic data from L<"--collect">.  Each time the tool
+collects data, it writes to a new set of files, which are named with the
+current system timestamp.

 =item --disk-bytes-free

 type: size; default: 100M

-Don't collect data if the disk has less than this much free space.
+Do not L<"--collect"> if the disk has less than this much free space.
 This prevents the tool from filling up the disk with diagnostic data.

 If the L<"--dest"> directory contains a previously captured sample of data,
@@ -1618,7 +1666,7 @@ Valid size value suffixes are k, M, G, and T.

 type: int; default: 5

-Don't collect data if the disk has less than this percent free space.
+Do not L<"--collect"> if the disk has less than this percent free space.
 This prevents the tool from filling up the disk with diagnostic data.

 This option works similarly to L<"--disk-bytes-free"> but specifies a
@@ -1630,57 +1678,57 @@ margins are satisfied.

 type: string; default: status

-Specifies what to watch for a diagnostic trigger.  The default value watches
-SHOW GLOBAL STATUS, but you can also watch SHOW PROCESSLIST or supply a plugin
-file with your own custom code.  This function supplies the value of
+What to watch for the trigger.  The default value watches
+C<SHOW GLOBAL STATUS>, but you can also watch C<SHOW PROCESSLIST> or specify
+a file with your own custom code.  This function supplies the value of
 L<"--variable">, which is then compared against L<"--threshold"> to see if the
-trigger condition is met.  Additional options may be required as well; see
-below. Possible values:
+trigger condition is met.  Additional options may be required as
+well; see below. Possible values are:

 =over

 =item * status

-This value specifies that the source of data for the diagnostic trigger is SHOW
-GLOBAL STATUS.  The value of L<"--variable"> then defines which status counter
-is the trigger.
+Watch C<SHOW GLOBAL STATUS> for the trigger.  The value of
+L<"--variable"> then defines which status counter is the trigger.

 =item * processlist

-This value specifies that the data for the diagnostic trigger comes from SHOW
-FULL PROCESSLIST.  The trigger value is the count of processes whose
-L<"--variable"> column matches the L<"--match"> option.  For example, to trigger
-when more than 10 processes are in the "statistics" state, use the following
-options:
+Watch C<SHOW FULL PROCESSLIST> for the trigger.  The trigger
+value is the count of processes whose L<"--variable"> column matches the
+L<"--match"> option.  For example, to trigger L<"--collect"> when more than
+10 processes are in the "statistics" state, specify:

-  --function processlist --variable State \
-    --match statistics --threshold 10
+   --function processlist \
+   --variable State       \
+   --match statistics     \
+   --threshold 10

 =back

-In addition, you can specify a file that contains your custom trigger function,
-written in Unix shell script.  This can be a wrapper that executes anything you
-wish.  If the argument to --function is a file, then it takes precedence over
-builtin functions, so if there is a file in the working directory named "status"
-or "processlist" then the tool will use that file as a plugin, even though those
-are otherwise recognized as reserved words for this option.
+In addition, you can specify a file that contains your custom trigger
+function, written in Unix shell script.  This can be a wrapper that executes
+anything you wish.  If the argument to L<"--function"> is a file, then it
+takes precedence over built-in functions, so if there is a file in the working
+directory named "status" or "processlist" then the tool will use that file
+even though those are valid built-in values.

-The plugin file works by providing a function called C<trg_plugin>, and the tool
-simply sources the file and executes the function.  For example, the function
-might look like the following:
+The file works by providing a function called C<trg_plugin>, and the tool
+simply sources the file and executes the function.  For example, the file
+might contain:

   trg_plugin() {
      mysql $EXT_ARGV -e "SHOW ENGINE INNODB STATUS" \
        | grep -c "has waited at"
   }

-This snippet will count the number of mutex waits inside of InnoDB. It
+This snippet will count the number of mutex waits inside InnoDB.  It
 illustrates the general principle: the function must output a number, which is
-then compared to the threshold as usual.  The $EXT_ARGV variable contains the
-MySQL options mentioned in the L<"SYNOPSIS"> above.
+then compared to L<"--threshold"> as usual.  The C<$EXT_ARGV> variable
+contains the MySQL options mentioned in the L<"SYNOPSIS"> above.

-The plugin should not alter the tool's existing global variables.  Prefix any
-plugin-specific global variables with "PLUGIN_" or make them local.
+The file should not alter the tool's existing global variables.  Prefix any
+file-specific global variables with "PLUGIN_" or make them local.

 =item --help

@@ -1690,15 +1738,17 @@ Print help and exit.

 type: int; default: 1

-Interval between checks for the diagnostic trigger.
+How often to check if the trigger is true, in seconds.

 =item --iterations

 type: int

-Exit after collecting diagnostics this many times.  By default, the tool
-will continue to watch the server forever, but this is useful for scenarios
-where you want to capture once and then exit, for example.
+How many times to L<"--collect"> diagnostic data.  By default, the tool
+runs forever and collects data every time the trigger occurs.
+Specify L<"--iterations"> to collect data a limited number of times.
+This option is also useful with C<--no-stalk> to collect data once and
+exit, for example.

 =item --log

@@ -1710,14 +1760,14 @@ Print all output to this file when daemonized.

 type: string

-The pattern to use when watching SHOW PROCESSLIST. See the documentation for
-L<"--function"> for details.
+The pattern to use when watching SHOW PROCESSLIST.  See L<"--function">
+for details.

 =item --notify-by-email

 type: string

-Send mail to this list of addresses when data is collected.
+Send an email to these addresses for every L<"--collect">.

 =item --pid

@@ -1746,8 +1796,8 @@ Called before stalking.

 =item before_collect

-Called when the stalk condition is triggered, before running a collector
-process as a backgrounded subshell.
+Called when the trigger occurs, before running a L<"--collect">
+subprocess in the background.

 =item after_collect

@@ -1771,10 +1821,10 @@ this hook is only called if L<"--iterations"> is specified.

 =back

-For example, a very simple plugin that touches a file when a collector
-process is triggered:
+For example, a very simple plugin that touches a file when L<"--collect">
+is triggered:

-   before_colllect() {
+   before_collect() {
      touch /tmp/foo
   }

@@ -1797,9 +1847,9 @@ be set to indicate why the tool was stopped.

 type: string

-The filename prefix for diagnostic samples. By default, samples have a timestamp
-prefix based on the current local time, such as 2011_12_06_14_02_02, which is
-December 6, 2011 at 14:02:02.
+The filename prefix for diagnostic samples.  By default, all files created
+by the same L<"--collect"> instance have a timestamp prefix based on the current
+local time, like C<2011_12_06_14_02_02>, which is December 6, 2011 at 14:02:02.

 =item --retention-time

@@ -1812,10 +1862,12 @@ purged.

 type: int; default: 30

-How long the tool will collect data when it triggers.  This should not be longer
-than L<"--sleep">. It is usually not necessary to change this; if the default 30
-seconds hasn't gathered enough diagnostic data, running longer is not likely to
-do so. In fact, in many cases a shorter collection period is appropriate.
+How long to L<"--collect"> diagnostic data when the trigger occurs.
+The value is in seconds and should not be longer than L<"--sleep">.  It is
+usually not necessary to change this; if the default 30 seconds doesn't
+collect enough data, running longer is not likely to help because the system
+or MySQL server is probably too busy to respond.  In fact, in many cases a
+shorter collection period is appropriate.

 This value is used two other times.  After collecting, the collect subprocess
 will wait another L<"--run-time"> seconds for its commands to finish.  Some
@@ -1825,7 +1877,7 @@ are deleted, the extra wait gives commands time to finish and write their
 data.  The value is potentially used again just before the tool exits to wait
 again for any collect subprocesses to finish.  In most cases this won't
 happen because of the aforementioned extra wait.  If it happens, the tool
-will log "Waiting up to N seconds for collectors to finish..." where N is
+will log "Waiting up to N seconds for subprocesses to finish..." where N is
 three times L<"--run-time">.  In both cases, after waiting, the tool kills
 all of its subprocesses.

@@ -1833,8 +1885,8 @@ all of its subprocesses.

 type: int; default: 300

-How long to sleep after collecting data.  This prevents the tool from triggering
-continuously, which might be a problem if the collection process is intrusive.
+How long to sleep after L<"--collect">.  This prevents the tool
+from triggering continuously, which might be a problem if the collection process is intrusive.
 It also prevents filling up the disk or gathering too much data to analyze
 reasonably.

@@ -1842,14 +1894,16 @@ reasonably.

 default: yes; negatable: yes

-Watch the server and wait for the trigger to occur.  You can negate this option
-to make the tool immediately gather any diagnostic data once and exit.  This is
-useful if a problem is already happening, but pt-stalk is not running, so
-you only want to collect diagnostic data.
+Watch the server and wait for the trigger to occur.  Specify C<--no-stalk>
+to collect diagnostic data immediately, that is, without waiting for the
+trigger to occur.  You probably also want to specify values for
+L<"--interval">, L<"--iterations">, and L<"--sleep">.  For example, to
+immediately collect data for 1 minute then exit, specify:

-If this option is negate, L<"--daemonize">, L<"--log">, L<"--pid">, and other
-stalking-related options have no effect; the tool simply collects diagnostic
-data and exits.  Safeguard options, like L<"--disk-bytes-free"> and
+   --no-stalk --run-time 60 --iterations 1
+
+L<"--cycles">, L<"--daemonize">, L<"--log"> and L<"--pid"> have no effect
+with C<--no-stalk>.  Safeguard options, like L<"--disk-bytes-free"> and
 L<"--disk-pct-free">, are still respected.

 See also L<"--collect">.
@@ -1858,14 +1912,18 @@ See also L<"--collect">.

 type: int; default: 25

-The threshold at which the diagnostic trigger should fire.  See L<"--function">
-for details.
+The maximum acceptable value for L<"--variable">.  L<"--collect"> is
+triggered when the value of L<"--variable"> is greater than L<"--threshold">
+for L<"--cycles"> many times.  Currently, there is no way to define a lower
+threshold to check for a L<"--variable"> value that is too low.
+
+See also L<"--function">.

 =item --variable

 type: string; default: Threads_running

-The variable to compare against the threshold. See L<"--function"> for details.
+The variable to compare against L<"--threshold">.  See also L<"--function">.

 =item --verbose

@@ -1995,7 +2053,8 @@ Replace C<TOOL> with the name of any tool.

 =head1 AUTHORS

-Baron Schwartz, Justin Swanhart, Fernando Ipar, and Daniel Nichter
+Baron Schwartz, Justin Swanhart, Fernando Ipar, Daniel Nichter,
+and Brian Fraser.

 =head1 ABOUT PERCONA TOOLKIT

--- a/lib/bash/collect.sh
+++ b/lib/bash/collect.sh
@@ -22,7 +22,7 @@
 # collect collects system information.

 # XXX
-# THIS LIB REQUIRES log_warn_die.sh, safeguards.sh, and alt_cmds.sh!
+# THIS LIB REQUIRES log_warn_die, safeguards, alt_cmds, and subshell!
 # XXX

 set -u
@@ -289,16 +289,8 @@ collect() {
   # it may leave an empty file.  But first wait another --run-time
   # seconds for any slow process to finish:
   # https://bugs.launchpad.net/percona-toolkit/+bug/1047701
-   local slept=0
-   while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do
-      sleep 1
-      slept=$((slept + 1))
-   done
-
-   for pid in $(jobs -p); do
-      kill $pid >/dev/null 2>&1
-   done
-
+   wait_for_subshells $OPT_RUN_TIME
+   kill_all_subshells
   for file in "$d/$p-"*; do
      # If there's not at least 1 line that's not a TS,
      # then the file is empty.
--- a/lib/bash/subshell.sh
+++ b/lib/bash/subshell.sh
@@ -0,0 +1,66 @@
+# This program is copyright 2013 Percona Ireland Ltd.
+# Feedback and improvements are welcome.
+#
+# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
+# systems, you can issue `man perlgpl' or `man perlartistic' to read these
+# licenses.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA  02111-1307  USA.
+# ###########################################################################
+# subshell package
+# ###########################################################################
+
+# Package: subshell
+
+set -u
+
+wait_for_subshells() {
+   local max_wait=$1  # $1: max seconds to wait for background jobs to exit
+   if [ "$(jobs)" ]; then
+      log "Waiting up to $max_wait seconds for subprocesses to finish..."
+      local slept=0 pid=""  # pid is local so the caller's pid is not clobbered
+      while [ -n "$(jobs)" ]; do
+         local subprocess_still_running=""
+         for pid in $(jobs -p); do
+            if kill -0 "$pid" >/dev/null 2>&1; then  # signal 0: liveness probe
+               subprocess_still_running=1
+            fi
+         done
+         if [ "$subprocess_still_running" ]; then
+            sleep 1  # poll once per second
+            slept=$((slept + 1))
+            [ "$slept" -ge "$max_wait" ] && break  # waited max_wait seconds
+         else
+            break  # no listed job PID is alive any more
+         fi
+      done
+   fi
+}
+
+kill_all_subshells() {
+   # Kill every background job that is still alive; log if there are no jobs.
+   local pid=""
+   if [ "$(jobs)" ]; then
+      for pid in $(jobs -p); do
+         kill -0 "$pid" >/dev/null 2>&1 || continue  # skip PIDs already gone
+         # This isn't a warning (we don't want exit status 1): on a slow
+         # system it's "natural" for a collector to get stuck or run slowly.
+         log "Killing subprocess $pid"
+         kill "$pid" >/dev/null 2>&1
+      done
+   else
+      log "All subprocesses have finished"
+   fi
+}
+
+# ###########################################################################
+# End subshell package
+# ###########################################################################
--- a/t/lib/bash/collect.sh
+++ b/t/lib/bash/collect.sh
@@ -10,6 +10,7 @@ TOOL="pt-stalk"
 mkdir "$PT_TMPDIR/collect" 2>/dev/null

 source "$LIB_DIR/log_warn_die.sh"
+source "$LIB_DIR/subshell.sh"
 source "$LIB_DIR/parse_options.sh"
 source "$LIB_DIR/safeguards.sh"
 source "$LIB_DIR/alt_cmds.sh"
--- a/t/pt-stalk/pt-stalk.t
+++ b/t/pt-stalk/pt-stalk.t
@@ -317,7 +317,11 @@ diag(`cp $ENV{HOME}/.pt-stalk.conf.original $ENV{HOME}/.pt-stalk.conf 2>/dev/nul

 cleanup();

-$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
+# As of 2.2, --no-stalk means just that: don't stalk, just collect, so
+# we have to specify --iterations=1 else the tool will continue to run,
+# whereas in 2.1 --no-stalk implied/forced "collect once and exit".
+
+$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file --iterations 1 -- --defaults-file=$cnf >$log_file 2>&1");

 PerconaTest::wait_until(sub { !-f $pid_file });