PT-2037 - option --system-only for pt-stalk

This commit:
1. Moves all changes to the proper place: lib/bash/collect.sh
2. Refactors pt-stalk so it is more modular
3. Places the fix for PT-1734 into the right place: lib/bash/collect.sh
4. Fixes tests for PT-1336
2026-03-07 02:00:50 +08:00 · 2022-01-22 21:21:05 +03:00
parent 1f6c95f524
commit 623fdaec91
4 changed files with 460 additions and 409 deletions
--- a/lib/bash/collect.sh
+++ b/lib/bash/collect.sh
@@ -49,10 +49,77 @@ collect() {
   local d="$1"  # directory to save results in
   local p="$2"  # prefix for each result file

-   local mysqld_pid=""
+   local cnt=$(($OPT_RUN_TIME / $OPT_SLEEP_COLLECT))
+   
+   if [ ! "$OPT_SYSTEM_ONLY" ]; then
+      local mysqld_pid=""
+      local mysql_version=""
+      local mysql_error_log=""
+      local tail_error_log_pid=""
+      local have_lock_waits_table=""
+      local have_oprofile=""
+      local mysqladmin_pid=""
+      local mutex=""
+      local tcpdump_pid=""
+      local ps_instrumentation_enabled=""
+
+      collect_mysql_data_one
+   fi
+
+   # Grab a few general things first.  Background all of these so we can start
+   # them all up as quickly as possible.  
+   if [ ! "$OPT_MYSQL_ONLY" ]; then 
+      collect_system_data
+   fi 
+
+   # This loop gathers data for the rest of the duration, and defines the time
+   # of the whole job.
+   log "Loop start: $(date +'TS %s.%N %F %T')"
+   local start_time=$(date +'%s')
+   local curr_time=$start_time
+   local ts="$(date +"TS %s.%N %F %T")"
+
+   while [ $((curr_time - start_time)) -lt $OPT_RUN_TIME ]; do
+      if [ ! "$OPT_MYSQL_ONLY" ]; then
+         collect_system_data_loop
+      fi
+
+      if [ ! "$OPT_SYSTEM_ONLY" ]; then
+         collect_mysql_data_loop
+      fi
+
+      curr_time=$(date +'%s')
+   done
+   log "Loop end: $(date +'TS %s.%N %F %T')"
+
+   if [ ! "$OPT_SYSTEM_ONLY" ]; then
+      collect_mysql_data_two
+   fi
+
+   # Finally, record what system we collected this data from.
+   hostname > "$d/$p-hostname"
+
+   # Remove "empty" files, i.e. ones that are truly empty or
+   # just contain timestamp lines.  When a command above fails,
+   # it may leave an empty file.  But first wait another --run-time
+   # seconds for any slow process to finish:
+   # https://bugs.launchpad.net/percona-toolkit/+bug/1047701
+   wait_for_subshells $OPT_RUN_TIME
+   kill_all_subshells
+   for file in "$d/$p-"*; do
+      # If there's not at least 1 line that's not a TS,
+      # then the file is empty.
+      if [ -z "$(grep -v '^TS ' --max-count 10 "$file")" ]; then
+         log "Removing empty file $file";
+         rm "$file"
+      fi
+   done
+}
+
+collect_mysql_data_one() {
   # Get pidof mysqld.
   if [ ! "$OPT_MYSQL_ONLY" ]; then
-      port=$(mysql -ss -e 'SELECT @@port')
+      port=$($CMD_MYSQL $EXT_ARGV -ss -e 'SELECT @@port')
      mysqld_pid=$(lsof -i ":${port}" | grep -i listen | cut -f 3 -d" ")
   fi

@@ -84,16 +151,16 @@ collect() {

   # Get the major.minor version number.  Version 3.23 doesn't matter for our
   # purposes, and other releases have x.x.x* version conventions so far.
-   local mysql_version="$(awk '/^version[^_]/{print substr($2,1,3)}' "$d/$p-variables")"
+   mysql_version="$(awk '/^version[^_]/{print substr($2,1,3)}' "$d/$p-variables")"

   # Is MySQL logging its errors to a file?  If so, tail that file.
-   local mysql_error_log="$(awk '/^log_error/{print $2}' "$d/$p-variables")"
+   mysql_error_log="$(awk '/^log_error\s/{print $2}' "$d/$p-variables")"
   if [ -z "$mysql_error_log" -a "$mysqld_pid" ]; then
+      log $mysqld_pid
      # Try getting it from the open filehandle...
      mysql_error_log="$(ls -l /proc/$mysqld_pid/fd | awk '/ 2 ->/{print $NF}')"
   fi

-   local tail_error_log_pid=""
   if [ "$mysql_error_log" -a ! "$OPT_MYSQL_ONLY" ]; then
      log "The MySQL error log seems to be $mysql_error_log"
      tail -f "$mysql_error_log" >"$d/$p-log_error" &
@@ -108,9 +175,9 @@ collect() {
   # Get a sample of these right away, so we can get these without interaction
   # with the other commands we're about to run.
   if [ "${mysql_version}" '>' "5.1" ]; then
-      local mutex="SHOW ENGINE INNODB MUTEX"
+      mutex="SHOW ENGINE INNODB MUTEX"
   else
-      local mutex="SHOW MUTEX STATUS"
+      mutex="SHOW MUTEX STATUS"
   fi
   innodb_status 1
   tokudb_status 1
@@ -120,7 +187,6 @@ collect() {
   open_tables                      >> "$d/$p-opentables1"   &

   # If TCP dumping is specified, start that on the server's port.
-   local tcpdump_pid=""
   if [ "$CMD_TCPDUMP" -a  "$OPT_COLLECT_TCPDUMP" ]; then
      local port=$(awk '/^port/{print $2}' "$d/$p-variables")
      if [ "$port" ]; then
@@ -131,7 +197,6 @@ collect() {

   # Next, start oprofile gathering data during the whole rest of this process.
   # The --init should be a no-op if it has already been init-ed.
-   local have_oprofile=""
   if [ "$CMD_OPCONTROL" -a "$OPT_COLLECT_OPROFILE" ]; then
      if $CMD_OPCONTROL --init; then
         $CMD_OPCONTROL --start --no-vmlinux
@@ -143,38 +208,11 @@ collect() {
      local strace_pid=$!
   fi

-   # Grab a few general things first.  Background all of these so we can start
-   # them all up as quickly as possible.  
-   if [ ! "$OPT_MYSQL_ONLY" ]; then 
-      ps -eaF  >> "$d/$p-ps"  &
-      top -bn${OPT_RUN_TIME} >> "$d/$p-top" &
-
-      [ "$mysqld_pid" ] && _lsof $mysqld_pid >> "$d/$p-lsof" &
-
-      if [ "$CMD_SYSCTL" ]; then
-         $CMD_SYSCTL -a >> "$d/$p-sysctl" &
-      fi
-
-      # collect dmesg events from 60 seconds ago until present
-      if [ "$CMD_DMESG" ]; then
-         local UPTIME=`cat /proc/uptime | awk '{ print $1 }'`
-         local START_TIME=$(echo "$UPTIME 60" | awk '{print ($1 - $2)}')
-         $CMD_DMESG  | perl -ne 'm/\[\s*(\d+)\./; if ($1 > '${START_TIME}') { print }' >> "$d/$p-dmesg" & 
-      fi
-
-      local cnt=$(($OPT_RUN_TIME / $OPT_SLEEP_COLLECT))
-      if [ "$CMD_VMSTAT" ]; then
-         $CMD_VMSTAT $OPT_SLEEP_COLLECT $cnt >> "$d/$p-vmstat" &
-         $CMD_VMSTAT $OPT_RUN_TIME 2 >> "$d/$p-vmstat-overall" &
-      fi
-      if [ "$CMD_IOSTAT" ]; then
-         $CMD_IOSTAT -dx $OPT_SLEEP_COLLECT $cnt >> "$d/$p-iostat" &
-         $CMD_IOSTAT -dx $OPT_RUN_TIME 2 >> "$d/$p-iostat-overall" &
-      fi
-      if [ "$CMD_MPSTAT" ]; then
-         $CMD_MPSTAT -P ALL $OPT_SLEEP_COLLECT $cnt >> "$d/$p-mpstat" &
-         $CMD_MPSTAT -P ALL $OPT_RUN_TIME 1 >> "$d/$p-mpstat-overall" &
-      fi
+   $CMD_MYSQL $EXT_ARGV -e "SHOW TABLES FROM INFORMATION_SCHEMA" \
+      | grep -i "INNODB_LOCK_WAITS" >/dev/null 2>&1
+   if [ $? -eq 0 ]; then
+      have_lock_waits_table="yes"
+   fi

      # Collect multiple snapshots of the status variables.  We use
      # mysqladmin -c even though it is buggy and won't stop on its
@@ -183,92 +221,110 @@ collect() {
      # the database tends to exceed max_connections, so reconnecting
      # in the loop tends not to work very well.
      $CMD_MYSQLADMIN $EXT_ARGV ext -i$OPT_SLEEP_COLLECT -c$cnt >>"$d/$p-mysqladmin" &
-      local mysqladmin_pid=$!
-   fi 
+      mysqladmin_pid=$!

-   local have_lock_waits_table=""
-   $CMD_MYSQL $EXT_ARGV -e "SHOW TABLES FROM INFORMATION_SCHEMA" \
-      | grep -i "INNODB_LOCK_WAITS" >/dev/null 2>&1
-   if [ $? -eq 0 ]; then
-      have_lock_waits_table="yes"
-   fi
-
-   # This loop gathers data for the rest of the duration, and defines the time
-   # of the whole job.
-   log "Loop start: $(date +'TS %s.%N %F %T')"
-   local start_time=$(date +'%s')
-   local curr_time=$start_time
-   local ps_instrumentation_enabled=$($CMD_MYSQL $EXT_ARGV -e 'SELECT ENABLED FROM performance_schema.setup_instruments WHERE NAME = "transaction";' \
+   ps_instrumentation_enabled=$($CMD_MYSQL $EXT_ARGV -e 'SELECT ENABLED FROM performance_schema.setup_instruments WHERE NAME = "transaction";' \
                                      | sed "2q;d" | sed 'y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/')

   if [ $ps_instrumentation_enabled != "yes" ]; then
      log "Performance Schema instrumentation is disabled"
   fi
+}

-   while [ $((curr_time - start_time)) -lt $OPT_RUN_TIME ]; do
-      if [ ! "$OPT_MYSQL_ONLY" ]; then
-         # We check the disk, but don't exit, because we need to stop jobs if we
-         # need to exit.
-         disk_space $d > $d/$p-disk-space
-         check_disk_space          \
-            $d/$p-disk-space       \
-            "$OPT_DISK_BYTES_FREE" \
-            "$OPT_DISK_PCT_FREE"   \
-            || break
+collect_system_data() {
+   ps -eaF  >> "$d/$p-ps"  &
+   top -bn${OPT_RUN_TIME} >> "$d/$p-top" &

-         # Sleep between collect cycles.
-         # Synchronize ourselves onto the clock tick, so the sleeps are 1-second
-         sleep $(date +'%s.%N' | awk "{print $OPT_SLEEP_COLLECT - (\$1 % $OPT_SLEEP_COLLECT)}")
-         local ts="$(date +"TS %s.%N %F %T")"
+   [ "$mysqld_pid" ] && _lsof $mysqld_pid >> "$d/$p-lsof" &

-         # #####################################################################
-         # Collect data for this cycle.
-         # #####################################################################
-         if [ -d "/proc" ]; then
-            if [ -f "/proc/diskstats" ]; then
-               (echo $ts; cat /proc/diskstats) >> "$d/$p-diskstats" &
-            fi
-            if [ -f "/proc/stat" ]; then
-               (echo $ts; cat /proc/stat) >> "$d/$p-procstat" &
-            fi
-            if [ -f "/proc/vmstat" ]; then
-               (echo $ts; cat /proc/vmstat) >> "$d/$p-procvmstat" &
-            fi
-            if [ -f "/proc/meminfo" ]; then
-               (echo $ts; cat /proc/meminfo) >> "$d/$p-meminfo" &
-            fi
-            if [ -f "/proc/slabinfo" ]; then
-               (echo $ts; cat /proc/slabinfo) >> "$d/$p-slabinfo" &
-            fi
-            if [ -f "/proc/interrupts" ]; then
-               (echo $ts; cat /proc/interrupts) >> "$d/$p-interrupts" &
-            fi
-         fi
-         (echo $ts; df -k) >> "$d/$p-df" &
-         (echo $ts; netstat -antp) >> "$d/$p-netstat"   &
-         (echo $ts; netstat -s)    >> "$d/$p-netstat_s" &
+   if [ "$CMD_SYSCTL" ]; then
+      $CMD_SYSCTL -a >> "$d/$p-sysctl" &
+   fi
+
+   # collect dmesg events from 60 seconds ago until present
+   if [ "$CMD_DMESG" ]; then
+      local UPTIME=`cat /proc/uptime | awk '{ print $1 }'`
+      local START_TIME=$(echo "$UPTIME 60" | awk '{print ($1 - $2)}')
+      $CMD_DMESG  | perl -ne 'm/\[\s*(\d+)\./; if ($1 > '${START_TIME}') { print }' >> "$d/$p-dmesg" & 
+   fi
+
+   if [ "$CMD_VMSTAT" ]; then
+      $CMD_VMSTAT $OPT_SLEEP_COLLECT $cnt >> "$d/$p-vmstat" &
+      $CMD_VMSTAT $OPT_RUN_TIME 2 >> "$d/$p-vmstat-overall" &
+   fi
+   if [ "$CMD_IOSTAT" ]; then
+      $CMD_IOSTAT -dx $OPT_SLEEP_COLLECT $cnt >> "$d/$p-iostat" &
+      $CMD_IOSTAT -dx $OPT_RUN_TIME 2 >> "$d/$p-iostat-overall" &
+   fi
+   if [ "$CMD_MPSTAT" ]; then
+      $CMD_MPSTAT -P ALL $OPT_SLEEP_COLLECT $cnt >> "$d/$p-mpstat" &
+      $CMD_MPSTAT -P ALL $OPT_RUN_TIME 1 >> "$d/$p-mpstat-overall" &
+   fi
+}
+
+collect_mysql_data_loop() {
+   (echo $ts; $CMD_MYSQL $EXT_ARGV -e "SHOW FULL PROCESSLIST\G") \
+      >> "$d/$p-processlist" &
+   if [ "$have_lock_waits_table" ]; then
+      (echo $ts; lock_waits)   >>"$d/$p-lock-waits" &
+      (echo $ts; transactions) >>"$d/$p-transactions" &
+   fi
+
+   if [ "${mysql_version}" '>' "5.6" ] && [ $ps_instrumentation_enabled == "yes" ]; then
+      ps_locks_transactions "$d/$p-ps-locks-transactions"
+   fi
+
+   if [ "${mysql_version}" '>' "5.6" ]; then
+      (echo $ts; ps_prepared_statements) >> "$d/$p-prepared-statements" &
+   fi
+
+   slave_status "$d/$p-slave-status" "${mysql_version}" 
+}
+
+collect_system_data_loop() {
+   # We check the disk, but don't exit, because we need to stop jobs if we
+   # need to exit.
+   disk_space $d > $d/$p-disk-space
+   check_disk_space          \
+      $d/$p-disk-space       \
+      "$OPT_DISK_BYTES_FREE" \
+      "$OPT_DISK_PCT_FREE"   \
+      || break
+
+   # Sleep between collect cycles.
+   # Synchronize ourselves onto the clock tick, so the sleeps are 1-second
+   sleep $(date +'%s.%N' | awk "{print $OPT_SLEEP_COLLECT - (\$1 % $OPT_SLEEP_COLLECT)}")
+   ts="$(date +"TS %s.%N %F %T")"
+
+   # #####################################################################
+   # Collect data for this cycle.
+   # #####################################################################
+   if [ -d "/proc" ]; then
+      if [ -f "/proc/diskstats" ]; then
+         (echo $ts; cat /proc/diskstats) >> "$d/$p-diskstats" &
      fi
-      (echo $ts; $CMD_MYSQL $EXT_ARGV -e "SHOW FULL PROCESSLIST\G") \
-         >> "$d/$p-processlist" &
-      if [ "$have_lock_waits_table" ]; then
-         (echo $ts; lock_waits)   >>"$d/$p-lock-waits" &
-         (echo $ts; transactions) >>"$d/$p-transactions" &
+      if [ -f "/proc/stat" ]; then
+         (echo $ts; cat /proc/stat) >> "$d/$p-procstat" &
      fi
-
-      if [ "${mysql_version}" '>' "5.6" ] && [ $ps_instrumentation_enabled == "yes" ]; then
-         ps_locks_transactions "$d/$p-ps-locks-transactions"
+      if [ -f "/proc/vmstat" ]; then
+         (echo $ts; cat /proc/vmstat) >> "$d/$p-procvmstat" &
      fi
-
-      if [ "${mysql_version}" '>' "5.6" ]; then
-         (echo $ts; ps_prepared_statements) >> "$d/$p-prepared-statements" &
+      if [ -f "/proc/meminfo" ]; then
+         (echo $ts; cat /proc/meminfo) >> "$d/$p-meminfo" &
      fi
+      if [ -f "/proc/slabinfo" ]; then
+         (echo $ts; cat /proc/slabinfo) >> "$d/$p-slabinfo" &
+      fi
+      if [ -f "/proc/interrupts" ]; then
+         (echo $ts; cat /proc/interrupts) >> "$d/$p-interrupts" &
+      fi
+   fi
+   (echo $ts; df -k) >> "$d/$p-df" &
+   (echo $ts; netstat -antp) >> "$d/$p-netstat"   &
+   (echo $ts; netstat -s)    >> "$d/$p-netstat_s" &
+}

-      slave_status "$d/$p-slave-status" "${mysql_version}" 
-
-      curr_time=$(date +'%s')
-   done
-   log "Loop end: $(date +'TS %s.%N %F %T')"
-
+collect_mysql_data_two() {
   if [ "$have_oprofile" ]; then
      $CMD_OPCONTROL --stop
      $CMD_OPCONTROL --dump
@@ -316,28 +372,9 @@ collect() {
   open_tables                      >> "$d/$p-opentables2"   &

   # Kill backgrounded tasks.
-   kill $mysqladmin_pid
+   [ "$mysqladmin_pid" ] &&  kill $mysqladmin_pid
   [ "$tail_error_log_pid" ] && kill $tail_error_log_pid
   [ "$tcpdump_pid" ]        && kill $tcpdump_pid
-
-   # Finally, record what system we collected this data from.
-   hostname > "$d/$p-hostname"
-
-   # Remove "empty" files, i.e. ones that are truly empty or
-   # just contain timestamp lines.  When a command above fails,
-   # it may leave an empty file.  But first wait another --run-time
-   # seconds for any slow process to finish:
-   # https://bugs.launchpad.net/percona-toolkit/+bug/1047701
-   wait_for_subshells $OPT_RUN_TIME
-   kill_all_subshells
-   for file in "$d/$p-"*; do
-      # If there's not at least 1 line that's not a TS,
-      # then the file is empty.
-      if [ -z "$(grep -v '^TS ' --max-count 10 "$file")" ]; then
-         log "Removing empty file $file";
-         rm "$file"
-      fi
-   done
 }

 open_tables() {
@@ -391,8 +428,13 @@ transactions() {
 tokudb_status() {
    local n=$1

-    $CMD_MYSQL $EXT_ARGV -e "SHOW ENGINE TOKUDB STATUS\G" \
-      >> "$d/$p-tokudbstatus$n" || rm -f "$d/$p-tokudbstatus$n"
+    has_tokudb=`$CMD_MYSQL $EXT_ARGV -e "SHOW ENGINES" | grep -i 'tokudb'`
+    exit_code=$?
+
+    if [ $exit_code -eq 0 ]; then
+       $CMD_MYSQL $EXT_ARGV -e "SHOW ENGINE TOKUDB STATUS\G" \
+         >> "$d/$p-tokudbstatus$n" || rm -f "$d/$p-tokudbstatus$n"
+    fi
 }

 innodb_status() {
@@ -475,11 +517,11 @@ slave_status() {
      echo -e "\n$sql\n" >> $outfile
      $CMD_MYSQL $EXT_ARGV -e "$sql" >> $outfile

-      sql="SELECT * FROM replication_connection_status\G"
+      sql="SELECT * FROM performance_schema.replication_connection_status\G"
      echo -e "\n$sql\n" >> $outfile
      $CMD_MYSQL $EXT_ARGV -e "$sql" >> $outfile

-      sql="SELECT * FROM replication_applier_status JOIN replication_applier_status_by_coordinator USING(channel_name)\G"
+      sql="SELECT * FROM performance_schema.replication_applier_status JOIN performance_schema.replication_applier_status_by_coordinator USING(channel_name)\G"
      echo -e "\n$sql\n" >> $outfile
      $CMD_MYSQL $EXT_ARGV -e "$sql" >> $outfile
   fi