PT-2037 - option --system-only for pt-stalk

This commit:
1. Moves all changes to the proper place: lib/bash/collect.sh
2. Refactors pt-stalk so it is more modular
3. Places fix for PT-1734 into the right place: lib/bash/collect.sh
4. Fixes tests for PT-1336
This commit is contained in:
svetasmirnova
2022-01-22 21:21:05 +03:00
parent 1f6c95f524
commit 623fdaec91
4 changed files with 460 additions and 409 deletions

View File

@@ -49,10 +49,77 @@ collect() {
local d="$1" # directory to save results in
local p="$2" # prefix for each result file
local mysqld_pid=""
local cnt=$(($OPT_RUN_TIME / $OPT_SLEEP_COLLECT))
if [ ! "$OPT_SYSTEM_ONLY" ]; then
local mysqld_pid=""
local mysql_version=""
local mysql_error_log=""
local tail_error_log_pid=""
local have_lock_waits_table=""
local have_oprofile=""
local mysqladmin_pid=""
local mutex=""
local tcpdump_pid=""
local ps_instrumentation_enabled=""
collect_mysql_data_one
fi
# Grab a few general things first. Background all of these so we can start
# them all up as quickly as possible.
if [ ! "$OPT_MYSQL_ONLY" ]; then
collect_system_data
fi
# This loop gathers data for the rest of the duration, and defines the time
# of the whole job.
log "Loop start: $(date +'TS %s.%N %F %T')"
local start_time=$(date +'%s')
local curr_time=$start_time
local ts="$(date +"TS %s.%N %F %T")"
while [ $((curr_time - start_time)) -lt $OPT_RUN_TIME ]; do
if [ ! "$OPT_MYSQL_ONLY" ]; then
collect_system_data_loop
fi
if [ ! "$OPT_SYSTEM_ONLY" ]; then
collect_mysql_data_loop
fi
curr_time=$(date +'%s')
done
log "Loop end: $(date +'TS %s.%N %F %T')"
if [ ! "$OPT_SYSTEM_ONLY" ]; then
collect_mysql_data_two
fi
# Finally, record what system we collected this data from.
hostname > "$d/$p-hostname"
# Remove "empty" files, i.e. ones that are truly empty or
# just contain timestamp lines. When a command above fails,
# it may leave an empty file. But first wait another --run-time
# seconds for any slow process to finish:
# https://bugs.launchpad.net/percona-toolkit/+bug/1047701
wait_for_subshells $OPT_RUN_TIME
kill_all_subshells
for file in "$d/$p-"*; do
# If there's not at least 1 line that's not a TS,
# then the file is empty.
if [ -z "$(grep -v '^TS ' --max-count 10 "$file")" ]; then
log "Removing empty file $file";
rm "$file"
fi
done
}
collect_mysql_data_one() {
# Get pidof mysqld.
if [ ! "$OPT_MYSQL_ONLY" ]; then
port=$(mysql -ss -e 'SELECT @@port')
port=$($CMD_MYSQL $EXT_ARGV -ss -e 'SELECT @@port')
mysqld_pid=$(lsof -i ":${port}" | grep -i listen | cut -f 3 -d" ")
fi
@@ -84,16 +151,16 @@ collect() {
# Get the major.minor version number. Version 3.23 doesn't matter for our
# purposes, and other releases have x.x.x* version conventions so far.
local mysql_version="$(awk '/^version[^_]/{print substr($2,1,3)}' "$d/$p-variables")"
mysql_version="$(awk '/^version[^_]/{print substr($2,1,3)}' "$d/$p-variables")"
# Is MySQL logging its errors to a file? If so, tail that file.
local mysql_error_log="$(awk '/^log_error/{print $2}' "$d/$p-variables")"
mysql_error_log="$(awk '/^log_error\s/{print $2}' "$d/$p-variables")"
if [ -z "$mysql_error_log" -a "$mysqld_pid" ]; then
log $mysqld_pid
# Try getting it from the open filehandle...
mysql_error_log="$(ls -l /proc/$mysqld_pid/fd | awk '/ 2 ->/{print $NF}')"
fi
local tail_error_log_pid=""
if [ "$mysql_error_log" -a ! "$OPT_MYSQL_ONLY" ]; then
log "The MySQL error log seems to be $mysql_error_log"
tail -f "$mysql_error_log" >"$d/$p-log_error" &
@@ -108,9 +175,9 @@ collect() {
# Get a sample of these right away, so we can get these without interaction
# with the other commands we're about to run.
if [ "${mysql_version}" '>' "5.1" ]; then
local mutex="SHOW ENGINE INNODB MUTEX"
mutex="SHOW ENGINE INNODB MUTEX"
else
local mutex="SHOW MUTEX STATUS"
mutex="SHOW MUTEX STATUS"
fi
innodb_status 1
tokudb_status 1
@@ -120,7 +187,6 @@ collect() {
open_tables >> "$d/$p-opentables1" &
# If TCP dumping is specified, start that on the server's port.
local tcpdump_pid=""
if [ "$CMD_TCPDUMP" -a "$OPT_COLLECT_TCPDUMP" ]; then
local port=$(awk '/^port/{print $2}' "$d/$p-variables")
if [ "$port" ]; then
@@ -131,7 +197,6 @@ collect() {
# Next, start oprofile gathering data during the whole rest of this process.
# The --init should be a no-op if it has already been init-ed.
local have_oprofile=""
if [ "$CMD_OPCONTROL" -a "$OPT_COLLECT_OPROFILE" ]; then
if $CMD_OPCONTROL --init; then
$CMD_OPCONTROL --start --no-vmlinux
@@ -143,38 +208,11 @@ collect() {
local strace_pid=$!
fi
# Grab a few general things first. Background all of these so we can start
# them all up as quickly as possible.
if [ ! "$OPT_MYSQL_ONLY" ]; then
ps -eaF >> "$d/$p-ps" &
top -bn${OPT_RUN_TIME} >> "$d/$p-top" &
[ "$mysqld_pid" ] && _lsof $mysqld_pid >> "$d/$p-lsof" &
if [ "$CMD_SYSCTL" ]; then
$CMD_SYSCTL -a >> "$d/$p-sysctl" &
fi
# collect dmesg events from 60 seconds ago until present
if [ "$CMD_DMESG" ]; then
local UPTIME=`cat /proc/uptime | awk '{ print $1 }'`
local START_TIME=$(echo "$UPTIME 60" | awk '{print ($1 - $2)}')
$CMD_DMESG | perl -ne 'm/\[\s*(\d+)\./; if ($1 > '${START_TIME}') { print }' >> "$d/$p-dmesg" &
fi
local cnt=$(($OPT_RUN_TIME / $OPT_SLEEP_COLLECT))
if [ "$CMD_VMSTAT" ]; then
$CMD_VMSTAT $OPT_SLEEP_COLLECT $cnt >> "$d/$p-vmstat" &
$CMD_VMSTAT $OPT_RUN_TIME 2 >> "$d/$p-vmstat-overall" &
fi
if [ "$CMD_IOSTAT" ]; then
$CMD_IOSTAT -dx $OPT_SLEEP_COLLECT $cnt >> "$d/$p-iostat" &
$CMD_IOSTAT -dx $OPT_RUN_TIME 2 >> "$d/$p-iostat-overall" &
fi
if [ "$CMD_MPSTAT" ]; then
$CMD_MPSTAT -P ALL $OPT_SLEEP_COLLECT $cnt >> "$d/$p-mpstat" &
$CMD_MPSTAT -P ALL $OPT_RUN_TIME 1 >> "$d/$p-mpstat-overall" &
fi
$CMD_MYSQL $EXT_ARGV -e "SHOW TABLES FROM INFORMATION_SCHEMA" \
| grep -i "INNODB_LOCK_WAITS" >/dev/null 2>&1
if [ $? -eq 0 ]; then
have_lock_waits_table="yes"
fi
# Collect multiple snapshots of the status variables. We use
# mysqladmin -c even though it is buggy and won't stop on its
@@ -183,92 +221,110 @@ collect() {
# the database tends to exceed max_connections, so reconnecting
# in the loop tends not to work very well.
$CMD_MYSQLADMIN $EXT_ARGV ext -i$OPT_SLEEP_COLLECT -c$cnt >>"$d/$p-mysqladmin" &
local mysqladmin_pid=$!
fi
mysqladmin_pid=$!
local have_lock_waits_table=""
$CMD_MYSQL $EXT_ARGV -e "SHOW TABLES FROM INFORMATION_SCHEMA" \
| grep -i "INNODB_LOCK_WAITS" >/dev/null 2>&1
if [ $? -eq 0 ]; then
have_lock_waits_table="yes"
fi
# This loop gathers data for the rest of the duration, and defines the time
# of the whole job.
log "Loop start: $(date +'TS %s.%N %F %T')"
local start_time=$(date +'%s')
local curr_time=$start_time
local ps_instrumentation_enabled=$($CMD_MYSQL $EXT_ARGV -e 'SELECT ENABLED FROM performance_schema.setup_instruments WHERE NAME = "transaction";' \
ps_instrumentation_enabled=$($CMD_MYSQL $EXT_ARGV -e 'SELECT ENABLED FROM performance_schema.setup_instruments WHERE NAME = "transaction";' \
| sed "2q;d" | sed 'y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/')
if [ $ps_instrumentation_enabled != "yes" ]; then
log "Performance Schema instrumentation is disabled"
fi
}
while [ $((curr_time - start_time)) -lt $OPT_RUN_TIME ]; do
if [ ! "$OPT_MYSQL_ONLY" ]; then
# We check the disk, but don't exit, because we need to stop jobs if we
# need to exit.
disk_space $d > $d/$p-disk-space
check_disk_space \
$d/$p-disk-space \
"$OPT_DISK_BYTES_FREE" \
"$OPT_DISK_PCT_FREE" \
|| break
collect_system_data() {
ps -eaF >> "$d/$p-ps" &
top -bn${OPT_RUN_TIME} >> "$d/$p-top" &
# Sleep between collect cycles.
# Synchronize ourselves onto the clock tick, so the sleeps are 1-second
sleep $(date +'%s.%N' | awk "{print $OPT_SLEEP_COLLECT - (\$1 % $OPT_SLEEP_COLLECT)}")
local ts="$(date +"TS %s.%N %F %T")"
[ "$mysqld_pid" ] && _lsof $mysqld_pid >> "$d/$p-lsof" &
# #####################################################################
# Collect data for this cycle.
# #####################################################################
if [ -d "/proc" ]; then
if [ -f "/proc/diskstats" ]; then
(echo $ts; cat /proc/diskstats) >> "$d/$p-diskstats" &
fi
if [ -f "/proc/stat" ]; then
(echo $ts; cat /proc/stat) >> "$d/$p-procstat" &
fi
if [ -f "/proc/vmstat" ]; then
(echo $ts; cat /proc/vmstat) >> "$d/$p-procvmstat" &
fi
if [ -f "/proc/meminfo" ]; then
(echo $ts; cat /proc/meminfo) >> "$d/$p-meminfo" &
fi
if [ -f "/proc/slabinfo" ]; then
(echo $ts; cat /proc/slabinfo) >> "$d/$p-slabinfo" &
fi
if [ -f "/proc/interrupts" ]; then
(echo $ts; cat /proc/interrupts) >> "$d/$p-interrupts" &
fi
fi
(echo $ts; df -k) >> "$d/$p-df" &
(echo $ts; netstat -antp) >> "$d/$p-netstat" &
(echo $ts; netstat -s) >> "$d/$p-netstat_s" &
if [ "$CMD_SYSCTL" ]; then
$CMD_SYSCTL -a >> "$d/$p-sysctl" &
fi
# collect dmesg events from 60 seconds ago until present
if [ "$CMD_DMESG" ]; then
local UPTIME=`cat /proc/uptime | awk '{ print $1 }'`
local START_TIME=$(echo "$UPTIME 60" | awk '{print ($1 - $2)}')
$CMD_DMESG | perl -ne 'm/\[\s*(\d+)\./; if ($1 > '${START_TIME}') { print }' >> "$d/$p-dmesg" &
fi
if [ "$CMD_VMSTAT" ]; then
$CMD_VMSTAT $OPT_SLEEP_COLLECT $cnt >> "$d/$p-vmstat" &
$CMD_VMSTAT $OPT_RUN_TIME 2 >> "$d/$p-vmstat-overall" &
fi
if [ "$CMD_IOSTAT" ]; then
$CMD_IOSTAT -dx $OPT_SLEEP_COLLECT $cnt >> "$d/$p-iostat" &
$CMD_IOSTAT -dx $OPT_RUN_TIME 2 >> "$d/$p-iostat-overall" &
fi
if [ "$CMD_MPSTAT" ]; then
$CMD_MPSTAT -P ALL $OPT_SLEEP_COLLECT $cnt >> "$d/$p-mpstat" &
$CMD_MPSTAT -P ALL $OPT_RUN_TIME 1 >> "$d/$p-mpstat-overall" &
fi
}
collect_mysql_data_loop() {
(echo $ts; $CMD_MYSQL $EXT_ARGV -e "SHOW FULL PROCESSLIST\G") \
>> "$d/$p-processlist" &
if [ "$have_lock_waits_table" ]; then
(echo $ts; lock_waits) >>"$d/$p-lock-waits" &
(echo $ts; transactions) >>"$d/$p-transactions" &
fi
if [ "${mysql_version}" '>' "5.6" ] && [ $ps_instrumentation_enabled == "yes" ]; then
ps_locks_transactions "$d/$p-ps-locks-transactions"
fi
if [ "${mysql_version}" '>' "5.6" ]; then
(echo $ts; ps_prepared_statements) >> "$d/$p-prepared-statements" &
fi
slave_status "$d/$p-slave-status" "${mysql_version}"
}
collect_system_data_loop() {
# We check the disk, but don't exit, because we need to stop jobs if we
# need to exit.
disk_space $d > $d/$p-disk-space
check_disk_space \
$d/$p-disk-space \
"$OPT_DISK_BYTES_FREE" \
"$OPT_DISK_PCT_FREE" \
|| break
# Sleep between collect cycles.
# Synchronize ourselves onto the clock tick, so the sleeps are 1-second
sleep $(date +'%s.%N' | awk "{print $OPT_SLEEP_COLLECT - (\$1 % $OPT_SLEEP_COLLECT)}")
ts="$(date +"TS %s.%N %F %T")"
# #####################################################################
# Collect data for this cycle.
# #####################################################################
if [ -d "/proc" ]; then
if [ -f "/proc/diskstats" ]; then
(echo $ts; cat /proc/diskstats) >> "$d/$p-diskstats" &
fi
(echo $ts; $CMD_MYSQL $EXT_ARGV -e "SHOW FULL PROCESSLIST\G") \
>> "$d/$p-processlist" &
if [ "$have_lock_waits_table" ]; then
(echo $ts; lock_waits) >>"$d/$p-lock-waits" &
(echo $ts; transactions) >>"$d/$p-transactions" &
if [ -f "/proc/stat" ]; then
(echo $ts; cat /proc/stat) >> "$d/$p-procstat" &
fi
if [ "${mysql_version}" '>' "5.6" ] && [ $ps_instrumentation_enabled == "yes" ]; then
ps_locks_transactions "$d/$p-ps-locks-transactions"
if [ -f "/proc/vmstat" ]; then
(echo $ts; cat /proc/vmstat) >> "$d/$p-procvmstat" &
fi
if [ "${mysql_version}" '>' "5.6" ]; then
(echo $ts; ps_prepared_statements) >> "$d/$p-prepared-statements" &
if [ -f "/proc/meminfo" ]; then
(echo $ts; cat /proc/meminfo) >> "$d/$p-meminfo" &
fi
if [ -f "/proc/slabinfo" ]; then
(echo $ts; cat /proc/slabinfo) >> "$d/$p-slabinfo" &
fi
if [ -f "/proc/interrupts" ]; then
(echo $ts; cat /proc/interrupts) >> "$d/$p-interrupts" &
fi
fi
(echo $ts; df -k) >> "$d/$p-df" &
(echo $ts; netstat -antp) >> "$d/$p-netstat" &
(echo $ts; netstat -s) >> "$d/$p-netstat_s" &
}
slave_status "$d/$p-slave-status" "${mysql_version}"
curr_time=$(date +'%s')
done
log "Loop end: $(date +'TS %s.%N %F %T')"
collect_mysql_data_two() {
if [ "$have_oprofile" ]; then
$CMD_OPCONTROL --stop
$CMD_OPCONTROL --dump
@@ -316,28 +372,9 @@ collect() {
open_tables >> "$d/$p-opentables2" &
# Kill backgrounded tasks.
kill $mysqladmin_pid
[ "$mysqladmin_pid" ] && kill $mysqladmin_pid
[ "$tail_error_log_pid" ] && kill $tail_error_log_pid
[ "$tcpdump_pid" ] && kill $tcpdump_pid
# Finally, record what system we collected this data from.
hostname > "$d/$p-hostname"
# Remove "empty" files, i.e. ones that are truly empty or
# just contain timestamp lines. When a command above fails,
# it may leave an empty file. But first wait another --run-time
# seconds for any slow process to finish:
# https://bugs.launchpad.net/percona-toolkit/+bug/1047701
wait_for_subshells $OPT_RUN_TIME
kill_all_subshells
for file in "$d/$p-"*; do
# If there's not at least 1 line that's not a TS,
# then the file is empty.
if [ -z "$(grep -v '^TS ' --max-count 10 "$file")" ]; then
log "Removing empty file $file";
rm "$file"
fi
done
}
open_tables() {
@@ -391,8 +428,13 @@ transactions() {
tokudb_status() {
local n=$1
$CMD_MYSQL $EXT_ARGV -e "SHOW ENGINE TOKUDB STATUS\G" \
>> "$d/$p-tokudbstatus$n" || rm -f "$d/$p-tokudbstatus$n"
has_tokudb=`$CMD_MYSQL $EXT_ARGV -e "SHOW ENGINES" | grep -i 'tokudb'`
exit_code=$?
if [ $exit_code -eq 0 ]; then
$CMD_MYSQL $EXT_ARGV -e "SHOW ENGINE TOKUDB STATUS\G" \
>> "$d/$p-tokudbstatus$n" || rm -f "$d/$p-tokudbstatus$n"
fi
}
innodb_status() {
@@ -475,11 +517,11 @@ slave_status() {
echo -e "\n$sql\n" >> $outfile
$CMD_MYSQL $EXT_ARGV -e "$sql" >> $outfile
sql="SELECT * FROM replication_connection_status\G"
sql="SELECT * FROM performance_schema.replication_connection_status\G"
echo -e "\n$sql\n" >> $outfile
$CMD_MYSQL $EXT_ARGV -e "$sql" >> $outfile
sql="SELECT * FROM replication_applier_status JOIN replication_applier_status_by_coordinator USING(channel_name)\G"
sql="SELECT * FROM performance_schema.replication_applier_status JOIN performance_schema.replication_applier_status_by_coordinator USING(channel_name)\G"
echo -e "\n$sql\n" >> $outfile
$CMD_MYSQL $EXT_ARGV -e "$sql" >> $outfile
fi