Files
percona-toolkit/bin/pt-stalk
2012-01-24 12:15:29 -07:00

1568 lines
49 KiB
Bash
Executable File

#!/usr/bin/env bash
# This program is part of Percona Toolkit: http://www.percona.com/software/
# See "COPYRIGHT, LICENSE, AND WARRANTY" at the end of this file for legal
# notices and disclaimers.
# Treat the expansion of any unset variable as a fatal error.
set -u
# ###########################################################################
# log_warn_die package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/log_warn_die.sh
# t/lib/bash/log_warn_die.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
# Each inlined package repeats this; it is harmless when already enabled.
set -u
# Exit status the tool finishes with. warn() raises it to 1.
EXIT_STATUS=0
# log MESSAGE...
# Print the arguments on stdout, prefixed with the current local time
# formatted as YYYY_MM_DD_HH_MM_SS.  The timestamp is stored in the global
# variable TS as a side effect.
log() {
  TS=$(date +%Y_%m_%d_%H_%M_%S)
  printf '%s\n' "$TS $*"
}
# warn MESSAGE...
# Send a timestamped message (via log) to stderr and flag the run as
# failed by setting the global EXIT_STATUS to 1.
warn() {
  {
    log "$*"
  } 1>&2
  EXIT_STATUS=1
}
# die MESSAGE...
# Report a fatal problem through warn, then terminate the shell with
# exit status 1.
die() {
  warn "$*"
  exit 1
}
# ###########################################################################
# End log_warn_die package
# ###########################################################################
# ###########################################################################
# parse_options package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/parse_options.sh
# t/lib/bash/parse_options.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
set -u
# State shared by the option-parsing functions below.  parse_options()
# resets all of these each time it runs.
ARGV="" # Non-option args (probably input files)
EXT_ARGV="" # Everything after -- (args for an external command)
HAVE_EXT_ARGV="" # Got --, everything else is put into EXT_ARGV
OPT_ERRS=0 # How many command line option errors
OPT_VERSION="" # If --version was specified
OPT_HELP="" # If --help was specified
# TMPDIR is only assigned later in this file, so under "set -u" a bare
# $TMPDIR here aborts the script whenever the environment does not
# provide one.  Fall back to an empty value.
PO_DIR="${TMPDIR:-}/po" # Directory with program option spec files
# usage FILE
# Print the "Usage: ..." line found in FILE (the tool's own source), a blank
# line, and a pointer to the man page / perldoc.  Reads the global TOOL.
usage() {
  local file="$1"
  local usage_line
  usage_line=$(grep '^Usage: ' "$file")
  # Quoted on purpose: the usage text contains "[OPTIONS]", which the shell
  # would otherwise treat as a glob and replace with a matching file name
  # from the current directory.
  echo "$usage_line"
  echo
  echo "For more information, 'man $TOOL' or 'perldoc $file'."
}
# usage_or_errors FILE
# Decide whether the program should stop after option parsing.
#   --version given : print the "pt-tool N.N" line from FILE, return 1
#   --help given    : print usage plus every option and its description
#                     (read from the spec files in $TMPDIR/po), return 1
#   option errors   : print usage, return 1
#   otherwise       : print nothing, return 0
# Reads the globals OPT_VERSION, OPT_HELP, OPT_ERRS and TMPDIR.
usage_or_errors() {
  local file="$1"

  if [ "$OPT_VERSION" ]; then
    local version
    version=$(grep '^pt-[^ ]\+ [0-9]' "$file")
    echo "$version"
    return 1
  fi

  if [ "$OPT_HELP" ]; then
    usage "$file"
    echo
    echo "Command line options:"
    echo
    local spec_path opt desc
    # Glob instead of parsing ls so a tmpdir containing whitespace works.
    for spec_path in "$TMPDIR/po"/*; do
      # An unmatched glob stays literal; skip it.
      [ -e "$spec_path" ] || continue
      opt="${spec_path##*/}"
      desc=$(sed -n -e 's/^desc://p' "$spec_path")
      echo "--$opt"
      echo " $desc"
      echo
    done
    return 1
  fi

  if [ "$OPT_ERRS" -gt 0 ]; then
    echo
    usage "$file"
    return 1
  fi

  return 0
}
# parse_options FILE [ARGS...]
# Entry point of the option parser.
#   1. Resets the parser globals and (re)creates an empty $TMPDIR/po.
#   2. Turns the POD in FILE into option spec files (_parse_pod) and
#      creates the OPT_* variables with their defaults (_eval_po).
#   3. Reads config files: if the first two ARGS are "--config LIST", the
#      comma-separated LIST; otherwise the four standard locations.
#   4. Parses the remaining ARGS (_parse_command_line), which therefore
#      override config file values.
# Exits 1 if the spec directory cannot be created or emptied.
# Reads the globals TMPDIR, TOOL and HOME.
parse_options() {
  local file="$1"
  shift

  ARGV=""
  EXT_ARGV=""
  HAVE_EXT_ARGV=""
  OPT_ERRS=0
  OPT_VERSION=""
  OPT_HELP=""
  PO_DIR="$TMPDIR/po"

  if [ ! -d "$PO_DIR" ]; then
    if ! mkdir "$PO_DIR"; then
      echo "Cannot mkdir $PO_DIR" >&2
      exit 1
    fi
  fi

  if ! rm -rf "$PO_DIR"/*; then
    echo "Cannot rm -rf $PO_DIR/*" >&2
    exit 1
  fi

  _parse_pod "$file" # Parse POD into program option (po) spec files
  _eval_po # Eval po into existence with default values

  if [ $# -ge 2 ] && [ "$1" = "--config" ]; then
    shift # --config
    local user_config_files="$1"
    shift # that ^
    # Split the list on commas with parameter expansion rather than by
    # changing IFS: the config parser relies on default word splitting,
    # and a comma IFS left active during the call would mangle any option
    # value that contains a comma.
    local user_config_file
    local remaining="$user_config_files"
    while [ -n "$remaining" ]; do
      user_config_file="${remaining%%,*}"
      if [ "$remaining" = "$user_config_file" ]; then
        remaining=""
      else
        remaining="${remaining#*,}"
      fi
      _parse_config_files "$user_config_file"
    done
  else
    _parse_config_files "/etc/percona-toolkit/percona-toolkit.conf" "/etc/percona-toolkit/$TOOL.conf" "$HOME/.percona-toolkit.conf" "$HOME/.$TOOL.conf"
  fi

  _parse_command_line "$@"
}
# _parse_pod FILE
# Extract the option specifications from the POD in FILE and write one
# spec file per option into $PO_DIR (passed to perl via the environment).
#
# The embedded Perl program reads its input in paragraph mode.  It skips
# everything up to the paragraph starting with "=head1 OPTIONS", then
# processes paragraphs until the next "=head1".  For each "=item --NAME"
# paragraph it creates $PO_DIR/NAME containing:
#   long:NAME
#   ATTRIB:VALUE   one line per "attrib: value" pair, when the paragraph
#                  after the item is a "; "-separated attribute list
#   desc:TEXT.     the following paragraph up to its first "." or "?"
# The Perl program dies if a spec file cannot be opened for writing.
_parse_pod() {
local file="$1"
cat "$file" | PO_DIR="$PO_DIR" perl -ne '
BEGIN { $/ = ""; }
next unless $_ =~ m/^=head1 OPTIONS/;
while ( defined(my $para = <>) ) {
last if $para =~ m/^=head1/;
chomp;
if ( $para =~ m/^=item --(\S+)/ ) {
my $opt = $1;
my $file = "$ENV{PO_DIR}/$opt";
open my $opt_fh, ">", $file or die "Cannot open $file: $!";
print $opt_fh "long:$opt\n";
$para = <>;
chomp;
if ( $para =~ m/^[a-z ]+:/ ) {
map {
chomp;
my ($attrib, $val) = split(/: /, $_);
print $opt_fh "$attrib:$val\n";
} split(/; /, $para);
$para = <>;
chomp;
}
my ($desc) = $para =~ m/^([^?.]+)/;
print $opt_fh "desc:$desc.\n";
close $opt_fh;
}
}
last;
'
}
# _eval_po
# Create one global OPT_<NAME> variable per option spec file in $PO_DIR and
# give it the spec's default value (empty when there is none).  <NAME> is
# the option's long name upper-cased with "-" replaced by "_".
#
# Exits 1 when a spec file contains an unknown attribute, lacks a "long"
# attribute, or declares "negatable:yes" without "default:yes".
_eval_po() {
  local spec_path
  # Glob rather than parse ls so unusual directory names are handled.
  for spec_path in "$PO_DIR"/*; do
    [ -e "$spec_path" ] || continue # unmatched glob or vanished file
    local opt_spec="${spec_path##*/}"
    local opt=""
    local default_val=""
    local neg=0
    local line key val

    while read -r line; do
      key="${line%%:*}"
      # Keep everything after the first colon so values that themselves
      # contain ":" are not truncated.
      val="${line#*:}"
      case "$key" in
        long)
          opt=$(printf '%s\n' "$val" | sed 's/-/_/g' | tr '[:lower:]' '[:upper:]')
          ;;
        default)
          default_val="$val"
          ;;
        "short form")
          ;;
        type)
          ;;
        desc)
          ;;
        negatable)
          if [ "$val" = "yes" ]; then
            neg=1
          fi
          ;;
        *)
          echo "Invalid attribute in $PO_DIR/$opt_spec: $line" >&2
          exit 1
          ;;
      esac
    done < "$spec_path"

    if [ -z "$opt" ]; then
      echo "No long attribute in option spec $PO_DIR/$opt_spec" >&2
      exit 1
    fi

    if [ $neg -eq 1 ]; then
      if [ -z "$default_val" ] || [ "$default_val" != "yes" ]; then
        echo "Option $opt_spec is negatable but not default: yes" >&2
        exit 1
      fi
    fi

    # Expand the value inside the eval, in double quotes, so whitespace
    # and shell metacharacters in a default are assigned literally instead
    # of being re-parsed as shell code.
    eval "OPT_${opt}=\"\$default_val\""
  done
}
# _parse_config_files FILE...
# Feed every option line of each existing FILE to _parse_command_line.
# Blank lines and lines whose first non-blank character is "#" are
# skipped.  Each remaining line has leading/trailing blanks, blanks around
# the first "=", and any trailing "# comment" removed, and is prefixed
# with "--" unless a previous line of the same file switched the parser
# into external-argument mode (HAVE_EXT_ARGV).  HAVE_EXT_ARGV is cleared
# after each file.
_parse_config_files() {
  local config_file config_opt
  for config_file in "$@"; do
    test -f "$config_file" || continue
    # -r keeps backslashes literal; the "|| [ -n ... ]" also processes a
    # final line that has no trailing newline.
    while read -r config_opt || [ -n "$config_opt" ]; do
      echo "$config_opt" | grep '^[ ]*[^#]' >/dev/null 2>&1 || continue
      # The second expression must anchor at end of line with a bare "$".
      # Escaped as "\$" it matches a literal dollar sign and silently
      # deletes the first "$" from the option value.
      config_opt="$(echo "$config_opt" | sed -e 's/^[ ]*//' -e 's/[ ]*$//' -e 's/[ ]*=[ ]*/=/' -e 's/[ ]*#.*$//')"
      [ "$config_opt" = "" ] && continue
      if ! [ "$HAVE_EXT_ARGV" ]; then
        config_opt="--$config_opt"
      fi
      _parse_command_line "$config_opt"
    done < "$config_file"
    HAVE_EXT_ARGV="" # reset for each file
  done
}
# _parse_command_line ARG...
# Walk the arguments and set the matching OPT_* globals.
#   "--" (or "----", which a config-file line "--" becomes after it is
#        prefixed) switches to external-argument mode: every later
#        argument is appended to EXT_ARGV.
#   Arguments not starting with "-" are appended to ARGV.
#   Options are looked up in the spec files under $TMPDIR/po, first by
#        long name, then by their "short form" attribute.
#   Options whose spec has a "type" take a value, either as --opt=value or
#        as the next argument; other options are set to "yes", or to ""
#        when given as --no-opt.
# Every problem increments OPT_ERRS and prints a message on stderr;
# parsing then continues with the next argument.
_parse_command_line() {
local opt=""
local val=""
local next_opt_is_val=""
local opt_is_ok=""
local opt_is_negated=""
local real_opt=""
local required_arg=""
for opt in "$@"; do
if [ "$opt" = "--" -o "$opt" = "----" ]; then
HAVE_EXT_ARGV=1
continue
fi
if [ "$HAVE_EXT_ARGV" ]; then
if [ "$EXT_ARGV" ]; then
EXT_ARGV="$EXT_ARGV $opt"
else
EXT_ARGV="$opt"
fi
continue
fi
# The previous option still needs its value: this argument is it, unless
# it looks like another option (starts with "-").
if [ "$next_opt_is_val" ]; then
next_opt_is_val=""
if [ $# -eq 0 ] || [ $(expr "$opt" : "-") -eq 1 ]; then
OPT_ERRS=$(($OPT_ERRS + 1))
echo "$real_opt requires a $required_arg argument" >&2
continue
fi
val="$opt"
opt_is_ok=1
else
# Not an option at all: collect it as a plain argument.
if [ $(expr "$opt" : "-") -eq 0 ]; then
if [ -z "$ARGV" ]; then
ARGV="$opt"
else
ARGV="$ARGV $opt"
fi
continue
fi
# Remember the option as typed for error messages, then strip the
# leading dashes (and "no-" for negated options).
real_opt="$opt"
if $(echo $opt | grep '^--no-' >/dev/null); then
opt_is_negated=1
opt=$(echo $opt | sed 's/^--no-//')
else
opt_is_negated=""
opt=$(echo $opt | sed 's/^-*//')
fi
# NOTE(review): awk -F= keeps only the text between the first and second
# "=", so a value that itself contains "=" is cut short.
if $(echo $opt | grep '^[a-z-][a-z-]*=' >/dev/null 2>&1); then
val="$(echo $opt | awk -F= '{print $2}')"
opt="$(echo $opt | awk -F= '{print $1}')"
fi
# NOTE(review): "spec" is not declared local, so it is left behind as a
# global after this function returns.
if [ -f "$TMPDIR/po/$opt" ]; then
spec="$TMPDIR/po/$opt"
else
spec=$(grep "^short form:-$opt\$" "$TMPDIR"/po/* | cut -d ':' -f 1)
if [ -z "$spec" ]; then
OPT_ERRS=$(($OPT_ERRS + 1))
echo "Unknown option: $real_opt" >&2
continue
fi
fi
# A "type:" attribute in the spec means the option takes a value.
required_arg=$(cat "$spec" | awk -F: '/^type:/{print $2}')
if [ "$required_arg" ]; then
if [ "$val" ]; then
opt_is_ok=1
else
next_opt_is_val=1
fi
else
if [ "$val" ]; then
OPT_ERRS=$(($OPT_ERRS + 1))
echo "Option $real_opt does not take a value" >&2
continue
fi
if [ "$opt_is_negated" ]; then
val=""
else
val="yes"
fi
opt_is_ok=1
fi
fi
# Option and value are complete: assign OPT_<LONG_NAME> and reset the
# per-option state for the next argument.
# NOTE(review): the value is wrapped in single quotes for eval, so a value
# containing a single quote would break the assignment.
if [ "$opt_is_ok" ]; then
opt=$(cat "$spec" | grep '^long:' | cut -d':' -f2 | sed 's/-/_/g' | tr [:lower:] [:upper:])
eval "OPT_$opt"="'$val'"
opt=""
val=""
next_opt_is_val=""
opt_is_ok=""
opt_is_negated=""
real_opt=""
required_arg=""
fi
done
}
# ###########################################################################
# End parse_options package
# ###########################################################################
# ###########################################################################
# tmpdir package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/tmpdir.sh
# t/lib/bash/tmpdir.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
set -u
# Path of the tool's temporary directory.  mk_tmpdir sets it and
# rm_tmpdir clears it again.
TMPDIR=""
# mk_tmpdir [DIR]
# Set the global TMPDIR to a usable temporary directory.
#   With DIR  : use DIR, creating it when it does not exist yet.
#   Without   : create a fresh private directory with mktemp, named
#               /tmp/<script name>.<pid>.XXXXX.
# Calls die when the directory cannot be created.
mk_tmpdir() {
  local dir="${1:-""}"

  if [ -n "$dir" ]; then
    if [ ! -d "$dir" ]; then
      # Quoted so a path containing whitespace creates one directory, not
      # several unrelated ones.
      mkdir "$dir" || die "Cannot make tmpdir $dir"
    fi
    TMPDIR="$dir"
  else
    local tool="${0##*/}"
    local pid="$$"
    TMPDIR=$(mktemp -d "/tmp/${tool}.${pid}.XXXXX") \
      || die "Cannot make secure tmpdir"
  fi
}
# rm_tmpdir
# Recursively delete the directory named by the global TMPDIR, if it is
# set and exists, then reset TMPDIR to the empty string.
rm_tmpdir() {
  [ -z "$TMPDIR" ] || [ ! -d "$TMPDIR" ] || rm -rf "$TMPDIR"
  TMPDIR=""
}
# ###########################################################################
# End tmpdir package
# ###########################################################################
# ###########################################################################
# alt_cmds package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/alt_cmds.sh
# t/lib/bash/alt_cmds.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
# Each inlined package repeats this; it is harmless when already enabled.
set -u
# _seq N
# Print the integers 1 through N, one per line (nothing when N < 1).
# Implemented with awk, so it does not depend on a seq binary.
_seq() {
  local upper="$1"
  awk "BEGIN { for (n = 1; n <= $upper; n++) print n }"
}
# ###########################################################################
# End alt_cmds package
# ###########################################################################
# ###########################################################################
# safeguards package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/safeguards.sh
# t/lib/bash/safeguards.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
# Each inlined package repeats this; it is harmless when already enabled.
set -u
# disk_space [PATH]
# Print the df report, in POSIX format with 1024-byte blocks, for the
# filesystem that holds PATH.  PATH defaults to the current directory.
disk_space() {
  local target="$PWD"
  if [ -n "${1:-}" ]; then
    target="$1"
  fi
  df -P -k "$target"
}
# check_disk_space FILE [MB] [PCT] [MB_MARGIN]
# Decide from a saved df report whether enough disk space is left.
#   FILE       output of disk_space (df -P -k) for a single filesystem
#   MB         minimum free space wanted, in megabytes   (default 0)
#   PCT        minimum free space wanted, in percent     (default 0)
#   MB_MARGIN  megabytes assumed to be used up already   (default 0)
# Returns 0 when free space is above both limits.  Otherwise, or when FILE
# holds no usable df data line, it calls warn and returns 1.
check_disk_space() {
  local file="$1"
  local mb="${2:-0}"
  local pc="${3:-0}"
  local mb_margin="${4:-0}"

  local kb=$(($mb * 1024))
  local kb_margin=$(($mb_margin * 1024))

  # Recognise the data line by its "NN%" capacity column instead of by a
  # device name starting with "/": NFS, overlay, tmpfs and similar
  # filesystems have other device names, and an empty result would turn
  # the arithmetic below into a syntax error.
  local kb_used="" kb_free="" pc_used=""
  read -r kb_used kb_free pc_used <<< "$(awk '$5 ~ /^[0-9]+%$/ { sub(/%/, "", $5); print $3, $4, $5; exit }' "$file")"

  if [ -z "$kb_used" ] || [ -z "$kb_free" ] || [ -z "$pc_used" ]; then
    warn "Cannot determine free disk space from $file"
    return 1
  fi

  if [ "$kb_margin" -gt "0" ]; then
    # Pretend the margin is already used and recompute the percentage.
    local kb_total=$(($kb_used + $kb_free))
    kb_used=$(($kb_used + $kb_margin))
    kb_free=$(($kb_free - $kb_margin))
    pc_used=$(awk "BEGIN { printf(\"%d\", $kb_used/$kb_total * 100) }")
  fi

  local pc_free=$((100 - $pc_used))

  if [ "$kb_free" -le "$kb" ] || [ "$pc_free" -le "$pc" ]; then
    warn "Not enough free disk space: ${pc_free}% free, ${kb_free} KB free; wanted more than ${pc}% free or ${kb} KB free"
    return 1
  fi

  return 0
}
# ###########################################################################
# End safeguards package
# ###########################################################################
# ###########################################################################
# daemon package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/daemon.sh
# t/lib/bash/daemon.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
# Each inlined package repeats this; it is harmless when already enabled.
set -u
# make_pid_file FILE PID
# Write PID to FILE, refusing to take over from a live process.
#   FILE exists but is empty            -> die
#   FILE names a running process        -> die
#   FILE names a process that is gone   -> print a notice and overwrite
# Also dies when FILE cannot be written.
make_pid_file() {
  local file="$1"
  local pid="$2"

  if [ -f "$file" ]; then
    local old_pid=$(cat "$file")
    [ -n "$old_pid" ] || die "PID file $file already exists but it is empty"
    # Signal 0 only tests whether the process can be signalled.
    if kill -0 $old_pid 2>/dev/null; then
      die "PID file $file already exists and its PID ($old_pid) is running"
    fi
    echo "Overwriting PID file $file because its PID ($old_pid)" \
      "is not running"
  fi

  echo "$pid" > "$file" || die "Cannot create or write PID file $file"
}
# remove_pid_file FILE
# Delete FILE when it exists as a regular file; otherwise do nothing and
# succeed.
remove_pid_file() {
  local file="$1"
  [ -f "$file" ] || return 0
  rm "$file"
}
# ###########################################################################
# End daemon package
# ###########################################################################
# ###########################################################################
# collect package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/bash/collect.sh
# t/lib/bash/collect.sh
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
set -u
# Locations of the external programs used by collect(); each variable is
# empty when the program is not installed.  "command -v" is the shell's
# own lookup, so this no longer depends on a which(1) binary being
# present, nor on what that binary prints when it finds nothing.
CMD_GDB="$(command -v gdb 2>/dev/null)"
CMD_IOSTAT="$(command -v iostat 2>/dev/null)"
CMD_MPSTAT="$(command -v mpstat 2>/dev/null)"
CMD_MYSQL="$(command -v mysql 2>/dev/null)"
CMD_MYSQLADMIN="$(command -v mysqladmin 2>/dev/null)"
CMD_OPCONTROL="$(command -v opcontrol 2>/dev/null)"
CMD_OPREPORT="$(command -v opreport 2>/dev/null)"
CMD_PMAP="$(command -v pmap 2>/dev/null)"
CMD_STRACE="$(command -v strace 2>/dev/null)"
CMD_TCPDUMP="$(command -v tcpdump 2>/dev/null)"
CMD_VMSTAT="$(command -v vmstat 2>/dev/null)"
# collect DIR PREFIX
# Gather diagnostic data about the system and the running mysqld for
# $OPT_RUN_TIME seconds.  Every result is written to DIR/PREFIX-<name>.
#
# One-off snapshots are taken before and after a loop that samples
# /proc files, df, netstat and the MySQL processlist once per second.
# gdb, tcpdump, oprofile and strace are only used when the matching
# OPT_COLLECT_* option is "yes" and the program was found.  The loop stops
# early when check_disk_space reports too little free space.
#
# Reads the globals CMD_*, EXT_ARGV, OPT_COLLECT_GDB, OPT_COLLECT_TCPDUMP,
# OPT_COLLECT_OPROFILE, OPT_COLLECT_STRACE, OPT_INTERVAL, OPT_RUN_TIME,
# OPT_DISK_BYTE_LIMIT and OPT_DISK_PCT_LIMIT.
collect() {
  local d="$1" # directory to save results in
  local p="$2" # prefix for each result file
  # Stays empty unless strace is really attached below; the cleanup after
  # the sampling loop only signals strace when this is set.
  local strace_pid=""

  # Find mysqld's PID: pidof first, then pgrep, then a ps/grep scan.
  local mysqld_pid=$(pidof -s mysqld)
  if [ -z "$mysqld_pid" ]; then
    mysqld_pid=$(pgrep -o -x mysqld)
  fi
  if [ -z "$mysqld_pid" ]; then
    mysqld_pid=$(ps -eaf | grep 'mysql[d]' | grep -v mysqld_safe | awk '{print $2}' | head -n1)
  fi

  # Memory map of mysqld, with -x when this pmap advertises that flag.
  if [ "$CMD_PMAP" -a "$mysqld_pid" ]; then
    if $CMD_PMAP --help 2>&1 | grep -- -x >/dev/null 2>&1 ; then
      $CMD_PMAP -x $mysqld_pid > "$d/$p-pmap"
    else
      $CMD_PMAP $mysqld_pid > "$d/$p-pmap"
    fi
  fi

  # Stack traces of all mysqld threads.
  if [ "$CMD_GDB" -a "$OPT_COLLECT_GDB" = "yes" -a "$mysqld_pid" ]; then
    $CMD_GDB \
      -ex "set pagination 0" \
      -ex "thread apply all bt" \
      --batch -p $mysqld_pid \
      >> "$d/$p-stacktrace"
  fi

  # Server variables are fetched in the background; the short sleep gives
  # the query time to finish before the file is read back.
  $CMD_MYSQL $EXT_ARGV -e 'SHOW GLOBAL VARIABLES' >> "$d/$p-variables" 2>&1 &
  sleep .2

  local mysql_version="$(awk '/^version[^_]/{print substr($2,1,3)}' "$d/$p-variables")"

  # Locate the error log: from the variables, else from mysqld's fd 2.
  local mysql_error_log="$(awk '/log_error/{print $2}' "$d/$p-variables")"
  if [ -z "$mysql_error_log" -a "$mysqld_pid" ]; then
    mysql_error_log="$(ls -l /proc/$mysqld_pid/fd | awk '/ 2 ->/{print $NF}')"
  fi

  # Follow the error log while collecting, and run "mysqladmin debug"
  # once the tail is in place.
  local tail_error_log_pid=""
  if [ "$mysql_error_log" ]; then
    echo "The MySQL error log seems to be ${mysql_error_log}"
    tail -f "$mysql_error_log" >"$d/$p-log_error" 2>&1 &
    tail_error_log_pid=$!
    $CMD_MYSQLADMIN $EXT_ARGV debug
  else
    echo "Could not find the MySQL error log"
  fi

  # First InnoDB status, mutex status and open tables snapshots.  The
  # mutex statement depends on the server version.
  local innostat="SHOW /*!40100 ENGINE*/ INNODB STATUS\G"
  if [ "${mysql_version}" '>' "5.1" ]; then
    local mutex="SHOW ENGINE INNODB MUTEX"
  else
    local mutex="SHOW MUTEX STATUS"
  fi
  $CMD_MYSQL $EXT_ARGV -e "$innostat" >> "$d/$p-innodbstatus1" 2>&1 &
  $CMD_MYSQL $EXT_ARGV -e "$mutex" >> "$d/$p-mutex-status1" 2>&1 &
  open_tables >> "$d/$p-opentables1" 2>&1 &

  # Capture traffic on the MySQL port.
  local tcpdump_pid=""
  if [ "$CMD_TCPDUMP" -a "$OPT_COLLECT_TCPDUMP" = "yes" ]; then
    local port=$(awk '/^port/{print $2}' "$d/$p-variables")
    if [ "$port" ]; then
      $CMD_TCPDUMP -i any -s 4096 -w "$d/$p-tcpdump" port ${port} &
      tcpdump_pid=$!
    fi
  fi

  # oprofile takes precedence; strace is only considered when oprofile
  # was not requested or is not installed.
  local have_oprofile="no"
  if [ "$CMD_OPCONTROL" -a "$OPT_COLLECT_OPROFILE" = "yes" ]; then
    if $CMD_OPCONTROL --init; then
      $CMD_OPCONTROL --start --no-vmlinux
      have_oprofile="yes"
    fi
  elif [ "$CMD_STRACE" -a "$OPT_COLLECT_STRACE" = "yes" ]; then
    # Write next to all other results.  The previous target,
    # "${DEST}/$d-strace", used a variable that is never set (fatal under
    # "set -u") and put the directory where the prefix belongs.
    $CMD_STRACE -T -s 0 -f -p $mysqld_pid > "$d/$p-strace" 2>&1 &
    strace_pid=$!
  fi

  # One-off system snapshots, all in the background.
  ps -eaf >> "$d/$p-ps" 2>&1 &
  sysctl -a >> "$d/$p-sysctl" 2>&1 &
  top -bn1 >> "$d/$p-top" 2>&1 &
  lsof -nP -p $mysqld_pid -bw >> "$d/$p-lsof" 2>&1 &

  # Samplers that run on their own for a fixed number of reports.
  if [ "$CMD_VMSTAT" ]; then
    $CMD_VMSTAT 1 $OPT_INTERVAL >> "$d/$p-vmstat" 2>&1 &
    $CMD_VMSTAT $OPT_INTERVAL 2 >> "$d/$p-vmstat-overall" 2>&1 &
  fi
  if [ "$CMD_IOSTAT" ]; then
    $CMD_IOSTAT -dx 1 $OPT_INTERVAL >> "$d/$p-iostat" 2>&1 &
    $CMD_IOSTAT -dx $OPT_INTERVAL 2 >> "$d/$p-iostat-overall" 2>&1 &
  fi
  if [ "$CMD_MPSTAT" ]; then
    $CMD_MPSTAT -P ALL 1 $OPT_INTERVAL >> "$d/$p-mpstat" 2>&1 &
    $CMD_MPSTAT -P ALL $OPT_INTERVAL 1 >> "$d/$p-mpstat-overall" 2>&1 &
  fi

  $CMD_MYSQLADMIN $EXT_ARGV ext -i1 -c$OPT_RUN_TIME >>"$d/$p-mysqladmin" 2>&1 &
  local mysqladmin_pid=$!

  # lock_waits needs INFORMATION_SCHEMA.INNODB_LOCK_WAITS; check once.
  local have_lock_waits_table=0
  $CMD_MYSQL $EXT_ARGV -e "SHOW TABLES FROM INFORMATION_SCHEMA" \
    | grep -i "INNODB_LOCK_WAITS" >/dev/null 2>&1
  if [ $? -eq 0 ]; then
    have_lock_waits_table=1
  fi

  # Sampling loop: one pass per second for $OPT_RUN_TIME passes.
  echo "Loop start: $(date +'TS %s.%N %F %T')"
  for loopno in $(_seq $OPT_RUN_TIME); do
    # Stop collecting as soon as free disk space drops below the limits.
    disk_space $d > $d/$p-disk-space
    check_disk_space \
      $d/$p-disk-space \
      "$OPT_DISK_BYTE_LIMIT" \
      "$OPT_DISK_PCT_LIMIT" \
      || break

    # Sleep for the remainder of the current second so samples are taken
    # on second boundaries.
    sleep $(date +%s.%N | awk '{print 1 - ($1 % 1)}')
    local ts="$(date +"TS %s.%N %F %T")"

    # Each sample is followed by the timestamp line.
    if [ -d "/proc" ]; then
      if [ -f "/proc/diskstats" ]; then
        (cat /proc/diskstats 2>&1; echo $ts) >> "$d/$p-diskstats" &
      fi
      if [ -f "/proc/stat" ]; then
        (cat /proc/stat 2>&1; echo $ts) >> "$d/$p-procstat" &
      fi
      if [ -f "/proc/vmstat" ]; then
        (cat /proc/vmstat 2>&1; echo $ts) >> "$d/$p-procvmstat" &
      fi
      if [ -f "/proc/meminfo" ]; then
        (cat /proc/meminfo 2>&1; echo $ts) >> "$d/$p-meminfo" &
      fi
      if [ -f "/proc/slabinfo" ]; then
        (cat /proc/slabinfo 2>&1; echo $ts) >> "$d/$p-slabinfo" &
      fi
      if [ -f "/proc/interrupts" ]; then
        (cat /proc/interrupts 2>&1; echo $ts) >> "$d/$p-interrupts" &
      fi
    fi
    (df -h 2>&1; echo $ts) >> "$d/$p-df" &
    (netstat -antp 2>&1; echo $ts) >> "$d/$p-netstat" &
    (netstat -s 2>&1; echo $ts) >> "$d/$p-netstat_s" &

    # The MySQL queries run in the foreground.
    ($CMD_MYSQL $EXT_ARGV -e "SHOW FULL PROCESSLIST\G" 2>&1; echo $ts) \
      >> "$d/$p-processlist"
    if [ $have_lock_waits_table -eq 1 ]; then
      (lock_waits 2>&1; echo $ts) >>"$d/$p-lock-waits"
    fi
  done
  echo "Loop end: $(date +'TS %s.%N %F %T')"

  if [ "$have_oprofile" = "yes" ]; then
    # Stop profiling, save the session and try to produce a report.
    $CMD_OPCONTROL --stop
    $CMD_OPCONTROL --dump
    kill $(pidof oprofiled); # TODO: what if system doesn't have pidof?
    $CMD_OPCONTROL --save=pt_collect_$p

    local mysqld_path=$(which mysqld)
    if [ "$mysqld_path" -a -f "$mysqld_path" ]; then
      $CMD_OPREPORT \
        --demangle=smart \
        --symbols \
        --merge tgid \
        session:pt_collect_$p \
        "$mysqld_path" \
        > "$d/$p-opreport"
    else
      echo "oprofile data saved to pt_collect_$p; you should be able" \
        "to get a report by running something like 'opreport" \
        "--demangle=smart --symbols --merge tgid session:pt_collect_$p" \
        "/path/to/mysqld'" \
        > "$d/$p-opreport"
    fi
  elif [ "$strace_pid" ]; then
    # Interrupt strace, then terminate it if it is still around.
    kill -s 2 $strace_pid
    sleep 1
    kill -s 15 $strace_pid
    # NOTE(review): presumably SIGCONT (18 on Linux), to resume mysqld in
    # case strace left it stopped -- confirm.
    kill -s 18 $mysqld_pid
  fi

  # Second InnoDB status, mutex status and open tables snapshots.
  $CMD_MYSQL $EXT_ARGV -e "$innostat" >> "$d/$p-innodbstatus2" 2>&1 &
  $CMD_MYSQL $EXT_ARGV -e "$mutex" >> "$d/$p-mutex-status2" 2>&1 &
  open_tables >> "$d/$p-opentables2" 2>&1 &

  # Stop the long-running background collectors.
  kill $mysqladmin_pid
  [ "$tail_error_log_pid" ] && kill $tail_error_log_pid
  [ "$tcpdump_pid" ] && kill $tcpdump_pid

  hostname > "$d/$p-hostname"
}
# open_tables
# Print the output of SHOW OPEN TABLES, but only when the server reports
# at most 1000 open tables; otherwise print a one-line explanation.
# The listing itself runs in the background.
# Reads the globals CMD_MYSQLADMIN, CMD_MYSQL and EXT_ARGV.
open_tables() {
  local num_open
  num_open=$($CMD_MYSQLADMIN $EXT_ARGV ext | grep "Open_tables" | awk '{print $4}')

  if [ -z "$num_open" ]; then
    # Without this branch an empty value made the numeric test fail with
    # a shell error and was then reported as "Too many open tables".
    echo "Could not determine the number of open tables"
  elif [ "$num_open" -le 1000 ]; then
    $CMD_MYSQL $EXT_ARGV -e 'SHOW OPEN TABLES' 2>&1 &
  else
    echo "Too many open tables: $num_open"
  fi
}
# lock_waits
# Run two queries against INFORMATION_SCHEMA and print their results:
# first a summary of blocking threads (waiter count and longest wait per
# blocker), then one row per waiting transaction with the transaction
# that blocks it.  Reads the globals CMD_MYSQL and EXT_ARGV.
# The continuation lines of both statements start in column 0 because
# they are part of the SQL text sent to the server.
lock_waits() {
  local blockers_sql="SELECT
CONCAT('thread ', b.trx_mysql_thread_id, ' from ', p.host) AS who_blocks,
IF(p.command = \"Sleep\", p.time, 0) AS idle_in_trx,
MAX(TIMESTAMPDIFF(SECOND, r.trx_wait_started, CURRENT_TIMESTAMP)) AS max_wait_time,
COUNT(*) AS num_waiters
FROM INFORMATION_SCHEMA.INNODB_LOCK_WAITS AS w
INNER JOIN INFORMATION_SCHEMA.INNODB_TRX AS b ON b.trx_id = w.blocking_trx_id
INNER JOIN INFORMATION_SCHEMA.INNODB_TRX AS r ON r.trx_id = w.requesting_trx_id
LEFT JOIN INFORMATION_SCHEMA.PROCESSLIST AS p ON p.id = b.trx_mysql_thread_id
GROUP BY who_blocks ORDER BY num_waiters DESC\G"
  local waiters_sql="SELECT
r.trx_id AS waiting_trx_id,
r.trx_mysql_thread_id AS waiting_thread,
TIMESTAMPDIFF(SECOND, r.trx_wait_started, CURRENT_TIMESTAMP) AS wait_time,
r.trx_query AS waiting_query,
l.lock_table AS waiting_table_lock,
b.trx_id AS blocking_trx_id, b.trx_mysql_thread_id AS blocking_thread,
SUBSTRING(p.host, 1, INSTR(p.host, ':') - 1) AS blocking_host,
SUBSTRING(p.host, INSTR(p.host, ':') +1) AS blocking_port,
IF(p.command = \"Sleep\", p.time, 0) AS idle_in_trx,
b.trx_query AS blocking_query
FROM INFORMATION_SCHEMA.INNODB_LOCK_WAITS AS w
INNER JOIN INFORMATION_SCHEMA.INNODB_TRX AS b ON b.trx_id = w.blocking_trx_id
INNER JOIN INFORMATION_SCHEMA.INNODB_TRX AS r ON r.trx_id = w.requesting_trx_id
INNER JOIN INFORMATION_SCHEMA.INNODB_LOCKS AS l ON w.requested_lock_id = l.lock_id
LEFT JOIN INFORMATION_SCHEMA.PROCESSLIST AS p ON p.id = b.trx_mysql_thread_id
ORDER BY wait_time DESC\G"

  local statement
  for statement in "$blockers_sql" "$waiters_sql"; do
    $CMD_MYSQL $EXT_ARGV -e "$statement"
  done
}
# ###########################################################################
# End collect package
# ###########################################################################
# ###########################################################################
# Global variables
# ###########################################################################
RAN_WITH=""     # Option summary built by main(); recorded with each collect
EXIT_REASON=""  # Why the stalk loop stopped; set by oktorun()
# Name this script was started as.  Parameter expansion instead of
# "basename $0" so a path containing whitespace, or a $0 that begins
# with "-", cannot break the lookup.
TOOL="${0##*/}"
OKTORUN=1       # Set to 0 by sigtrap() to request a clean shutdown
ITER=1          # Incremented by stalk() each time a collect is triggered
# ###########################################################################
# Subroutines
# ###########################################################################
# grep_processlist FILE COL [PAT] [GT] [QUIET]
# Search a "mysqladmin processlist" table saved in FILE.
#   COL    name of the column to test, as it appears in the header row
#   PAT    regular expression the column value must match   (default none)
#   GT     number the column value must exceed         (default 0 = off)
#   QUIET  non-zero to suppress printing matching rows      (default 0)
# A row matches when either the PAT or the GT test succeeds.  Matching
# rows are printed unchanged.  Returns 0 when at least one row matched,
# 1 otherwise.
#
# The first line starting with "|" is taken as the header and maps column
# names to field numbers; every later "|" line is a data row.
grep_processlist() {
  local file="$1"
  local col="$2"
  local pat="${3:-""}"
  local gt="${4:-0}"
  local quiet="${5:-0}"

  # Values are handed to awk with -v rather than pasted into the program
  # text, so quotes or other special characters in them cannot break or
  # alter the awk program.  "+ 0" forces numeric interpretation.
  awk -v col="$col" -v pat="$pat" -v gt="$gt" -v quiet="$quiet" '
    BEGIN {
      FS = "|"
      OFS = " | "
      n_cols = 0
      found = 0
    }
    /^\|/ {
      if ( n_cols ) {
        val = colno_for_name[col]
        if ( (pat != "" && match($val, pat)) \
             || ((gt + 0) != 0 && ($val + 0) > (gt + 0)) ) {
          found++
          if ( !(quiet + 0) ) print $0
        }
      }
      else {
        for ( i = 1; i <= NF; i++ ) {
          gsub(/^[ ]*/, "", $i)
          gsub(/[ ]*$/, "", $i)
          if ( $i != "" ) {
            colno_for_name[$i] = i
            n_cols++
          }
        }
      }
    }
    END {
      if ( found )
        exit 0
      exit 1
    }
  ' "$file"
}
# set_trg_func
# Choose the trigger function from --function (OPT_FUNCTION) and store
# its name in the global TRIGGER_FUNCTION.  When OPT_FUNCTION names an
# existing file, that file is sourced and is expected to define
# trg_plugin; otherwise the built-in trg_<OPT_FUNCTION> is used.
set_trg_func() {
  if [ -f "$OPT_FUNCTION" ]; then
    # Quoted so a plugin path containing whitespace is sourced as one file.
    source "$OPT_FUNCTION"
    TRIGGER_FUNCTION="trg_plugin"
  else
    TRIGGER_FUNCTION="trg_$OPT_FUNCTION"
  fi
}
# trg_status VARIABLE
# Print the current value of the status variable VARIABLE, taken from
# "mysqladmin extended-status".  Reads the global EXT_ARGV.
trg_status() {
  local var="$1"
  # Look up the variable that was asked for; the argument used to be
  # ignored in favour of the global OPT_VARIABLE.
  mysqladmin $EXT_ARGV extended-status \
    | grep -- "$var " \
    | awk '{print $4}'
}
# trg_processlist COLUMN
# Print how many rows of "mysqladmin processlist" have a COLUMN value
# matching the --match pattern (OPT_MATCH).  Uses and then removes scratch
# files under $TMPDIR.  Reads the globals EXT_ARGV, OPT_MATCH and TMPDIR.
trg_processlist() {
  local var="$1"
  local tmpfile="$TMPDIR/processlist"

  mysqladmin $EXT_ARGV processlist > "$tmpfile-1"
  # Every argument is quoted: an empty --match must still occupy the
  # pattern position, and a pattern containing spaces must stay one word.
  grep_processlist "$tmpfile-1" "$var" "$OPT_MATCH" 0 0 > "$tmpfile-2"
  wc -l < "$tmpfile-2" | awk '{print $1}'
  rm -rf "$tmpfile"*
  return
}
# trg_magic
# Placeholder trigger: it only prints the word TODO.
trg_magic() {
  printf '%s\n' "TODO"
  return
}
# oktorun
# Tell the caller whether the stalk loop may continue.
# Returns 1 (stop) and records the cause in EXIT_REASON when OKTORUN is 0
# or when --iterations is set and ITER has gone past it; returns 0
# (continue) otherwise.
oktorun() {
  local verdict=0

  if [ $OKTORUN -eq 0 ]; then
    EXIT_REASON="OKTORUN is false"
    verdict=1
  elif [ -n "$OPT_ITERATIONS" ] && [ $ITER -gt $OPT_ITERATIONS ]; then
    EXIT_REASON="no more iterations"
    verdict=1
  fi

  return $verdict
}
# sleep_ok SECONDS [MESSAGE]
# Sleep for SECONDS, but only while oktorun still allows the tool to run.
# A non-empty MESSAGE is logged just before sleeping.
sleep_ok() {
  local seconds="$1"
  local msg="${2:-""}"

  oktorun || return 0
  [ -z "$msg" ] || log "$msg"
  sleep $seconds
}
# purge_samples DIR RETENTION_DAYS [OPROFILE_DIR]
# Delete old collected data:
#   - regular files under DIR last modified more than RETENTION_DAYS ago
#   - pt_collect_* session directories under OPROFILE_DIR of the same age
# OPROFILE_DIR defaults to /var/lib/oprofile/samples and is skipped when
# it does not exist.
purge_samples() {
  local dir="$1"
  local retention_time="$2"
  local oprofile_dir="${3:-/var/lib/oprofile/samples}"

  # Delete collect files which are more than --retention-time days old.
  find "$dir" -type f -mtime +"$retention_time" -exec rm -f '{}' \;

  if [ -d "$oprofile_dir" ]; then
    # "pt_collect_" here needs to match $CMD_OPCONTROL --save=pt_collect_$p
    # in collect().  TODO: fix this
    #
    # The matches are directories, so they need rm -rf (plain rm -f
    # refuses to remove a directory).  -depth comes before the tests,
    # where find expects it.
    find "$oprofile_dir" -depth -type d -name 'pt_collect_*' \
      -mtime +"$retention_time" -exec rm -rf '{}' \;
  fi
}
# sigtrap
# Signal handler.  The first signal clears OKTORUN so the stalk loop can
# finish cleanly; a further signal exits at once with EXIT_STATUS.
sigtrap() {
  if ! [ $OKTORUN -eq 1 ]; then
    warn "Caught signal again, forcing exit"
    exit $EXIT_STATUS
  fi

  warn "Caught signal, exiting"
  OKTORUN=0
}
# stalk
# Main watch loop.  While oktorun allows it:
#   1. Run the trigger function and compare its value with --threshold.
#   2. Count consecutive checks above the threshold; once the count
#      reaches --cycles, check disk space and, if there is enough, start
#      collect() in the background.  Then count the iteration and sleep
#      for --sleep seconds.
#   3. Otherwise sleep for --interval seconds.
#   4. Purge collected files older than --retention-time.
# Reads TRIGGER_FUNCTION, RAN_WITH and the OPT_* globals; updates ITER.
stalk() {
  local cycles_true=0   # increment each time check is true, else set to 0
  local matched="no"    # set to "yes" when check is true
  local last_prefix=""  # prefix of last collection
  local value trg_exit_status msg prefix margin

  while oktorun; do
    # Run the trigger which returns the value of whatever is being
    # checked.  When the value is > --threshold for at least --cycles
    # consecutive times, start collecting.
    #
    # Declared above and assigned here: "local value=$(...)" would make
    # $? the status of "local" (always 0) instead of the trigger's.
    value=$($TRIGGER_FUNCTION $OPT_VARIABLE)
    trg_exit_status=$?

    if [ -z "$value" ]; then
      # No value.  Maybe we failed to connect to MySQL?
      warn "Detected value is empty; something failed? Trigger exit status: $trg_exit_status"
      matched="no"
      cycles_true=0
    elif [ "$value" -gt "$OPT_THRESHOLD" ]; then
      matched="yes"
      cycles_true=$(($cycles_true + 1))
    else
      matched="no"
      cycles_true=0
    fi

    msg="Check results: $OPT_VARIABLE=$value, matched=$matched, cycles_true=$cycles_true"
    log "$msg"

    if [ "$matched" = "yes" ] && [ "$cycles_true" -ge "$OPT_CYCLES" ]; then
      # ##################################################################
      # Start collecting, maybe.
      # ##################################################################
      prefix="${OPT_PREFIX:-$(date +%F-%T | tr :- _)}"
      log "Collect triggered"

      # Check if we'll have enough disk space to collect.  Disk space
      # is also checked every interval while collecting.
      margin="20" # default 20M margin, unless:
      if [ -n "$last_prefix" ]; then
        # ... a previous collection exists; then use its size in MB.
        margin=$(du -mc "$OPT_DEST"/"$last_prefix"-* | tail -n 1 | awk '{print $1}')
      fi
      disk_space "$OPT_DEST" > "$OPT_DEST/$prefix-disk-space"
      check_disk_space \
        "$OPT_DEST/$prefix-disk-space" \
        "$OPT_DISK_BYTE_LIMIT" \
        "$OPT_DISK_PCT_LIMIT" \
        "$margin" # real used MB + margin MB

      if [ $? -eq 0 ]; then
        # There should be enough disk space, so collect.
        log "$msg" >> "$OPT_DEST/$prefix-trigger"
        log "pt-stalk ran with $RAN_WITH" >> "$OPT_DEST/$prefix-trigger"
        last_prefix="$prefix"

        # Send email to whomever that collect has been triggered.
        if [ "$OPT_NOTIFY_BY_EMAIL" ]; then
          echo "$msg on $(hostname)" \
            | mail -s "Collect triggered on $(hostname)" \
              "$OPT_NOTIFY_BY_EMAIL"
        fi

        # Fork and background the collect subroutine which will
        # run for --run-time seconds.  We (the parent) sleep
        # while it is collecting (hopefully --sleep is longer than
        # --run-time).
        (
          collect "$OPT_DEST" "$prefix"
        ) >> "$OPT_DEST/$prefix-output" 2>&1 &
      else
        # There will not be enough disk space, so do not collect.
        warn "Collect canceled because there will not be enough disk space after collecting another $margin MB"
      fi

      # ##################################################################
      # Done collecting.
      # ##################################################################
      ITER=$((ITER + 1))
      sleep_ok "$OPT_SLEEP" "Sleeping $OPT_SLEEP seconds after collect"
    else
      # Trigger/check/value is ok, sleep until next check.
      sleep_ok "$OPT_INTERVAL"
    fi

    # Purge old collect files between checks.
    purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME"
  done
}
# ###########################################################################
# Main program loop, called below if the tool is run from the command line.
# ###########################################################################
# main
# Run the tool after option parsing: install the signal handler, record
# the effective options in RAN_WITH, make sure --dest exists and is
# writable, create the tmpdir, select the trigger function and stalk until
# oktorun says stop.  Afterwards remove the tmpdir and the PID file and
# exit with EXIT_STATUS.
main() {
  trap sigtrap SIGHUP SIGINT SIGTERM

  # Note: $$ is the parent's PID, but we're a child proc.
  # Bash 4 has $BASHPID but we can't rely on that.  Consequently,
  # we don't know our own PID.  See the usage of $! below.
  RAN_WITH="--function=$OPT_FUNCTION --variable=$OPT_VARIABLE"
  RAN_WITH="$RAN_WITH --threshold=$OPT_THRESHOLD --match=$OPT_MATCH"
  RAN_WITH="$RAN_WITH --cycles=$OPT_CYCLES --interval=$OPT_INTERVAL"
  RAN_WITH="$RAN_WITH --iterations=$OPT_ITERATIONS --run-time=$OPT_RUN_TIME"
  RAN_WITH="$RAN_WITH --sleep=$OPT_SLEEP --dest=$OPT_DEST"
  RAN_WITH="$RAN_WITH --prefix=$OPT_PREFIX --notify-by-email=$OPT_NOTIFY_BY_EMAIL"
  RAN_WITH="$RAN_WITH --log=$OPT_LOG --pid=$OPT_PID"
  log "Starting $0 $RAN_WITH"

  # Make sure the collection dir exists.
  [ -d "$OPT_DEST" ] || mkdir -p "$OPT_DEST" || die "Cannot make --dest $OPT_DEST"

  # Check access to the --dest dir.  Because of "set -e" the subshell
  # exits at the first command that fails, so $? is non-zero when either
  # the touch or the rm did not work.
  (
    set -e
    touch "$OPT_DEST/test"
    rm "$OPT_DEST/test"
  )
  [ $? -eq 0 ] || die "Cannot read and write files to --dest $OPT_DEST"

  # Test if we have root; warn if not, but it isn't critical.
  [ "$(id -u)" = "0" ] || log 'Not running with root privileges!'

  mk_tmpdir     # Make a secure tmpdir.
  set_trg_func  # Set TRIGGER_FUNCTION based on --function.
  stalk         # Stalk while oktorun.

  # Clean up.
  rm_tmpdir
  remove_pid_file "$OPT_PID"

  log "Exiting because $EXIT_REASON"
  log "$0 exit status $EXIT_STATUS"
  exit $EXIT_STATUS
}
# Execute the program if it was not included from another file.
# This makes it possible to include without executing, and thus test.
# Start-up sequence, run only when this file is executed as "pt-stalk"
# (or directly by bash) rather than sourced by a test:
#   1. verify that mysql and mysqladmin can be executed
#   2. parse the command line; stop for --help, --version or errors
#   3. verify that a connection to MySQL works
#   4. write the PID file and call main, in the background when
#      --daemonize is "yes", in the foreground otherwise
if [ "$(basename "$0")" = "pt-stalk" ] \
|| [ "$(basename "$0")" = "bash" -a "$_" = "$0" ]; then
# Check that mysql and mysqladmin are in PATH. If not, we're
# already dead in the water, so don't bother with cmd line opts,
# just error and exit.
[ -n "$(mysql --help)" ] \
|| die "Cannot execute mysql. Check that it is in PATH."
[ -n "$(mysqladmin --help)" ] \
|| die "Cannot execute mysqladmin. Check that it is in PATH."
# Parse command line options. We must do this first so we can
# see if --daemonize was specified.
# The tmpdir made here only holds the option spec files and is removed
# again as soon as parsing is done; main makes its own.
mk_tmpdir
parse_options "$0" "$@"
usage_or_errors "$0"
po_status=$?
rm_tmpdir
if [ $po_status -ne 0 ]; then
exit $po_status
fi
# Now that we have the cmd line opts, check that we can actually
# connect to MySQL.
[ -n "$(mysql $EXT_ARGV -e 'SELECT 1')" ] \
|| die "Cannot connect to MySQL. Check that MySQL is running and that the options after -- are correct."
if [ "$OPT_DAEMONIZE" = "yes" ]; then
# Check access to the --log file.
(
set -e
touch "$OPT_LOG"
)
if [ $? -ne 0 ]; then
die "Cannot write to --log $OPT_LOG"
fi
# The PID file will at first have our (parent) PID.
# This is fine for ensuring that only one of us is
# running, but it's not fine if the user wants to use
# the PID in the PID file to check or kill the child
# process. So we'll need to update the PID file with
# the child's PID.
make_pid_file "$OPT_PID" $$
# Detach: no stdin, and stdout/stderr appended to the --log file.
main "$@" </dev/null 1>>"$OPT_LOG" 2>&1 &
# Update PID file with the child's PID.
# The child PID is $BASHPID but that special var is only
# in Bash 4+, so we can't rely on it. Consequently, we
# use $! to get the PID of the child we just forked.
echo "$!" > "$OPT_PID"
else
# Foreground run: the PID file holds this shell's own PID.
make_pid_file "$OPT_PID" $$
main "$@"
fi
fi
# ############################################################################
# Documentation
# ############################################################################
:<<'DOCUMENTATION'
=pod
=head1 NAME
pt-stalk - Gather forensic data about MySQL when a problem occurs.
=head1 SYNOPSIS
Usage: pt-stalk [OPTIONS] [-- MYSQL OPTIONS]
pt-stalk watches for a trigger condition to become true, and then collects data
to help in diagnosing problems. It is designed to run as a daemon so that you
can diagnose intermittent problems that you cannot observe directly. You can
also use it to execute a custom command, or to gather the data on demand without
waiting for the trigger to happen.
=head1 RISKS
The following section is included to inform users about the potential risks,
whether known or unknown, of using this tool. The two main categories of risks
are those created by the nature of the tool (e.g. read-only tools vs. read-write
tools) and those created by bugs.
pt-stalk is a read-only tool. It should be very low-risk. Some of the options
can cause intrusive data collection to be performed, however, so if you enable
any non-default options, you should read their documentation carefully.
At the time of this release, we know of no bugs that could cause serious harm
to users.
The authoritative source for updated information is always the online issue
tracking system. Issues that affect this tool will be marked as such. You can
see a list of such issues at the following URL:
L<http://www.percona.com/bugs/pt-stalk>.
See also L<"BUGS"> for more information on filing bugs and getting help.
=head1 DESCRIPTION
Sometimes a problem happens infrequently and for a short time, giving you no
chance to see the system when it happens. How do you solve intermittent MySQL
problems when you can't observe them? That's why pt-stalk exists. In addition to
using it when there's a known problem on your servers, it is a good idea to run
pt-stalk all the time, even when you think nothing is wrong. You will
appreciate the data it gathers when a problem occurs, because problems such as
MySQL lockups or spikes of activity typically leave no evidence to use in root
cause analysis.
This tool does two things: it watches a server (typically MySQL) for a trigger
to occur, and it gathers diagnostic data. To use it effectively, you need to
define a good trigger condition. A good trigger is sensitive enough to fire
reliably when a problem occurs, so that you don't miss a chance to solve
problems. On the other hand, a good trigger isn't prone to false positives, so
you don't gather information when the server is functioning normally.
The most reliable triggers for MySQL tend to be the number of connections to the
server, and the number of queries running concurrently. These are available in
the SHOW GLOBAL STATUS command as Threads_connected and Threads_running.
Sometimes Threads_connected is not a reliable indicator of trouble, but
Threads_running usually is. Your job, as the tool's user, is to define an
appropriate trigger condition for the tool. Choose carefully, because the
quality of your results will depend on the trigger you choose.
You can define the trigger with the L<"--function">, L<"--variable">, and
L<"--threshold"> options, among others. Please read the documentation for
L<"--function"> to learn how to do this.
The pt-stalk tool, by default, simply watches MySQL repeatedly until the trigger
becomes true. It then gathers diagnostics for a while, and sleeps afterwards for
some time to prevent repeatedly gathering data if the condition remains true.
In crude pseudocode, omitting some subtleties,
while true; do
if --variable from --function is greater than --threshold; then
observations++
if observations is greater than --cycles; then
capture diagnostics for --run-time seconds
exit if --iterations is exceeded
sleep for --sleep seconds
fi
fi
clean up data that's older than --retention-time
sleep for --interval seconds
done
The diagnostic data is written to files whose names begin with a timestamp, so
you can distinguish samples from each other in case the tool collects data
multiple times. The pt-sift tool is designed to help you browse and analyze the
resulting samples of data.
Although this sounds simple enough, in practice there are a number of
subtleties, such as detecting when the disk is beginning to fill up so that the
tool doesn't cause the server to run out of disk space.
=head1 CONFIGURING
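Options can be given on the command line or read from configuration files; see
L<"--config"> for how to specify the configuration files to read.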
=head1 OPTIONS
=over
=item --collect
default: yes; negatable: yes
Collect system information. You can negate this option to make the tool watch
the system but not actually gather any diagnostic data.
=item --collect-gdb
Collect GDB stacktraces. This is achieved by attaching to MySQL and printing
stack traces from all threads. This will freeze the server for some period of
time, ranging from a second or so to much longer on very busy systems with a lot
of memory and many threads in the server. For this reason, it is disabled by
default. However, if you are trying to diagnose a server stall or lockup,
freezing the server causes no additional harm, and the stack traces can be vital
for diagnosis.
In addition to freezing the server, there is also some risk of the server
crashing or performing badly after GDB detaches from it.
=item --collect-oprofile
Collect oprofile data. This is achieved by starting an oprofile session,
letting it run for the collection time, and then stopping and saving the
resulting profile data in the system's default location. Please read your
system's oprofile documentation to learn more about this.
=item --collect-strace
Collect strace data. This is achieved by attaching strace to the server, which
will make it run very slowly until strace detaches. The same cautions apply as
those listed in --collect-gdb. You should not enable this option together with
--collect-gdb, because GDB and strace can't attach to the server process
simultaneously.
=item --collect-tcpdump
Collect tcpdump data. This option causes tcpdump to capture all traffic on all
interfaces for the port on which MySQL is listening. You can later use
pt-query-digest to decode the MySQL protocol and extract a log of query traffic
from it.
=item --config
type: string
Read this comma-separated list of config files. If specified, this must be the
first option on the command line.
=item --cycles
type: int; default: 5
The number of times the trigger condition must be true before collecting data.
This helps prevent false positives, and makes the trigger condition less
susceptible to firing when the condition recovers quickly.
=item --daemonize
Daemonize the tool. This causes the tool to fork into the background and log
its output as specified in --log.
=item --dest
type: string; default: ${HOME}/collected
Where to store the diagnostic data. Each time the tool collects data, it writes
to a new set of files, which are named with the current system timestamp.
=item --disk-byte-limit
type: int; default: 100
Don't collect data unless the destination disk has this much free space. This
prevents the tool from filling up the disk with diagnostic data.
If the destination directory contains a previously captured sample of data, the
tool will measure its size and use that as an estimate of how much data is
likely to be gathered this time, too. It will then be even more pessimistic,
and will refuse to collect data unless the disk has enough free space to hold
the sample and still have the desired amount of free space. For example, if
you'd like 100MB of free space and the previous diagnostic sample consumed
100MB, the tool won't collect any data unless the disk has 200MB free.
=item --disk-pct-limit
type: int; default: 5
Don't collect data unless the disk has at least this percent free space. This
option works similarly to --disk-byte-limit, but specifies a percentage margin
of safety instead of a byte margin of safety. The tool honors both options, and
will not collect any data unless both margins are satisfied.
=item --function
type: string; default: status
Specifies what to watch for a diagnostic trigger. The default value watches
SHOW GLOBAL STATUS, but you can also watch SHOW PROCESSLIST or supply a plugin
file with your own custom code. This function supplies the value of
L<"--variable">, which is then compared against L<"--threshold"> to see if the
trigger condition is met. Additional options may be required as well; see
below. Possible values:
=over
=item * status
This value specifies that the source of data for the diagnostic trigger is SHOW
GLOBAL STATUS. The value of L<"--variable"> then defines which status counter
is the trigger.
=item * processlist
This value specifies that the data for the diagnostic trigger comes from SHOW
FULL PROCESSLIST. The trigger value is the count of processes whose
L<"--variable"> column matches the L<"--match"> option. For example, to trigger
when more than 10 processes are in the "statistics" state, use the following
options:
--function processlist --variable State --match statistics --threshold 10
In addition, you can specify a file that contains your custom trigger function,
written in Unix shell script. This can be a wrapper that executes anything you
wish. If the argument to --function is a file, then it takes precedence over
builtin functions, so if there is a file in the working directory named "status"
or "processlist" then the tool will use that file as a plugin, even though those
are otherwise recognized as reserved words for this option.
The plugin file works by providing a function called C<trg_plugin>, and the tool
simply sources the file and executes the function. For example, the function
might look like the following:
trg_plugin() {
mysql $EXT_ARGV -e "SHOW ENGINE INNODB STATUS" | grep -c "has waited at"
}
This snippet will count the number of mutex waits inside of InnoDB. It
illustrates the general principle: the function must output a number, which is
then compared to the threshold as usual. The $EXT_ARGV variable contains the
MySQL options mentioned in the L<"SYNOPSIS"> above.
The plugin should not alter the tool's existing global variables. Prefix any
plugin-specific global variables with "PLUGIN_" or make them local.
=back
=item --help
Print help and exit.
=item --interval
type: int; default: 1
Interval between checks for the diagnostic trigger.
=item --iterations
type: int
Exit after collecting diagnostics this many times. By default, the tool
will continue to watch the server forever, but this is useful for scenarios
where you want to capture once and then exit, for example.
=item --log
type: string; default: /var/log/pt-stalk.log
Print all output to this file when daemonized.
=item --match
type: string
The pattern to use when watching SHOW PROCESSLIST. See the documentation for
L<"--function"> for details.
=item --notify-by-email
type: string
Send mail to this list of addresses when data is collected.
=item --pid
type: string; default: /var/run/pt-stalk.pid
Create a PID file when daemonized.
=item --prefix
type: string
The filename prefix for diagnostic samples. By default, samples have a timestamp
prefix based on the current local time, such as 2011_12_06_14_02_02, which is
December 6, 2011 at 14:02:02.
=item --retention-time
type: int; default: 30
Number of days to retain collected samples. Any samples that are older will be
purged.
=item --run-time
type: int; default: 30
How long the tool will collect data when it triggers. This should not be longer
than L<"--sleep">. It is usually not necessary to change this; if the default 30
seconds hasn't gathered enough diagnostic data, running longer is not likely to
do so. In fact, in many cases a shorter collection period is appropriate.
=item --sleep
type: int; default: 300
How long to sleep after collecting data. This prevents the tool from triggering
continuously, which might be a problem if the collection process is intrusive.
It also prevents filling up the disk or gathering too much data to analyze
reasonably.
=item --threshold
type: int; default: 25
The threshold at which the diagnostic trigger should fire. See L<"--function">
for details.
=item --variable
type: string; default: Threads_running
The variable to compare against the threshold. See L<"--function"> for details.
=item --version
Print tool's version and exit.
=back
=head1 ENVIRONMENT
No env vars used.
=head1 SYSTEM REQUIREMENTS
This tool requires Bash v3 or newer.
=head1 BUGS
For a list of known bugs, see L<http://www.percona.com/bugs/pt-stalk>.
Please report bugs at L<https://bugs.launchpad.net/percona-toolkit>.
Include the following information in your bug report:
=over
=item * Complete command-line used to run the tool
=item * Tool L<"--version">
=item * MySQL version of all servers involved
=item * Output from the tool including STDERR
=item * Input files (log/dump/config files, etc.)
=back
If possible, include debugging output by running the tool with C<PTDEBUG>;
see L<"ENVIRONMENT">.
=head1 DOWNLOADING
Visit L<http://www.percona.com/software/percona-toolkit/> to download the
latest release of Percona Toolkit. Or, get the latest release from the
command line:
wget percona.com/get/percona-toolkit.tar.gz
wget percona.com/get/percona-toolkit.rpm
wget percona.com/get/percona-toolkit.deb
You can also get individual tools from the latest release:
wget percona.com/get/TOOL
Replace C<TOOL> with the name of any tool.
=head1 AUTHORS
Baron Schwartz, Justin Swanhart, Fernando Ipar, and Daniel Nichter
=head1 ABOUT PERCONA TOOLKIT
This tool is part of Percona Toolkit, a collection of advanced command-line
tools developed by Percona for MySQL support and consulting. Percona Toolkit
was forked from two projects in June, 2011: Maatkit and Aspersa. Those
projects were created by Baron Schwartz and developed primarily by him and
Daniel Nichter, both of whom are employed by Percona. Visit
L<http://www.percona.com/software/> for more software developed by Percona.
=head1 COPYRIGHT, LICENSE, AND WARRANTY
This program is copyright 2010-2011 Baron Schwartz, 2011 Percona Inc.
Feedback and improvements are welcome.
THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
systems, you can issue `man perlgpl' or `man perlartistic' to read these
licenses.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA.
=head1 VERSION
pt-stalk 2.0.0
=cut
DOCUMENTATION