# Patch metadata carried over from the original diff; the hunk line counts
# describe the unmodified text.
# diff --git a/lib/bash/alt_cmds.sh b/lib/bash/alt_cmds.sh
# new file mode 100644
# index 00000000..55a41e3a
# --- /dev/null
# +++ b/lib/bash/alt_cmds.sh
# @@ -0,0 +1,34 @@
# This program is copyright 2011 Percona Inc.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# alt_cmds package
# ###########################################################################

# Package: alt_cmds
# alt_cmds provides alternatives to commands that aren't on all systems.

set -u

# _seq N
# Print the integers 1, 2, ..., N to stdout, one per line, as a stand-in for
# seq(1).  Nothing is printed when N is missing, empty, zero, negative or not
# a number.
_seq() {
  # Default to 0 so that a missing argument does not trip "set -u".
  local n=${1:-0}

  # N is handed to awk as data (-v) instead of being pasted into the program
  # text, so an odd argument cannot change the awk program.  "n + 0" forces a
  # numeric comparison even when the argument is not a number.
  awk -v n="$n" 'BEGIN { for (i = 1; i <= n + 0; i++) print i }'
}

# ###########################################################################
# End alt_cmds package
# ###########################################################################
# Patch metadata carried over from the original diff; the hunk line counts
# describe the unmodified text.
# diff --git a/lib/bash/collect.sh b/lib/bash/collect.sh
# new file mode 100644
# index 00000000..7c8b68f9
# --- /dev/null
# +++ b/lib/bash/collect.sh
# @@ -0,0 +1,238 @@
# This program is copyright 2011 Percona Inc.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# collect package
# ###########################################################################

# Package: collect
# collect collects system information.

set -u

# Global variables.
# Every external tool is run through a CMD_* variable so the caller can
# substitute another name or path through the environment.
CMD_GDB=${CMD_GDB:-"gdb"}
CMD_IOSTAT=${CMD_IOSTAT:-"iostat"}
CMD_MPSTAT=${CMD_MPSTAT:-"mpstat"}
# CMD_MSSQL and CMD_MYSQL_ADMIN are the (misspelled / inconsistent) names that
# used to be consulted here; they are still honored as fallbacks.
CMD_MYSQL=${CMD_MYSQL:-${CMD_MSSQL:-"mysql"}}
CMD_MYSQLADMIN=${CMD_MYSQLADMIN:-${CMD_MYSQL_ADMIN:-"mysqladmin"}}
CMD_OPCONTROL=${CMD_OPCONTROL:-"opcontrol"}
CMD_OPREPORT=${CMD_OPREPORT:-"opreport"}
CMD_PMAP=${CMD_PMAP:-"pmap"}
CMD_STRACE=${CMD_STRACE:-"strace"}
CMD_TCPDUMP=${CMD_TCPDUMP:-"tcpdump"}
CMD_VMSTAT=${CMD_VMSTAT:-"vmstat"}

# collect DIR PREFIX
# Gather MySQL and operating-system diagnostics into files named
# DIR/PREFIX-<what>.
#
# Arguments:
#   $1 - directory to save results in
#   $2 - prefix for each result file
# Globals read:
#   CMD_* (above), EXT_ARGV, OPT_INTERVAL, OPT_COLLECT_GDB,
#   OPT_COLLECT_TCPDUMP, OPT_COLLECT_OPROFILE, OPT_COLLECT_STRACE,
#   OPT_DISK_BYTE_LIMIT, OPT_DISK_PCT_LIMIT.  All must be set because the
#   package runs under "set -u".
# Calls:
#   _seq, disk_space and check_disk_space, which this file does not define.
# Outputs:
#   progress messages on stdout; everything else goes to the result files.
# Returns:
#   the exit status of the final "hostname" command.
collect() {
  local d=$1  # directory to save results in
  local p=$2  # prefix for each result file

  # Get pidof mysqld; pidof doesn't exist on some systems.  We try our best...
  local mysqld_pid
  mysqld_pid=$(pidof -s mysqld 2>/dev/null)
  if [ -z "$mysqld_pid" ]; then
    mysqld_pid=$(pgrep -o -x mysqld 2>/dev/null)
  fi
  if [ -z "$mysqld_pid" ]; then
    mysqld_pid=$(ps -eaf | grep 'mysql[d]' | grep -v mysqld_safe \
      | awk '{print $2}' | head -n1)
  fi

  # Get memory allocation info before anything else.  CMD_PMAP is normally a
  # bare command name, so it has to be looked up in PATH; a "-x" file test
  # would only look in the current directory.
  if [ "$mysqld_pid" ] && command -v "$CMD_PMAP" >/dev/null 2>&1; then
    if $CMD_PMAP --help 2>&1 | grep -- -x >/dev/null 2>&1; then
      $CMD_PMAP -x "$mysqld_pid" > "$d/$p-pmap"
    else
      # Some pmap's apparently don't support -x (issue 116).
      $CMD_PMAP "$mysqld_pid" > "$d/$p-pmap"
    fi
  fi

  # Getting a GDB stacktrace can be an intensive operation,
  # so do this only if necessary.
  if [ "$OPT_COLLECT_GDB" = "yes" ] && [ "$mysqld_pid" ]; then
    $CMD_GDB \
      -ex "set pagination 0" \
      -ex "thread apply all bt" \
      --batch -p "$mysqld_pid" \
      >> "$d/$p-stacktrace"
  else
    echo "GDB (--collect-gdb) was not enabled" >> "$d/$p-stacktrace"
  fi

  # Get MySQL's variables if possible.  Then sleep long enough that we probably
  # complete SHOW VARIABLES if all's well.  (We don't want to run mysql in the
  # foreground, because it could hang.)
  # NOTE(review): "$EXT_ARGV" is passed as ONE argument everywhere in this
  # function; that looks wrong if it can hold several options or be empty --
  # confirm against the code that sets it.
  $CMD_MYSQL "$EXT_ARGV" -e 'SHOW GLOBAL VARIABLES' >> "$d/$p-variables" 2>&1 &
  sleep .2

  # Get the major.minor version number.  Version 3.23 doesn't matter for our
  # purposes, and other releases have x.x.x* version conventions so far.
  local mysql_version
  mysql_version="$(awk '/^version[^_]/{print substr($2,1,3)}' "$d/$p-variables")"

  # Is MySQL logging its errors to a file?  If so, tail that file.  Match the
  # variable name exactly so that other names containing "log_error" are not
  # picked up as well.
  local mysql_error_log
  mysql_error_log="$(awk '$1 == "log_error" {print $2}' "$d/$p-variables")"
  if [ -z "$mysql_error_log" ] && [ "$mysqld_pid" ]; then
    # Try getting it from the open filehandle...
    mysql_error_log="$(ls -l "/proc/$mysqld_pid/fd" | awk '/ 2 ->/{print $NF}')"
  fi

  local tail_error_log_pid=""
  if [ "$mysql_error_log" ]; then
    echo "The MySQL error log seems to be ${mysql_error_log}"
    tail -f "$mysql_error_log" >"$d/$p-log_error" 2>&1 &
    tail_error_log_pid=$!
    # Send a mysqladmin debug to the server so we can potentially learn about
    # locking etc.
    $CMD_MYSQLADMIN "$EXT_ARGV" debug
  else
    echo "Could not find the MySQL error log"
  fi

  # Get a sample of these right away, so we can get these without interaction
  # with the other commands we're about to run.
  local innostat="SHOW /*!40100 ENGINE*/ INNODB STATUS\G"
  local proclist="SHOW FULL PROCESSLIST\G"
  local mutex="SHOW MUTEX STATUS"
  # NOTE(review): this is a string comparison of the first three characters
  # of the version, so "5.1" itself and two-digit major versions take the
  # SHOW MUTEX STATUS branch -- confirm that is intended.
  if [ "${mysql_version}" '>' "5.1" ]; then
    mutex="SHOW ENGINE INNODB MUTEX"
  fi
  $CMD_MYSQL "$EXT_ARGV" -e "$innostat" >> "$d/$p-innodbstatus1" 2>&1 &
  $CMD_MYSQL "$EXT_ARGV" -e "$proclist" >> "$d/$p-processlist1" 2>&1 &
  $CMD_MYSQL "$EXT_ARGV" -e 'SHOW OPEN TABLES' >> "$d/$p-opentables1" 2>&1 &
  $CMD_MYSQL "$EXT_ARGV" -e "$mutex" >> "$d/$p-mutex-status1" 2>&1 &

  # If TCP dumping is specified, start that on the server's port.
  local tcpdump_pid=""
  if [ "$OPT_COLLECT_TCPDUMP" = "yes" ]; then
    local port
    # Exact match on the variable name; other names also begin with "port".
    port=$(awk '$1 == "port" {print $2}' "$d/$p-variables")
    if [ "$port" ]; then
      $CMD_TCPDUMP -i any -s 4096 -w "$d/$p-tcpdump" port "$port" &
      tcpdump_pid=$!
    fi
  fi

  # Next, start oprofile gathering data during the whole rest of this process.
  # The --init should be a no-op if it has already been init-ed.
  local have_oprofile="no"
  local strace_pid=""  # stays empty unless strace is really started
  if [ "$OPT_COLLECT_OPROFILE" = "yes" ]; then
    if $CMD_OPCONTROL --init; then
      $CMD_OPCONTROL --start --no-vmlinux
      have_oprofile="yes"
    fi
  elif [ "$OPT_COLLECT_STRACE" = "yes" ] && [ "$mysqld_pid" ]; then
    # Don't run oprofile and strace at the same time.
    $CMD_STRACE -T -s 0 -f -p "$mysqld_pid" > "$d/$p-strace" 2>&1 &
    strace_pid=$!
  fi

  # Grab a few general things first.  Background all of these so we can start
  # them all up as quickly as possible.  We use mysqladmin -c even though it is
  # buggy and won't stop on its own in 5.1 and newer, because there is a chance
  # that we will get and keep a connection to the database; in troubled times
  # the database tends to exceed max_connections, so reconnecting in the loop
  # tends not to work very well.
  ps -eaf >> "$d/$p-ps" 2>&1 &
  sysctl -a >> "$d/$p-sysctl" 2>&1 &
  top -bn1 >> "$d/$p-top" 2>&1 &
  $CMD_VMSTAT 1 "$OPT_INTERVAL" >> "$d/$p-vmstat" 2>&1 &
  $CMD_VMSTAT "$OPT_INTERVAL" 2 >> "$d/$p-vmstat-overall" 2>&1 &
  $CMD_IOSTAT -dx 1 "$OPT_INTERVAL" >> "$d/$p-iostat" 2>&1 &
  $CMD_IOSTAT -dx "$OPT_INTERVAL" 2 >> "$d/$p-iostat-overall" 2>&1 &
  $CMD_MPSTAT -P ALL 1 "$OPT_INTERVAL" >> "$d/$p-mpstat" 2>&1 &
  $CMD_MPSTAT -P ALL "$OPT_INTERVAL" 1 >> "$d/$p-mpstat-overall" 2>&1 &
  if [ "$mysqld_pid" ]; then
    lsof -nP -p "$mysqld_pid" -bw >> "$d/$p-lsof" 2>&1 &
  fi
  $CMD_MYSQLADMIN "$EXT_ARGV" ext -i1 -c"$OPT_INTERVAL" >> "$d/$p-mysqladmin" 2>&1 &
  local mysqladmin_pid=$!

  # This loop gathers data for the rest of the duration, and defines the time
  # of the whole job.
  echo "Loop start: $(date +'TS %s.%N %F %T')"
  local a ts
  for a in $(_seq "$OPT_INTERVAL"); do
    # We check the disk, but don't exit, because we need to stop jobs if we
    # need to exit.
    disk_space "$d" > "$d/$p-disk-space"
    check_disk_space \
      "$d/$p-disk-space" \
      "$OPT_DISK_BYTE_LIMIT" \
      "$OPT_DISK_PCT_LIMIT" \
      || break

    # Synchronize ourselves onto the clock tick, so the sleeps are 1-second
    sleep "$(date +%s.%N | awk '{print 1 - ($1 % 1)}')"
    ts="$(date +"TS %s.%N %F %T")"

    # Collect the stuff for this cycle
    (cat /proc/diskstats 2>&1; echo "$ts") >> "$d/$p-diskstats" &
    (cat /proc/stat 2>&1; echo "$ts") >> "$d/$p-procstat" &
    (cat /proc/vmstat 2>&1; echo "$ts") >> "$d/$p-procvmstat" &
    (cat /proc/meminfo 2>&1; echo "$ts") >> "$d/$p-meminfo" &
    (cat /proc/slabinfo 2>&1; echo "$ts") >> "$d/$p-slabinfo" &
    (cat /proc/interrupts 2>&1; echo "$ts") >> "$d/$p-interrupts" &
    (df -h 2>&1; echo "$ts") >> "$d/$p-df" &
    (netstat -antp 2>&1; echo "$ts") >> "$d/$p-netstat" &
    (netstat -s 2>&1; echo "$ts") >> "$d/$p-netstat_s" &
  done
  echo "Loop end: $(date +'TS %s.%N %F %T')"

  if [ "$have_oprofile" = "yes" ]; then
    $CMD_OPCONTROL --stop
    $CMD_OPCONTROL --dump

    # Find oprofiled the same way mysqld is found above, because pidof is not
    # available everywhere, and only call kill when there is something to kill.
    local oprofiled_pid
    oprofiled_pid=$(pidof oprofiled 2>/dev/null)
    if [ -z "$oprofiled_pid" ]; then
      oprofiled_pid=$(pgrep -x oprofiled 2>/dev/null)
    fi
    if [ "$oprofiled_pid" ]; then
      # Unquoted on purpose: there may be more than one PID.
      kill $oprofiled_pid
    fi
    $CMD_OPCONTROL --save="pt_collect_$p"

    # Attempt to generate a report; if this fails, then just tell the user
    # how to generate the report.
    local mysqld_path
    mysqld_path=$(command -v mysqld)
    if [ "$mysqld_path" ] && [ -f "$mysqld_path" ]; then
      $CMD_OPREPORT \
        --demangle=smart \
        --symbols \
        --merge tgid \
        "session:pt_collect_$p" \
        "$mysqld_path" \
        > "$d/$p-opreport"
    else
      echo "oprofile data saved to pt_collect_$p; you should be able" \
        "to get a report by running something like 'opreport" \
        "--demangle=smart --symbols --merge tgid session:pt_collect_$p" \
        "/path/to/mysqld'" \
        > "$d/$p-opreport"
    fi
  elif [ "$strace_pid" ]; then
    # Signal names instead of numbers: the numbers differ between platforms.
    kill -s INT "$strace_pid"
    sleep 1
    kill -s TERM "$strace_pid"
    # Sometimes strace leaves threads/processes in T status.
    kill -s CONT "$mysqld_pid"
  fi

  $CMD_MYSQL "$EXT_ARGV" -e "$innostat" >> "$d/$p-innodbstatus2" 2>&1 &
  $CMD_MYSQL "$EXT_ARGV" -e "$proclist" >> "$d/$p-processlist2" 2>&1 &
  $CMD_MYSQL "$EXT_ARGV" -e 'SHOW OPEN TABLES' >> "$d/$p-opentables2" 2>&1 &
  $CMD_MYSQL "$EXT_ARGV" -e "$mutex" >> "$d/$p-mutex-status2" 2>&1 &

  # Kill backgrounded tasks.
  kill "$mysqladmin_pid"
  if [ "$tail_error_log_pid" ]; then
    kill "$tail_error_log_pid"
  fi
  if [ "$tcpdump_pid" ]; then
    kill "$tcpdump_pid"
  fi

  # Finally, record what system we collected this data from.
  hostname > "$d/$p-hostname"
}

# ###########################################################################
# End collect package
# ###########################################################################
# Patch metadata carried over from the original diff; the hunk line counts
# describe the unmodified text.
# diff --git a/lib/bash/safeguards.sh b/lib/bash/safeguards.sh
# new file mode 100644
# index 00000000..216cbeac
# --- /dev/null
# +++ b/lib/bash/safeguards.sh
# @@ -0,0 +1,51 @@
# This program is copyright 2011 Percona Inc.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA 02111-1307 USA.
# ###########################################################################
# safeguards package
# ###########################################################################

# Package: safeguards
# safeguards is a collection of function to help avoid blowing things up.

set -u

# disk_space [PATH]
# Print the "df -m -P" report for the file system that holds PATH (default:
# the current directory).  The report is a header line followed by a data
# line whose 4th column is the available space and whose 5th column is the
# percentage used, e.g.
#   Filesystem    1048576-blocks   Used Available Capacity Mounted on
#   /dev/disk0s2          115384  92882     22251      81% /
disk_space() {
  local filesystem=${1:-"$PWD"}
  df -m -P "$filesystem"
}

# check_disk_space FILE [MIN_FREE_MB] [MIN_FREE_PCT]
# Decide whether the file system described by FILE (output saved from
# disk_space) still has enough room.
#
# Arguments:
#   $1 - file holding the df report
#   $2 - megabytes; the check fails when this much or less is available
#        (default 0)
#   $3 - percent; the check fails when less than this share of the file
#        system is free (default 0, i.e. no percentage limit)
# Outputs:
#   a two-line explanation on stdout when the check fails; a warning on
#   stderr when the report cannot be parsed.
# Returns:
#   1 when there is not enough free space, 0 otherwise (including when the
#   report cannot be parsed, so that an odd df output does not stop a run).
check_disk_space() {
  local file=$1
  local mb=${2:-"0"}
  local pct=${3:-"0"}

  # Use the last data row rather than "rows starting with /", so that file
  # systems whose device name is not a path (tmpfs, overlay, ...) work too.
  local avail full
  avail=$(awk 'NR > 1 { v = $4 } END { print v }' "$file")
  full=$(awk 'NR > 1 { v = $5 } END { sub(/%/, "", v); print v }' "$file")

  # Everything compared below must be a plain non-negative integer.
  local value
  for value in "$avail" "$full" "$mb" "$pct"; do
    case "$value" in
      '' | *[!0-9]*)
        echo "Cannot determine free disk space from $file" >&2
        return 0
        ;;
    esac
  done

  # df reports the percentage USED; the limit is expressed as percentage FREE.
  local free_pct=$((100 - full))
  if [ "$avail" -le "$mb" ] || [ "$free_pct" -lt "$pct" ]; then
    echo "Not enough free space (${full}% full, ${avail}MB free)"
    echo "Wanted at least ${pct}% free and more than ${mb}MB free"
    return 1
  fi
  return 0
}


# ###########################################################################
# End safeguards package
# ###########################################################################
# Patch metadata carried over from the original diff; the hunk line counts
# describe the unmodified text.
# diff --git a/t/lib/bash/collect.sh b/t/lib/bash/collect.sh
# new file mode 100644
# index 00000000..d906011d
# --- /dev/null
# +++ b/t/lib/bash/collect.sh
# @@ -0,0 +1,24 @@
#!/usr/bin/env bash

TESTS=24

TMPFILE="$TEST_TMPDIR/parse-opts-output"
TMPDIR="$TEST_TMPDIR"
PATH="$PATH:$PERCONA_TOOLKIT_SANDBOX/bin"

mkdir "$TMPDIR/collect" 2>/dev/null

source "$LIB_DIR/log_warn_die.sh"
source "$LIB_DIR/parse_options.sh"
source "$LIB_DIR/safeguards.sh"
source "$LIB_DIR/alt_cmds.sh"
source "$LIB_DIR/collect.sh"

parse_options "$T_LIB_DIR/samples/bash/po002.sh" -- --defaults-file=/tmp/12345/my.sandbox.cnf

collect "$TMPDIR/collect" "2011_12_05"

# ############################################################################
# Done
# ############################################################################
exit
# Patch metadata carried over from the original diff; the hunk line counts
# describe the unmodified text.
# diff --git a/t/lib/samples/bash/po002.sh b/t/lib/samples/bash/po002.sh
# new file mode 100644
# index 00000000..63a8672b
# --- /dev/null
# +++ b/t/lib/samples/bash/po002.sh
# @@ -0,0 +1,212 @@
#!/usr/bin/env bash

:

# ############################################################################
# Documentation
# ############################################################################
:<<'DOCUMENTATION'
=pod

=head1 NAME

pt-stalk - Wait for a condition to occur then begin collecting data.

=head1 OPTIONS

=over

=item --collect

default: yes; negatable: yes

Collect system information.

=item --collect-gdb

Collect GDB stacktraces.

=item --collect-oprofile

Collect oprofile data.

=item --collect-strace

Collect strace data.

=item --collect-tcpdump

Collect tcpdump data.

=item --cycles

type: int; default: 5

Number of times condition must be met before triggering collection.

=item --daemonize

default: yes; negatable: yes

Daemonize the tool.

=item --dest

type: string

Where to store collected data.

=item --disk-byte-limit

type: int; default: 100

Exit if the disk has less than this many MB free.

=item --disk-pct-limit

type: int; default: 5

Exit if the disk has less than this percent free.

=item --execute-command

type: string; default: pt-collect

Location of the C<pt-collect> tool.

=item --function

type: string; default: status

Built-in function name or plugin file name which returns the value of C<VARIABLE>.

Possible values are:

=over

=item * status

Grep the value of C<VARIABLE> from C<mysqladmin extended-status>.

=item * processlist

Count the number of processes in C<mysqladmin processlist> whose
C<VARIABLE> column matches C<MATCH>. For example:

  TRIGGER_FUNCTION="processlist" \
  VARIABLE="State" \
  MATCH="statistics" \
  THRESHOLD="10"

The above triggers when more than 10 processes are in the "statistics" state.
C<MATCH> must be specified for this trigger function.

=item * magic

TODO

=item * plugin file name

A plugin file allows you to specify a custom trigger function. The plugin
file must contain a function called C<trg_plugin>. For example:

  trg_plugin() {
    # Do some stuff.
    echo "$value"
  }

The last output of the function (its "return value") must be a number.
This number is compared to C<THRESHOLD>. All L<"ENVIRONMENT"> variables
are available to the function.

Do not alter the tool's existing global variables. Prefix any plugin-specific
global variables with "PLUGIN_".

=back

=item --help

Print help and exit.

=item --interval

type: int; default: 1

Interval between checks.

=item --iterations

type: int

Exit after triggering C<pt-collect> this many times. By default, the tool
will collect as many times as it's triggered.

=item --log

type: string; default: /var/log/pt-stalk.log

Print all output to this file when daemonized.

=item --match

type: string

Match pattern for C<processlist> L<"--function">.

=item --notify-by-email

type: string

Send mail to this list of addresses when C<pt-collect> triggers.

=item --pid FILE

type: string; default: /var/run/pt-stalk.pid

Create a PID file when daemonized.

=item --retention-time

type: int; default: 30

Remove samples after this many days.

=item --run-time

type: int; default: 30

How long to collect statistics data for?

Make sure that this isn't longer than SLEEP.

=item --sleep

type: int; default: 300

How long to sleep after collecting?

=item --threshold N

type: int; default: 25

Max number of C<N> to tolerate.

=item --variable NAME

type: string; default: Threads_running

This is the thing to check for.

=item --version

Print tool's version and exit.

=back

=head1 ENVIRONMENT

No env vars used.

=cut

DOCUMENTATION