Make ptc exit status 0, 255, or a bitmask of flags for various problems.

This commit is contained in:
Daniel Nichter
2013-10-03 11:32:11 -07:00
parent f6661ca658
commit cd29c0d7dc
5 changed files with 278 additions and 164 deletions

View File

@@ -6930,59 +6930,79 @@ package Daemon;
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;
use POSIX qw(setsid);
use Fcntl qw(:DEFAULT);
sub new {
my ( $class, %args ) = @_;
foreach my $arg ( qw(o) ) {
die "I need a $arg argument" unless $args{$arg};
}
my $o = $args{o};
my ($class, %args) = @_;
my $self = {
o => $o,
log_file => $o->has('log') ? $o->get('log') : undef,
PID_file => $o->has('pid') ? $o->get('pid') : undef,
log_file => $args{log_file},
pid_file => $args{pid_file},
daemonize => $args{daemonize},
force_log_file => $args{force_log_file},
parent_exit => $args{parent_exit},
pid_file_owner => 0,
};
check_PID_file(undef, $self->{PID_file});
PTDEBUG && _d('Daemonized child will log to', $self->{log_file});
return bless $self, $class;
}
sub daemonize {
my ( $self ) = @_;
sub run {
my ($self) = @_;
PTDEBUG && _d('About to fork and daemonize');
defined (my $pid = fork()) or die "Cannot fork: $OS_ERROR";
if ( $pid ) {
PTDEBUG && _d('Parent PID', $PID, 'exiting after forking child PID',$pid);
exit;
my $daemonize = $self->{daemonize};
my $pid_file = $self->{pid_file};
my $log_file = $self->{log_file};
my $force_log_file = $self->{force_log_file};
my $parent_exit = $self->{parent_exit};
PTDEBUG && _d('Starting daemon');
if ( $pid_file ) {
eval {
$self->_make_pid_file(
pid => $PID, # parent's pid
pid_file => $pid_file,
);
};
die "$EVAL_ERROR\n" if $EVAL_ERROR;
if ( !$daemonize ) {
$self->{pid_file_owner} = $PID; # parent's pid
}
}
PTDEBUG && _d('Daemonizing child PID', $PID);
$self->{PID_owner} = $PID;
$self->{child} = 1;
if ( $daemonize ) {
defined (my $child_pid = fork()) or die "Cannot fork: $OS_ERROR";
if ( $child_pid ) {
PTDEBUG && _d('Forked child', $child_pid);
$parent_exit->($child_pid) if $parent_exit;
exit 0;
}
POSIX::setsid() or die "Cannot start a new session: $OS_ERROR";
chdir '/' or die "Cannot chdir to /: $OS_ERROR";
$self->_make_PID_file();
$OUTPUT_AUTOFLUSH = 1;
if ( $pid_file ) {
$self->_update_pid_file(
pid => $PID, # child's pid
pid_file => $pid_file,
);
$self->{pid_file_owner} = $PID;
}
}
if ( $daemonize || $force_log_file ) {
PTDEBUG && _d('Redirecting STDIN to /dev/null');
close STDIN;
open STDIN, '/dev/null'
or die "Cannot reopen STDIN to /dev/null: $OS_ERROR";
if ( $self->{log_file} ) {
PTDEBUG && _d('Redirecting STDOUT and STDERR to', $self->{log_file});
if ( $log_file ) {
PTDEBUG && _d('Redirecting STDOUT and STDERR to', $log_file);
close STDOUT;
open STDOUT, '>>', $self->{log_file}
or die "Cannot open log file $self->{log_file}: $OS_ERROR";
open STDOUT, '>>', $log_file
or die "Cannot open log file $log_file: $OS_ERROR";
close STDERR;
open STDERR, ">&STDOUT"
@@ -7005,82 +7025,119 @@ sub daemonize {
}
}
$OUTPUT_AUTOFLUSH = 1;
}
PTDEBUG && _d('Daemon running');
return;
}
sub check_PID_file {
my ( $self, $file ) = @_;
my $PID_file = $self ? $self->{PID_file} : $file;
PTDEBUG && _d('Checking PID file', $PID_file);
if ( $PID_file && -f $PID_file ) {
my $pid;
eval {
chomp($pid = (slurp_file($PID_file) || ''));
sub _make_pid_file {
my ($self, %args) = @_;
my @required_args = qw(pid pid_file);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
};
if ( $EVAL_ERROR ) {
die "The PID file $PID_file already exists but it cannot be read: "
. $EVAL_ERROR;
my $pid = $args{pid};
my $pid_file = $args{pid_file};
eval {
sysopen(PID_FH, $pid_file, O_RDWR|O_CREAT|O_EXCL) or die $OS_ERROR;
print PID_FH $PID, "\n";
close PID_FH;
};
if ( my $e = $EVAL_ERROR ) {
if ( $e =~ m/file exists/i ) {
my $old_pid = $self->_check_pid_file(
pid_file => $pid_file,
pid => $PID,
);
if ( $old_pid ) {
warn "Overwriting PID file $pid_file because PID $old_pid "
. "is not running.\n";
}
PTDEBUG && _d('PID file exists; it contains PID', $pid);
if ( $pid ) {
my $pid_is_alive = kill 0, $pid;
$self->_update_pid_file(
pid => $PID,
pid_file => $pid_file
);
}
else {
die "Error creating PID file $pid_file: $e\n";
}
}
return;
}
sub _check_pid_file {
my ($self, %args) = @_;
my @required_args = qw(pid_file pid);
foreach my $arg ( @required_args ) {
die "I need a $arg argument" unless $args{$arg};
};
my $pid_file = $args{pid_file};
my $pid = $args{pid};
PTDEBUG && _d('Checking if PID in', $pid_file, 'is running');
if ( ! -f $pid_file ) {
PTDEBUG && _d('PID file', $pid_file, 'does not exist');
return;
}
open my $fh, '<', $pid_file
or die "Error opening $pid_file: $OS_ERROR";
my $existing_pid = do { local $/; <$fh> };
chomp($existing_pid) if $existing_pid;
close $fh
or die "Error closing $pid_file: $OS_ERROR";
if ( $existing_pid ) {
if ( $existing_pid == $pid ) {
warn "The current PID $pid already holds the PID file $pid_file\n";
return;
}
else {
PTDEBUG && _d('Checking if PID', $existing_pid, 'is running');
my $pid_is_alive = kill 0, $existing_pid;
if ( $pid_is_alive ) {
die "The PID file $PID_file already exists "
. " and the PID that it contains, $pid, is running";
die "PID file $pid_file exists and PID $existing_pid is running\n";
}
else {
warn "Overwriting PID file $PID_file because the PID that it "
. "contains, $pid, is not running";
}
}
else {
die "The PID file $PID_file already exists but it does not "
. "contain a PID";
}
}
else {
PTDEBUG && _d('No PID file');
die "PID file $pid_file exists but it is empty. Remove the file "
. "if the process is no longer running.\n";
}
return $existing_pid;
}
# Write a PID into a PID file, replacing whatever the file held before.
#
# Required named arguments:
#   pid      - PID to record
#   pid_file - path of the file to (over)write
#
# Dies if either argument is missing or false, if the file cannot be
# opened for writing, or if the write fails.  A failed close is only
# reported with a warning.  Returns nothing.
sub _update_pid_file {
   my ($self, %args) = @_;

   for my $required ( qw(pid pid_file) ) {
      $args{$required} or die "I need a $required argument";
   }
   my ($pid, $pid_file) = @args{qw(pid pid_file)};

   # '>' truncates, so any previous PID in the file is discarded.
   open(my $out, '>', $pid_file)
      or die "Cannot open $pid_file: $OS_ERROR";

   unless ( print { $out } $pid, "\n" ) {
      die "Cannot print to $pid_file: $OS_ERROR";
   }

   unless ( close $out ) {
      warn "Cannot close $pid_file: $OS_ERROR";
   }

   return;
}
# Create the PID file for a script that has not daemonized, and record
# the current process as its owner in $self->{PID_owner}.
#
# Dies if $self has a 'child' key, because this method must not be
# called for daemonized scripts.  Returns nothing.
sub make_PID_file {
   my ( $self ) = @_;

   die "Do not call Daemon::make_PID_file() for daemonized scripts"
      if exists $self->{child};

   $self->_make_PID_file();
   $self->{PID_owner} = $PID;

   return;
}
# Write the current process's PID to $self->{PID_file}.
#
# Does nothing (beyond a debug message) when no PID file is configured.
# Otherwise calls check_PID_file() first, then opens the file for
# writing, prints $PID to it, and closes it.  Dies if the file cannot
# be opened, written, or closed.  Returns nothing.
sub _make_PID_file {
   my ( $self ) = @_;

   my $pid_path = $self->{PID_file};
   unless ( $pid_path ) {
      PTDEBUG && _d('No PID file to create');
      return;
   }

   $self->check_PID_file();

   open(my $out, '>', $pid_path)
      or die "Cannot open PID file $pid_path: $OS_ERROR";

   # Note: the PID is written without a trailing newline.
   unless ( print { $out } $PID ) {
      die "Cannot print to PID file $pid_path: $OS_ERROR";
   }

   unless ( close $out ) {
      die "Cannot close PID file $pid_path: $OS_ERROR";
   }

   PTDEBUG && _d('Created PID file:', $self->{PID_file});
   return;
}
sub _remove_PID_file {
my ( $self ) = @_;
if ( $self->{PID_file} && -f $self->{PID_file} ) {
unlink $self->{PID_file}
or warn "Cannot remove PID file $self->{PID_file}: $OS_ERROR";
sub remove_pid_file {
my ($self, $pid_file) = @_;
$pid_file ||= $self->{pid_file};
if ( $pid_file && -f $pid_file ) {
unlink $self->{pid_file}
or warn "Cannot remove PID file $pid_file: $OS_ERROR";
PTDEBUG && _d('Removed PID file');
}
else {
@@ -7090,20 +7147,15 @@ sub _remove_PID_file {
}
sub DESTROY {
my ( $self ) = @_;
my ($self) = @_;
$self->_remove_PID_file() if ($self->{PID_owner} || 0) == $PID;
if ( $self->{pid_file_owner} == $PID ) {
$self->remove_pid_file();
}
return;
}
# Read and return the entire contents of a file.
#
# Returns nothing when $file is false.  Dies if the file cannot be
# opened.  The read happens in the caller's context (scalar or list).
sub slurp_file {
   my ($file) = @_;

   if ( !$file ) {
      return;
   }

   open(my $fh, '<', $file)
      or die "Cannot open $file: $OS_ERROR";

   # Undefine the record separator for the rest of this sub so a single
   # read returns the whole file.
   local $INPUT_RECORD_SEPARATOR;
   return <$fh>;
}
sub _d {
my ($package, undef, $line) = caller 0;
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
@@ -8882,6 +8934,22 @@ use sigtrap 'handler', \&sig_int, 'normal-signals';
my $oktorun = 1;
my $print_header = 1;
my $exit_status = 0;
# "exit codes 1 - 2, 126 - 165, and 255 [1] have special meanings,
# and should therefore be avoided for user-specified exit parameters"
# http://www.tldp.org/LDP/abs/html/exitcodes.html
our %PTC_EXIT_STATUS = (
# General flags:
ALREADY_RUNNING => 4,
NO_SLAVES_FOUND => 8,
CAUGHT_SIGNAL => 16,
ERROR => 32,
# Tool-specific flags:
TABLE_DIFF => 512,
SKIP_CHUNK => 1024,
SKIP_TABLE => 2048,
);
# The following two hashes are used in exec_nibble().
# They're static, so they do not need to be reset in main().
@@ -8912,8 +8980,7 @@ sub main {
local @ARGV = @_;
$oktorun = 1;
$print_header = 1;
my $exit_status = 0;
$exit_status = 0;
# ########################################################################
# Get configuration information.
@@ -9003,13 +9070,26 @@ sub main {
# ########################################################################
# If --pid, check it first since we'll die if it already exists.
# ########################################################################
my $daemon;
if ( $o->get('pid') ) {
# We're not daemonizing, it just handles PID stuff. Keep $daemon
# in the scope of main() because when it's destroyed it automatically
# removes the PID file.
$daemon = new Daemon(o=>$o);
$daemon->make_PID_file();
my $pid_file = $o->get('pid');
my $daemon = new Daemon(
pid_file => $pid_file,
);
eval {
$daemon->run();
};
if ( my $e = $EVAL_ERROR ) {
# TODO quite hackish but it should work for now
if ( $e =~ m/PID file $pid_file exists/ ) {
$exit_status |= $PTC_EXIT_STATUS{ALREADY_RUNNING};
warn "$e\n";
return $exit_status;
}
else {
die $e;
}
}
# ########################################################################
@@ -9222,7 +9302,7 @@ sub main {
&& (($o->get('recursion-method')->[0] || '') ne 'none'
|| $autodiscover_cluster))
{
$exit_status |= 1;
$exit_status |= $PTC_EXIT_STATUS{NO_SLAVES_FOUND};
if ( $o->get('quiet') < 2 ) {
my $type = $autodiscover_cluster ? 'cluster nodes' : 'slaves';
warn "Diffs cannot be detected because no $type were found. "
@@ -9394,7 +9474,7 @@ sub main {
$slave->name());
$diffs = filter_tables_replicate_check_only($diffs, $o);
if ( @$diffs ) {
$exit_status |= 1;
$exit_status |= $PTC_EXIT_STATUS{TABLE_DIFF};
if ( $o->get('quiet') < 2 ) {
print_checksum_diffs(
cxn => $slave,
@@ -9698,7 +9778,7 @@ sub main {
. "This can break replication. If you understand the risks, "
. "specify --no-check-slave-tables to disable this check.\n";
warn ts($msg);
$tbl->{checksum_results}->{errors}++;
$exit_status |= $PTC_EXIT_STATUS{SKIP_TABLE};
$oktonibble = 0;
}
}
@@ -9759,7 +9839,7 @@ sub main {
. " * chunk size limit=$chunk_size_limit).\n";
warn ts($msg);
}
$tbl->{checksum_results}->{errors}++;
$exit_status |= $PTC_EXIT_STATUS{SKIP_TABLE};
$oktonibble = 0;
}
}
@@ -9935,6 +10015,7 @@ sub main {
# Nibble time will be undefined if the chunk was skipped.
if ( !defined $tbl->{nibble_time} ) {
PTDEBUG && _d('Skipping chunk', $chunk);
$exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK};
$tbl->{checksum_results}->{skipped}++;
return;
}
@@ -10075,6 +10156,7 @@ sub main {
# statement in RowChecksum::find_replication_differences()
# for the full list of columns.
map { $diff_chunks{ $_->{chunk} }++ } @$diffs;
$exit_status |= $PTC_EXIT_STATUS{TABLE_DIFF};
}
};
if ($EVAL_ERROR) {
@@ -10141,6 +10223,7 @@ sub main {
if ( $EVAL_ERROR ) {
warn ts("Skipping table $tbl->{db}.$tbl->{tbl} because "
. "$EVAL_ERROR\n");
$exit_status |= $PTC_EXIT_STATUS{SKIP_TABLE};
return;
}
@@ -10230,9 +10313,8 @@ sub main {
}
# Update the tool's exit status.
if ( $tbl->{checksum_results}->{errors}
|| $tbl->{checksum_results}->{diffs} ) {
$exit_status |= 1;
if ( $tbl->{checksum_results}->{errors} ) {
$exit_status |= $PTC_EXIT_STATUS{ERROR};
}
}
@@ -10281,6 +10363,7 @@ sub nibble_is_safe {
. ($expl->{key} ? "the $expl->{key}" : "no") . " index "
. " instead of the " . $nibble_iter->nibble_index() . "index.\n");
}
$exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK};
return 0; # not safe
}
@@ -10304,6 +10387,7 @@ sub nibble_is_safe {
. "that there are " . ($expl->{rows} || 0)
. " rows in the chunk.\n");
}
$exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK};
return 0; # not safe
}
}
@@ -10322,6 +10406,7 @@ sub nibble_is_safe {
. $tbl->{key_len} . ". See the --[no]check-plan documentation "
. "for more information.\n");
}
$exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK};
return 0; # not safe
}
@@ -11077,13 +11162,14 @@ sub wait_for_last_checksum {
# Catches signals so we can exit gracefully.
sub sig_int {
my ( $signal ) = @_;
$exit_status |= $PTC_EXIT_STATUS{CAUGHT_SIGNAL};
if ( $oktorun ) {
print STDERR "# Caught SIG$signal.\n";
warn "# Caught SIG$signal.\n";
$oktorun = 0;
}
else {
print STDERR "# Exiting on SIG$signal.\n";
exit 1;
warn "# Exiting on SIG$signal.\n";
exit $exit_status;
}
}
@@ -11130,8 +11216,8 @@ Usage: pt-table-checksum [OPTIONS] [DSN]
pt-table-checksum performs an online replication consistency check by executing
checksum queries on the master, which produces different results on replicas
that are inconsistent with the master. The optional DSN specifies the master
host. The tool's exit status is nonzero if any differences are found, or if any
warnings or errors occur.
host. The tool's L<"EXIT STATUS"> is non-zero if any differences are found,
or if any warnings or errors occur.
The following command will connect to the replication master on localhost,
checksum every table, and report the results on every detected replica:
@@ -11509,7 +11595,31 @@ The index values that define the upper boundary of the chunk.
=head1 EXIT STATUS
A non-zero exit status indicates errors, warnings, or checksum differences.
pt-table-checksum has three kinds of exit status: zero, 255, or any other
value, which is a bitmask with flags for different problems.
A zero exit status indicates no errors, warnings, checksum differences,
or skipped chunks or tables.
A 255 exit status indicates a fatal error. In other words: the tool died
or crashed. The error is printed to C<STDERR>.
If the exit status is not zero or 255, then its value functions as a bitmask
with these flags:
FLAG BIT VALUE MEANING
================ ========= ==========================================
ALREADY_RUNNING 4 --pid file exists and the PID is running
NO_SLAVES_FOUND 8 No replicas or cluster nodes were found
CAUGHT_SIGNAL 16 Caught SIGHUP, SIGINT, SIGPIPE, or SIGTERM
ERROR 32 A non-fatal error occurred
TABLE_DIFF 512 At least one diff was found
SKIP_CHUNK 1024 At least one chunk was skipped
SKIP_TABLE 2048 At least one table was skipped
If any flag is set, the exit status will be non-zero. Use the bitwise C<AND>
operation to check for a particular flag. For example, if C<$exit_status & 512>
is true, then at least one diff was found.
=head1 OPTIONS

View File

@@ -123,7 +123,7 @@ $exit_status = pt_table_checksum::main(@args,
is(
$exit_status,
1,
512, # = TABLE_DIFF but nothing else; https://bugs.launchpad.net/percona-toolkit/+bug/944051
"--replicate-check on by default, detects diff"
);
@@ -297,7 +297,7 @@ is_deeply(
is(
$exit_status,
1,
2048,
"Non-zero exit status"
);
@@ -386,8 +386,8 @@ like(
is(
$exit_status,
1,
"Exit status 1 when no slaves are found (bug 1087804)"
8, # https://bugs.launchpad.net/percona-toolkit/+bug/944051
"Exit status 8 when no slaves are found (bug 1087804)"
) or diag($output);
# #############################################################################

View File

@@ -171,7 +171,7 @@ $output = output(sub {
is(
$exit_status,
0,
1024,
"Bad key_len chunks are not errors"
) or diag($output);

View File

@@ -153,7 +153,7 @@ like(
is(
$exit_status,
1,
2048, # https://bugs.launchpad.net/percona-toolkit/+bug/944051
"Non-zero exit status (bug 1009510)"
);

View File

@@ -26,9 +26,6 @@ if ( !$master_dbh ) {
elsif ( !$slave_dbh ) {
plan skip_all => 'Cannot connect to sandbox slave1';
}
else {
plan tests => 6;
}
# The sandbox servers run with lock_wait_timeout=3 and it's not dynamic
# so we need to specify --set-vars innodb_lock_wait_timeout=3 else the tool will die.
@@ -116,19 +113,26 @@ like(
diag(`rm -rf $pid_file >/dev/null 2>&1`);
diag(`touch $pid_file`);
eval {
pt_table_checksum::main(@args, $cnf, '--pid', $pid_file);
};
$output = output(
sub { $exit_status = pt_table_checksum::main(@args, $cnf, '--pid', $pid_file) },
stderr => 1,
);
like(
$EVAL_ERROR,
qr/PID file $pid_file already exists/,
$output,
qr/PID file $pid_file exists/,
'Dies if PID file already exists (issue 391)'
);
is(
$exit_status,
4,
"Exit status 4 if if PID file already exist (bug 944051)"
);
diag(`rm -rf $pid_file >/dev/null 2>&1`);
# #############################################################################
# Done.
# #############################################################################
ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
exit;
done_testing;