diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index 5da172fb..dff7361c 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -6930,157 +6930,214 @@ package Daemon; use strict; use warnings FATAL => 'all'; use English qw(-no_match_vars); + use constant PTDEBUG => $ENV{PTDEBUG} || 0; use POSIX qw(setsid); +use Fcntl qw(:DEFAULT); sub new { - my ( $class, %args ) = @_; - foreach my $arg ( qw(o) ) { - die "I need a $arg argument" unless $args{$arg}; - } - my $o = $args{o}; + my ($class, %args) = @_; my $self = { - o => $o, - log_file => $o->has('log') ? $o->get('log') : undef, - PID_file => $o->has('pid') ? $o->get('pid') : undef, + log_file => $args{log_file}, + pid_file => $args{pid_file}, + daemonize => $args{daemonize}, + force_log_file => $args{force_log_file}, + parent_exit => $args{parent_exit}, + pid_file_owner => 0, }; - - check_PID_file(undef, $self->{PID_file}); - - PTDEBUG && _d('Daemonized child will log to', $self->{log_file}); return bless $self, $class; } -sub daemonize { - my ( $self ) = @_; +sub run { + my ($self) = @_; - PTDEBUG && _d('About to fork and daemonize'); - defined (my $pid = fork()) or die "Cannot fork: $OS_ERROR"; - if ( $pid ) { - PTDEBUG && _d('Parent PID', $PID, 'exiting after forking child PID',$pid); - exit; - } + my $daemonize = $self->{daemonize}; + my $pid_file = $self->{pid_file}; + my $log_file = $self->{log_file}; + my $force_log_file = $self->{force_log_file}; + my $parent_exit = $self->{parent_exit}; - PTDEBUG && _d('Daemonizing child PID', $PID); - $self->{PID_owner} = $PID; - $self->{child} = 1; + PTDEBUG && _d('Starting daemon'); - POSIX::setsid() or die "Cannot start a new session: $OS_ERROR"; - chdir '/' or die "Cannot chdir to /: $OS_ERROR"; - - $self->_make_PID_file(); - - $OUTPUT_AUTOFLUSH = 1; - - PTDEBUG && _d('Redirecting STDIN to /dev/null'); - close STDIN; - open STDIN, '/dev/null' - or die "Cannot reopen STDIN to /dev/null: $OS_ERROR"; - - if ( $self->{log_file} ) { - PTDEBUG && 
_d('Redirecting STDOUT and STDERR to', $self->{log_file}); - close STDOUT; - open STDOUT, '>>', $self->{log_file} - or die "Cannot open log file $self->{log_file}: $OS_ERROR"; - - close STDERR; - open STDERR, ">&STDOUT" - or die "Cannot dupe STDERR to STDOUT: $OS_ERROR"; - } - else { - if ( -t STDOUT ) { - PTDEBUG && _d('No log file and STDOUT is a terminal;', - 'redirecting to /dev/null'); - close STDOUT; - open STDOUT, '>', '/dev/null' - or die "Cannot reopen STDOUT to /dev/null: $OS_ERROR"; - } - if ( -t STDERR ) { - PTDEBUG && _d('No log file and STDERR is a terminal;', - 'redirecting to /dev/null'); - close STDERR; - open STDERR, '>', '/dev/null' - or die "Cannot reopen STDERR to /dev/null: $OS_ERROR"; - } - } - - return; -} - -sub check_PID_file { - my ( $self, $file ) = @_; - my $PID_file = $self ? $self->{PID_file} : $file; - PTDEBUG && _d('Checking PID file', $PID_file); - if ( $PID_file && -f $PID_file ) { - my $pid; + if ( $pid_file ) { eval { - chomp($pid = (slurp_file($PID_file) || '')); + $self->_make_pid_file( + pid => $PID, # parent's pid + pid_file => $pid_file, + ); }; - if ( $EVAL_ERROR ) { - die "The PID file $PID_file already exists but it cannot be read: " - . $EVAL_ERROR; + die "$EVAL_ERROR\n" if $EVAL_ERROR; + if ( !$daemonize ) { + $self->{pid_file_owner} = $PID; # parent's pid } - PTDEBUG && _d('PID file exists; it contains PID', $pid); - if ( $pid ) { - my $pid_is_alive = kill 0, $pid; - if ( $pid_is_alive ) { - die "The PID file $PID_file already exists " - . " and the PID that it contains, $pid, is running"; - } - else { - warn "Overwriting PID file $PID_file because the PID that it " - . 
"contains, $pid, is not running"; - } + } + + if ( $daemonize ) { + defined (my $child_pid = fork()) or die "Cannot fork: $OS_ERROR"; + if ( $child_pid ) { + PTDEBUG && _d('Forked child', $child_pid); + $parent_exit->($child_pid) if $parent_exit; + exit 0; + } + + POSIX::setsid() or die "Cannot start a new session: $OS_ERROR"; + chdir '/' or die "Cannot chdir to /: $OS_ERROR"; + + if ( $pid_file ) { + $self->_update_pid_file( + pid => $PID, # child's pid + pid_file => $pid_file, + ); + $self->{pid_file_owner} = $PID; + } + } + + if ( $daemonize || $force_log_file ) { + PTDEBUG && _d('Redirecting STDIN to /dev/null'); + close STDIN; + open STDIN, '/dev/null' + or die "Cannot reopen STDIN to /dev/null: $OS_ERROR"; + if ( $log_file ) { + PTDEBUG && _d('Redirecting STDOUT and STDERR to', $log_file); + close STDOUT; + open STDOUT, '>>', $log_file + or die "Cannot open log file $log_file: $OS_ERROR"; + + close STDERR; + open STDERR, ">&STDOUT" + or die "Cannot dupe STDERR to STDOUT: $OS_ERROR"; } else { - die "The PID file $PID_file already exists but it does not " - . 
"contain a PID"; + if ( -t STDOUT ) { + PTDEBUG && _d('No log file and STDOUT is a terminal;', + 'redirecting to /dev/null'); + close STDOUT; + open STDOUT, '>', '/dev/null' + or die "Cannot reopen STDOUT to /dev/null: $OS_ERROR"; + } + if ( -t STDERR ) { + PTDEBUG && _d('No log file and STDERR is a terminal;', + 'redirecting to /dev/null'); + close STDERR; + open STDERR, '>', '/dev/null' + or die "Cannot reopen STDERR to /dev/null: $OS_ERROR"; + } + } + + $OUTPUT_AUTOFLUSH = 1; + } + + PTDEBUG && _d('Daemon running'); + return; +} + +sub _make_pid_file { + my ($self, %args) = @_; + my @required_args = qw(pid pid_file); + foreach my $arg ( @required_args ) { + die "I need a $arg argument" unless $args{$arg}; + }; + my $pid = $args{pid}; + my $pid_file = $args{pid_file}; + + eval { + sysopen(PID_FH, $pid_file, O_RDWR|O_CREAT|O_EXCL) or die $OS_ERROR; + print PID_FH $PID, "\n"; + close PID_FH; + }; + if ( my $e = $EVAL_ERROR ) { + if ( $e =~ m/file exists/i ) { + my $old_pid = $self->_check_pid_file( + pid_file => $pid_file, + pid => $PID, + ); + if ( $old_pid ) { + warn "Overwriting PID file $pid_file because PID $old_pid " + . 
"is not running.\n"; + } + $self->_update_pid_file( + pid => $PID, + pid_file => $pid_file + ); + } + else { + die "Error creating PID file $pid_file: $e\n"; } } - else { - PTDEBUG && _d('No PID file'); - } + return; } -sub make_PID_file { - my ( $self ) = @_; - if ( exists $self->{child} ) { - die "Do not call Daemon::make_PID_file() for daemonized scripts"; - } - $self->_make_PID_file(); - $self->{PID_owner} = $PID; - return; -} +sub _check_pid_file { + my ($self, %args) = @_; + my @required_args = qw(pid_file pid); + foreach my $arg ( @required_args ) { + die "I need a $arg argument" unless $args{$arg}; + }; + my $pid_file = $args{pid_file}; + my $pid = $args{pid}; -sub _make_PID_file { - my ( $self ) = @_; + PTDEBUG && _d('Checking if PID in', $pid_file, 'is running'); - my $PID_file = $self->{PID_file}; - if ( !$PID_file ) { - PTDEBUG && _d('No PID file to create'); + if ( ! -f $pid_file ) { + PTDEBUG && _d('PID file', $pid_file, 'does not exist'); return; } - $self->check_PID_file(); + open my $fh, '<', $pid_file + or die "Error opening $pid_file: $OS_ERROR"; + my $existing_pid = do { local $/; <$fh> }; + chomp($existing_pid) if $existing_pid; + close $fh + or die "Error closing $pid_file: $OS_ERROR"; - open my $PID_FH, '>', $PID_file - or die "Cannot open PID file $PID_file: $OS_ERROR"; - print $PID_FH $PID - or die "Cannot print to PID file $PID_file: $OS_ERROR"; - close $PID_FH - or die "Cannot close PID file $PID_file: $OS_ERROR"; + if ( $existing_pid ) { + if ( $existing_pid == $pid ) { + warn "The current PID $pid already holds the PID file $pid_file\n"; + return; + } + else { + PTDEBUG && _d('Checking if PID', $existing_pid, 'is running'); + my $pid_is_alive = kill 0, $existing_pid; + if ( $pid_is_alive ) { + die "PID file $pid_file exists and PID $existing_pid is running\n"; + } + } + } + else { + die "PID file $pid_file exists but it is empty. Remove the file " + . 
"if the process is no longer running.\n"; + } + + return $existing_pid; +} + +sub _update_pid_file { + my ($self, %args) = @_; + my @required_args = qw(pid pid_file); + foreach my $arg ( @required_args ) { + die "I need a $arg argument" unless $args{$arg}; + }; + my $pid = $args{pid}; + my $pid_file = $args{pid_file}; + + open my $fh, '>', $pid_file + or die "Cannot open $pid_file: $OS_ERROR"; + print { $fh } $pid, "\n" + or die "Cannot print to $pid_file: $OS_ERROR"; + close $fh + or warn "Cannot close $pid_file: $OS_ERROR"; - PTDEBUG && _d('Created PID file:', $self->{PID_file}); return; } -sub _remove_PID_file { - my ( $self ) = @_; - if ( $self->{PID_file} && -f $self->{PID_file} ) { - unlink $self->{PID_file} - or warn "Cannot remove PID file $self->{PID_file}: $OS_ERROR"; +sub remove_pid_file { + my ($self, $pid_file) = @_; + $pid_file ||= $self->{pid_file}; + if ( $pid_file && -f $pid_file ) { + unlink $self->{pid_file} + or warn "Cannot remove PID file $pid_file: $OS_ERROR"; PTDEBUG && _d('Removed PID file'); } else { @@ -7090,20 +7147,15 @@ sub _remove_PID_file { } sub DESTROY { - my ( $self ) = @_; + my ($self) = @_; - $self->_remove_PID_file() if ($self->{PID_owner} || 0) == $PID; + if ( $self->{pid_file_owner} == $PID ) { + $self->remove_pid_file(); + } return; } -sub slurp_file { - my ($file) = @_; - return unless $file; - open my $fh, "<", $file or die "Cannot open $file: $OS_ERROR"; - return do { local $/; <$fh> }; -} - sub _d { my ($package, undef, $line) = caller 0; @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } @@ -8882,6 +8934,22 @@ use sigtrap 'handler', \&sig_int, 'normal-signals'; my $oktorun = 1; my $print_header = 1; +my $exit_status = 0; + +# "exit codes 1 - 2, 126 - 165, and 255 [1] have special meanings, +# and should therefore be avoided for user-specified exit parameters" +# http://www.tldp.org/LDP/abs/html/exitcodes.html +our %PTC_EXIT_STATUS = ( + # General flags: + ALREADY_RUNNING => 4, + NO_SLAVES_FOUND => 8, + CAUGHT_SIGNAL => 
16, + ERROR => 32, + # Tool-specific flags: + TABLE_DIFF => 512, + SKIP_CHUNK => 1024, + SKIP_TABLE => 2048, +); # The following two hashes are used in exec_nibble(). # They're static, so they do not need to be reset in main(). @@ -8912,8 +8980,7 @@ sub main { local @ARGV = @_; $oktorun = 1; $print_header = 1; - - my $exit_status = 0; + $exit_status = 0; # ######################################################################## # Get configuration information. @@ -9003,13 +9070,26 @@ sub main { # ######################################################################## # If --pid, check it first since we'll die if it already exists. # ######################################################################## - my $daemon; - if ( $o->get('pid') ) { - # We're not daemoninzing, it just handles PID stuff. Keep $daemon - # in the the scope of main() because when it's destroyed it automatically - # removes the PID file. - $daemon = new Daemon(o=>$o); - $daemon->make_PID_file(); + # We're not daemoninzing, it just handles PID stuff. Keep $daemon + # in the the scope of main() because when it's destroyed it automatically + # removes the PID file. + my $pid_file = $o->get('pid'); + my $daemon = new Daemon( + pid_file => $pid_file, + ); + eval { + $daemon->run(); + }; + if ( my $e = $EVAL_ERROR ) { + # TODO quite hackish but it should work for now + if ( $e =~ m/PID file $pid_file exists/ ) { + $exit_status |= $PTC_EXIT_STATUS{ALREADY_RUNNING}; + warn "$e\n"; + return $exit_status; + } + else { + die $e; + } } # ######################################################################## @@ -9222,7 +9302,7 @@ sub main { && (($o->get('recursion-method')->[0] || '') ne 'none' || $autodiscover_cluster)) { - $exit_status |= 1; + $exit_status |= $PTC_EXIT_STATUS{NO_SLAVES_FOUND}; if ( $o->get('quiet') < 2 ) { my $type = $autodiscover_cluster ? 'cluster nodes' : 'slaves'; warn "Diffs cannot be detected because no $type were found. 
" @@ -9394,7 +9474,7 @@ sub main { $slave->name()); $diffs = filter_tables_replicate_check_only($diffs, $o); if ( @$diffs ) { - $exit_status |= 1; + $exit_status |= $PTC_EXIT_STATUS{TABLE_DIFF}; if ( $o->get('quiet') < 2 ) { print_checksum_diffs( cxn => $slave, @@ -9698,7 +9778,7 @@ sub main { . "This can break replication. If you understand the risks, " . "specify --no-check-slave-tables to disable this check.\n"; warn ts($msg); - $tbl->{checksum_results}->{errors}++; + $exit_status |= $PTC_EXIT_STATUS{SKIP_TABLE}; $oktonibble = 0; } } @@ -9759,7 +9839,7 @@ sub main { . " * chunk size limit=$chunk_size_limit).\n"; warn ts($msg); } - $tbl->{checksum_results}->{errors}++; + $exit_status |= $PTC_EXIT_STATUS{SKIP_TABLE}; $oktonibble = 0; } } @@ -9935,6 +10015,7 @@ sub main { # Nibble time will be zero if the chunk was skipped. if ( !defined $tbl->{nibble_time} ) { PTDEBUG && _d('Skipping chunk', $chunk); + $exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK}; $tbl->{checksum_results}->{skipped}++; return; } @@ -10075,6 +10156,7 @@ sub main { # statement in RowChecksum::find_replication_differences() # for the full list of columns. map { $diff_chunks{ $_->{chunk} }++ } @$diffs; + $exit_status |= $PTC_EXIT_STATUS{TABLE_DIFF}; } }; if ($EVAL_ERROR) { @@ -10141,6 +10223,7 @@ sub main { if ( $EVAL_ERROR ) { warn ts("Skipping table $tbl->{db}.$tbl->{tbl} because " . "$EVAL_ERROR\n"); + $exit_status |= $PTC_EXIT_STATUS{SKIP_TABLE}; return; } @@ -10230,9 +10313,8 @@ sub main { } # Update the tool's exit status. - if ( $tbl->{checksum_results}->{errors} - || $tbl->{checksum_results}->{diffs} ) { - $exit_status |= 1; + if ( $tbl->{checksum_results}->{errors} ) { + $exit_status |= $PTC_EXIT_STATUS{ERROR}; } } @@ -10281,6 +10363,7 @@ sub nibble_is_safe { . ($expl->{key} ? "the $expl->{key}" : "no") . " index " . " instead of the " . $nibble_iter->nibble_index() . 
"index.\n"); } + $exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK}; return 0; # not safe } @@ -10304,6 +10387,7 @@ sub nibble_is_safe { . "that there are " . ($expl->{rows} || 0) . " rows in the chunk.\n"); } + $exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK}; return 0; # not safe } } @@ -10322,6 +10406,7 @@ sub nibble_is_safe { . $tbl->{key_len} . ". See the --[no]check-plan documentation " . "for more information.\n"); } + $exit_status |= $PTC_EXIT_STATUS{SKIP_CHUNK}; return 0; # not safe } @@ -11077,13 +11162,14 @@ sub wait_for_last_checksum { # Catches signals so we can exit gracefully. sub sig_int { my ( $signal ) = @_; + $exit_status |= $PTC_EXIT_STATUS{CAUGHT_SIGNAL}; if ( $oktorun ) { - print STDERR "# Caught SIG$signal.\n"; + warn "# Caught SIG$signal.\n"; $oktorun = 0; } else { - print STDERR "# Exiting on SIG$signal.\n"; - exit 1; + warn "# Exiting on SIG$signal.\n"; + exit $exit_status; } } @@ -11130,8 +11216,8 @@ Usage: pt-table-checksum [OPTIONS] [DSN] pt-table-checksum performs an online replication consistency check by executing checksum queries on the master, which produces different results on replicas that are inconsistent with the master. The optional DSN specifies the master -host. The tool's exit status is nonzero if any differences are found, or if any -warnings or errors occur. +host. The tool's L<"EXIT STATUS"> is non-zero if any differences are found, +or if any warnings or errors occur. The following command will connect to the replication master on localhost, checksum every table, and report the results on every detected replica: @@ -11509,7 +11595,31 @@ The index values that define the upper boundary of the chunk. =head1 EXIT STATUS -A non-zero exit status indicates errors, warnings, or checksum differences. +pt-table-checksum has three possible exit statuses: zero, 255, and any other +value is a bitmask with flags for different problems. + +A zero exit status indicates no errors, warnings, or checksum differences, +or skipped chunks or tables. 
+ +A 255 exit status indicates a fatal error. In other words: the tool died +or crashed. The error is printed to C<STDERR>. + +If the exit status is not zero or 255, then its value functions as a bitmask +with these flags: + + FLAG BIT VALUE MEANING + ================ ========= ========================================== + ALREADY_RUNNING 4 --pid file exists and the PID is running + NO_SLAVES_FOUND 8 No replicas or cluster nodes were found + CAUGHT_SIGNAL 16 Caught SIGHUP, SIGINT, SIGPIPE, or SIGTERM + ERROR 32 A non-fatal error occurred + TABLE_DIFF 512 At least one diff was found + SKIP_CHUNK 1024 At least one chunk was skipped + SKIP_TABLE 2048 At least one table was skipped + +If any flag is set, the exit status will be non-zero. Use the bitwise C<AND> +operation to check for a particular flag. For example, if C<$exit_status & 512> +is true, then at least one diff was found. =head1 OPTIONS diff --git a/t/pt-table-checksum/basics.t b/t/pt-table-checksum/basics.t index 51695ca8..a49bdf4b 100644 --- a/t/pt-table-checksum/basics.t +++ b/t/pt-table-checksum/basics.t @@ -123,7 +123,7 @@ $exit_status = pt_table_checksum::main(@args, is( $exit_status, - 1, + 512, # = TABLE_DIFF but nothing else; https://bugs.launchpad.net/percona-toolkit/+bug/944051 "--replicate-check on by default, detects diff" ); @@ -297,7 +297,7 @@ is_deeply( is( $exit_status, - 1, + 2048, "Non-zero exit status" ); @@ -386,8 +386,8 @@ like( is( $exit_status, - 1, - "Exit status 1 when no slaves are found (bug 1087804)" + 8, # https://bugs.launchpad.net/percona-toolkit/+bug/944051 + "Exit status 8 when no slaves are found (bug 1087804)" ) or diag($output); # ############################################################################# diff --git a/t/pt-table-checksum/chunk_index.t b/t/pt-table-checksum/chunk_index.t index 5433401e..33adb505 100644 --- a/t/pt-table-checksum/chunk_index.t +++ b/t/pt-table-checksum/chunk_index.t @@ -171,7 +171,7 @@ $output = output(sub { is( $exit_status, - 0, + 1024, "Bad key_len 
chunks are not errors" ) or diag($output); diff --git a/t/pt-table-checksum/error_handling.t b/t/pt-table-checksum/error_handling.t index 3289cd11..48a54216 100644 --- a/t/pt-table-checksum/error_handling.t +++ b/t/pt-table-checksum/error_handling.t @@ -153,7 +153,7 @@ like( is( $exit_status, - 1, + 2048, # https://bugs.launchpad.net/percona-toolkit/+bug/944051 "Non-zero exit status (bug 1009510)" ); diff --git a/t/pt-table-checksum/standard_options.t b/t/pt-table-checksum/standard_options.t index a1c2c987..85297df0 100644 --- a/t/pt-table-checksum/standard_options.t +++ b/t/pt-table-checksum/standard_options.t @@ -26,9 +26,6 @@ if ( !$master_dbh ) { elsif ( !$slave_dbh ) { plan skip_all => 'Cannot connect to sandbox slave1'; } -else { - plan tests => 6; -} # The sandbox servers run with lock_wait_timeout=3 and it's not dynamic # so we need to specify --set-vars innodb_lock_wait_timeout=3 else the tool will die. @@ -116,19 +113,26 @@ like( diag(`rm -rf $pid_file >/dev/null 2>&1`); diag(`touch $pid_file`); -eval { - pt_table_checksum::main(@args, $cnf, '--pid', $pid_file); -}; +$output = output( + sub { $exit_status = pt_table_checksum::main(@args, $cnf, '--pid', $pid_file) }, + stderr => 1, +); like( - $EVAL_ERROR, - qr/PID file $pid_file already exists/, + $output, + qr/PID file $pid_file exists/, 'Dies if PID file already exists (issue 391)' ); +is( + $exit_status, + 4, + "Exit status 4 if if PID file already exist (bug 944051)" +); + diag(`rm -rf $pid_file >/dev/null 2>&1`); # ############################################################################# # Done. # ############################################################################# ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox"); -exit; +done_testing;