diff --git a/bin/pt-heartbeat b/bin/pt-heartbeat index fa6da080..f7e04045 100755 --- a/bin/pt-heartbeat +++ b/bin/pt-heartbeat @@ -20,6 +20,7 @@ BEGIN { Daemon Quoter TableParser + Retry Transformers VersionCheck HTTPMicro @@ -2920,6 +2921,84 @@ sub _d { # End TableParser package # ########################################################################### +# ########################################################################### +# Retry package +# This package is a copy without comments from the original. The original +# with comments and its test file can be found in the Bazaar repository at, +# lib/Retry.pm +# t/lib/Retry.t +# See https://launchpad.net/percona-toolkit for more information. +# ########################################################################### +{ +package Retry; + +use strict; +use warnings FATAL => 'all'; +use English qw(-no_match_vars); +use constant PTDEBUG => $ENV{PTDEBUG} || 0; + +sub new { + my ( $class, %args ) = @_; + my $self = { + %args, + }; + return bless $self, $class; +} + +sub retry { + my ( $self, %args ) = @_; + my @required_args = qw(try fail final_fail); + foreach my $arg ( @required_args ) { + die "I need a $arg argument" unless $args{$arg}; + }; + my ($try, $fail, $final_fail) = @args{@required_args}; + my $wait = $args{wait} || sub { sleep 1; }; + my $tries = $args{tries} || 3; + + my $last_error; + my $tryno = 0; + TRY: + while ( ++$tryno <= $tries ) { + PTDEBUG && _d("Try", $tryno, "of", $tries); + my $result; + eval { + $result = $try->(tryno=>$tryno); + }; + if ( $EVAL_ERROR ) { + PTDEBUG && _d("Try code failed:", $EVAL_ERROR); + $last_error = $EVAL_ERROR; + + if ( $tryno < $tries ) { # more retries + my $retry = $fail->(tryno=>$tryno, error=>$last_error); + last TRY unless $retry; + PTDEBUG && _d("Calling wait code"); + $wait->(tryno=>$tryno); + } + } + else { + PTDEBUG && _d("Try code succeeded"); + return $result; + } + } + + PTDEBUG && _d('Try code did not succeed'); + return $final_fail->(error=>$last_error); +} + +sub _d { + my ($package, undef, $line) = caller 0; + @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } + map { defined $_ ? $_ : 'undef' } + @_; + print STDERR "# $package:$line $PID ", join(' ', @_), "\n"; +} + +1; +} +# ########################################################################### +# End Retry package +# ########################################################################### + # ########################################################################### # Transformers package # This package is a copy without comments from the original. The original @@ -4920,10 +4999,31 @@ sub main { } } - $sth->execute(ts(time), @vals); - PTDEBUG && _d($sth->{Statement}); - $sth->finish(); - + my $retry = Retry->new(); + $retry->retry( + tries => 3, + wait => sub { sleep 0.25; return; }, + try => sub { + $sth->execute(ts(time), @vals); + PTDEBUG && _d($sth->{Statement}); + $sth->finish(); + }, + fail => sub { + my (%args) = @_; + my $error = $args{error}; + if ( $error =~ m/Deadlock found/ ) { + return 1; # try again + } + else { + return 0; + } + }, + final_fail => sub { + my (%args) = @_; + die $args{error}; + } + ); + return; }; } @@ -5387,6 +5487,19 @@ information from C and C. These columns are optional. If any are present, their corresponding information will be saved. +=head1 Percona XtraDB Cluster + +Although pt-heartbeat should work with all supported versions of Percona XtraDB +Cluster (PXC), we recommend using 5.5.28-23.7 and newer. 
+ +If you are setting up heartbeat instances between cluster nodes, keep in mind +that, since the speed of the cluster is determined by its slowest node, +pt-heartbeat will not report how fast the cluster itself is, but only how +fast events are replicating from one node to another. + +You must specify L<"--master-server-id"> for L<"--monitor"> and L<"--check"> +instances. + =head1 OPTIONS Specify at least one of L<"--stop">, L<"--update">, L<"--monitor">, or L<"--check">. diff --git a/sandbox/start-sandbox b/sandbox/start-sandbox index 3b775e24..e14c0fc8 100755 --- a/sandbox/start-sandbox +++ b/sandbox/start-sandbox @@ -52,6 +52,10 @@ make_sandbox() { if [ -n "${master_port}" ]; then local master_listen_port=$(($master_port + 10)) cluster_address="gcomm://$ip:$master_listen_port" + + local this_listen_port=$(($port + 10)) + local this_cluster_address="gcomm://$ip:$this_listen_port" + sed -e "s!gcomm://\$!$this_cluster_address!g" -i.bak "/tmp/$master_port/my.sandbox.cnf" fi sed -e "s/ADDR/$ip/g" -i.bak "/tmp/$port/my.sandbox.cnf" @@ -118,7 +122,7 @@ make_sandbox() { debug_sandbox $port exit 1 fi - + # If the sandbox is a slave, start the slave. if [ "$type" = "slave" ]; then /tmp/$port/use -e "change master to master_host='127.0.0.1', master_user='msandbox', master_password='msandbox', master_port=$master_port" diff --git a/sandbox/test-env b/sandbox/test-env index 2c7e5c39..23eaf561 100755 --- a/sandbox/test-env +++ b/sandbox/test-env @@ -299,6 +299,12 @@ case $opt in exit_status=$((exit_status | $?)) if [ "${2:-""}" = "cluster" ]; then + # Bit of magic here. 'start-sandbox cluster new_node old_node' + # changes old_node's my.sandbox.cnf's wsrep_cluster_address to + # point to new_node. This is especially useful because otherwise, + # calling stop/start like below on 12345 would create a new cluster. + /tmp/12345/stop >/dev/null + /tmp/12345/start >/dev/null echo -n "Checking that the cluster size is correct... 
" size=$(/tmp/12345/use -ss -e "SHOW STATUS LIKE 'wsrep_cluster_size'" | awk '{print $2}') if [ ${size:-0} -ne 3 ]; then diff --git a/t/pt-heartbeat/pxc.t b/t/pt-heartbeat/pxc.t new file mode 100644 index 00000000..dbb458c0 --- /dev/null +++ b/t/pt-heartbeat/pxc.t @@ -0,0 +1,384 @@ +#!/usr/bin/env perl + +BEGIN { + die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n" + unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH}; + unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib"; +}; + +use strict; +use warnings FATAL => 'all'; +use English qw(-no_match_vars); +use Test::More; +use Data::Dumper; + +use File::Temp qw(tempfile); + +use PerconaTest; +use Sandbox; + +require "$trunk/bin/pt-heartbeat"; +# Do this after requiring pt-hb, since it uses Mo +require VersionParser; + +my $dp = new DSNParser(opts=>$dsn_opts); +my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp); +my $node1 = $sb->get_dbh_for('node1'); +my $node2 = $sb->get_dbh_for('node2'); +my $node3 = $sb->get_dbh_for('node3'); + +if ( !$node1 ) { + plan skip_all => 'Cannot connect to cluster node1'; +} +elsif ( !$node2 ) { + plan skip_all => 'Cannot connect to cluster node2'; +} +elsif ( !$node3 ) { + plan skip_all => 'Cannot connect to cluster node3'; +} + +my $db_flavor = VersionParser->new($node1)->flavor(); +if ( $db_flavor !~ /XtraDB Cluster/ ) { + plan skip_all => "PXC tests"; +} + +my $node1_dsn = $sb->dsn_for('node1'); +my $node2_dsn = $sb->dsn_for('node2'); +my $node3_dsn = $sb->dsn_for('node3'); +my $node1_port = $sb->port_for('node1'); +my $node2_port = $sb->port_for('node2'); +my $node3_port = $sb->port_for('node3'); + +my $output; +my $exit; +my $base_pidfile = (tempfile("/tmp/pt-heartbeat-test.XXXXXXXX", OPEN => 0, UNLINK => 0))[1]; +my $sample = "t/pt-heartbeat/samples/"; + +my $sentinel = '/tmp/pt-heartbeat-sentinel'; + +diag(`rm -rf $sentinel >/dev/null 2>&1`); +$sb->create_dbs($node1, ['test']); + +my @exec_pids; +my @pidfiles; + +sub start_update_instance { + my ($port) = @_; + my $pidfile = "$base_pidfile.$port.pid"; + push @pidfiles, $pidfile; + + my $pid = fork(); + die "Cannot fork: $OS_ERROR" unless defined $pid; + if ( $pid == 0 ) { + my $cmd = "$trunk/bin/pt-heartbeat"; + exec { $cmd } $cmd, qw(-h 127.0.0.1 -u msandbox -p msandbox -P), $port, + qw(--database test --table heartbeat --create-table), + qw(--update --interval 0.5 --pid), $pidfile; + exit 1; + } + push @exec_pids, $pid; + + PerconaTest::wait_for_files($pidfile); + ok( + -f $pidfile, + "--update on $port started" + ); +} + +sub stop_all_instances { + my @pids = @exec_pids, map { chomp; $_ } map { slurp_file($_) } @pidfiles; + diag(`$trunk/bin/pt-heartbeat --stop >/dev/null`); + + waitpid($_, 0) for @pids; + PerconaTest::wait_until(sub{ !-e $_ }) for @pidfiles; + + unlink $sentinel; +} + +foreach my $port ( map { $sb->port_for($_) } qw(node1 node2 node3) ) { + start_update_instance($port); +} + +# ############################################################################# +# Basic cluster tests +# ############################################################################# + +my $rows = $node1->selectall_hashref("select * from test.heartbeat", 'server_id'); + +is( + scalar keys %$rows, + 3, + "Sanity check: All nodes are in the heartbeat table" +); + +my $only_slave_data = { + map { + $_ => { + relay_master_log_file => $rows->{$_}->{relay_master_log_file}, + exec_master_log_pos => $rows->{$_}->{exec_master_log_pos}, + } } keys %$rows +}; + +my $same_data = { relay_master_log_file => undef, exec_master_log_pos => 
undef }; +is_deeply( + $only_slave_data, + { + 12345 => $same_data, + 12346 => $same_data, + 12347 => $same_data, + }, + "Sanity check: No slave data (relay log or master pos) is stored" +); + +$output = output(sub{ + pt_heartbeat::main($node1_dsn, qw(-D test --check)), + }, + stderr => 1, +); + +like( + $output, + qr/\QThe --master-server-id option must be specified because the heartbeat table `test`.`heartbeat`/, + "pt-heartbeat --check + PXC doesn't autodetect a master if there isn't any" +); + +$output = output(sub{ + pt_heartbeat::main($node1_dsn, qw(-D test --check), + '--master-server-id', $node3_port), + }, + stderr => 1, +); + +$output =~ s/\d\.\d{2}/0.00/g; +is( + $output, + "0.00\n", + "pt-heartbeat --check + PXC works with --master-server-id" +); + +# Test --monitor + +$output = output(sub { + pt_heartbeat::main($node1_dsn, + qw(-D test --monitor --run-time 1s), + '--master-server-id', $node3_port) + }, + stderr => 1, +); + +$output =~ s/\d\.\d{2}/0.00/g; +is( + $output, + "0.00s [ 0.00s, 0.00s, 0.00s ]\n", + "--monitor works" +); + +# Try to generate some lag between cluster nodes. Rather brittle at the moment. + +# Lifted from alter active table +my $pt_osc_sample = "t/pt-online-schema-change/samples"; + +my $query_table_stop = "/tmp/query_table.$PID.stop"; +my $query_table_pid = "/tmp/query_table.$PID.pid"; +my $query_table_output = "/tmp/query_table.$PID.output"; + +$sb->create_dbs($node1, ['pt_osc']); +$sb->load_file('master', "$pt_osc_sample/basic_no_fks_innodb.sql"); + +$node1->do("USE pt_osc"); +$node1->do("TRUNCATE TABLE t"); +$node1->do("LOAD DATA INFILE '$trunk/$pt_osc_sample/basic_no_fks.data' INTO TABLE t"); +$node1->do("ANALYZE TABLE t"); +$sb->wait_for_slaves(); + +diag(`rm -rf $query_table_stop`); +diag(`echo > $query_table_output`); + +my $cmd = "$trunk/$pt_osc_sample/query_table.pl"; +system("$cmd 127.0.0.1 $node1_port pt_osc t id $query_table_stop $query_table_pid >$query_table_output 2>&1 &"); +wait_until(sub{-e $query_table_pid}); + +# Reload sakila +system "$trunk/sandbox/load-sakila-db $node1_port &"; + +$output = output(sub { + pt_heartbeat::main($node3_dsn, + qw(-D test --monitor --run-time 5s), + '--master-server-id', $node1_port) + }, + stderr => 1, +); + +like( + $output, + qr/^(?:0\.(?:\d[1-9]|[1-9]\d)|\d*[1-9]\d*\.\d{2})s\s+\[/m, + "pt-heartbeat can detect replication lag between nodes" +); + +diag(`touch $query_table_stop`); +chomp(my $p = slurp_file($query_table_pid)); +wait_until(sub{!kill 0, $p}); + +$node1->do(q{DROP DATABASE pt_osc}); + +$sb->wait_for_slaves(); + +# ############################################################################# +# cluster, node1 -> slave, run on node1 +# ############################################################################# + +my ($slave_dbh, $slave_dsn) = $sb->start_sandbox( + server => 'cslave1', + type => 'slave', + master => 'node1', + env => q/BINLOG_FORMAT="ROW"/, +); + +$sb->create_dbs($slave_dbh, ['test']); + +start_update_instance($sb->port_for('cslave1')); +PerconaTest::wait_for_table($slave_dbh, "test.heartbeat", "1=1"); + +$output = output(sub{ + pt_heartbeat::main($slave_dsn, qw(-D test --check)), + }, + stderr => 1, +); + +like( + $output, + qr/\d\.\d{2}\n/, + "pt-heartbeat --check works on a slave of a cluster node" +); + +$output = output(sub { + pt_heartbeat::main($slave_dsn, + qw(-D test --monitor --run-time 2s)) + }, + stderr => 1, +); + +like( + $output, + qr/^\d.\d{2}s\s+\[/, + "pt-heartbeat --monitor + slave of a node1, without --master-server-id" +); + +$output = output(sub { + 
pt_heartbeat::main($slave_dsn, + qw(-D test --monitor --run-time 2s), + '--master-server-id', $node3_port) + }, + stderr => 1, +); + +like( + $output, + qr/^\d.\d{2}s\s+\[/, + "pt-heartbeat --monitor + slave of node1, --master-server-id pointing to node3" +); + +# ############################################################################# +# master -> node1 in cluster +# ############################################################################# + +# CAREFUL! See the comments in t/pt-table-checksum/pxc.t about cmaster. +# Nearly everything applies here. + +my ($master_dbh, $master_dsn) = $sb->start_sandbox( + server => 'cmaster', + type => 'master', + env => q/BINLOG_FORMAT="ROW"/, +); + +my $cmaster_port = $sb->port_for('cmaster'); + +$sb->create_dbs($master_dbh, ['test']); + +$master_dbh->do("FLUSH LOGS"); +$master_dbh->do("RESET MASTER"); + +$sb->set_as_slave('node1', 'cmaster'); + +start_update_instance($sb->port_for('cmaster')); +PerconaTest::wait_for_table($node1, "test.heartbeat", "server_id=$cmaster_port"); + +$output = output(sub{ + pt_heartbeat::main($node1_dsn, qw(-D test --check --print-master-server-id)), + }, + stderr => 1, +); + +like( + $output, + qr/^\d.\d{2} $cmaster_port$/, + "--print-master-id works for master -> $node1_port, when run from $node1_port" +); + +# Wait until node2 & node3 get cmaster in their heartbeat tables +$sb->wait_for_slaves(master => 'node1', slave => 'node2'); +$sb->wait_for_slaves(master => 'node1', slave => 'node3'); + +foreach my $test ( + [ $node2_port, $node2_dsn, $node2 ], + [ $node3_port, $node3_dsn, $node3 ], +) { + my ($port, $dsn, $dbh) = @$test; + + $output = output(sub{ + pt_heartbeat::main($dsn, qw(-D test --check --print-master-server-id)), + }, + stderr => 1, + ); + + # This could be made to work, see the node autodiscovery branch + TODO: { + local $::TODO = "cmaster -> node1, other nodes can't autodetect the master"; + like( + $output, + qr/$cmaster_port/, + "--print-master-id works for master -> $node1_port, when run from $port" + ); + } + + $output = output(sub{ + pt_heartbeat::main($dsn, qw(-D test --check --master-server-id), $cmaster_port), + }, + stderr => 1, + ); + + $output =~ s/\d\.\d{2}/0.00/g; + is( + $output, + "0.00\n", + "--check + explicit --master-server-id work for master -> node1, run from $port" + ); +} + +# ############################################################################ +# Stop the --update instances. +# ############################################################################ + +stop_all_instances(); + +# ############################################################################ +# Disconnect & stop the two servers we started +# ############################################################################ + +# We have to do this after the --stop, otherwise the --update processes will +# spew a bunch of warnings and clog + +$slave_dbh->disconnect; +$master_dbh->disconnect; +$sb->stop_sandbox('cslave1', 'cmaster'); +$node1->do("STOP SLAVE"); +$node1->do("RESET SLAVE"); + +# ############################################################################# +# Done. +# ############################################################################# +$sb->wipe_clean($node1); +diag(`/tmp/12345/stop`); +diag(`/tmp/12345/start`); +ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox"); +done_testing;
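
For readers of the Retry hunk above, here is a minimal, self-contained sketch of the retry() interface this patch embeds in pt-heartbeat, wrapping an arbitrary statement in the same deadlock-retry pattern as the new code in sub main. The DSN, credentials, and UPDATE statement are placeholders for illustration, and it assumes lib/Retry.pm from this branch is on @INC (e.g. via -Ilib); it is not part of the patch itself.

#!/usr/bin/env perl
# Standalone sketch of the Retry interface added in this patch.
# The DSN, credentials, and statement are hypothetical placeholders.
use strict;
use warnings FATAL => 'all';
use DBI;
use Retry;   # lib/Retry.pm from this branch

my $dbh = DBI->connect(
   'DBI:mysql:host=127.0.0.1;port=12345;database=test',
   'msandbox', 'msandbox',
   { RaiseError => 1, PrintError => 0 },
);

my $retry  = Retry->new();
my $result = $retry->retry(
   tries => 3,
   wait  => sub { sleep 1; return; },      # pause between attempts
   try   => sub {
      # Work that can hit a transient error goes here.
      return $dbh->do(
         "UPDATE test.heartbeat SET ts = NOW() WHERE server_id = 12345");
   },
   fail  => sub {
      my (%args) = @_;
      # Retry only on deadlocks, mirroring the hunk in sub main above.
      return $args{error} =~ m/Deadlock found/ ? 1 : 0;
   },
   final_fail => sub {
      my (%args) = @_;
      die $args{error};                    # give up after the last attempt
   },
);

$dbh->disconnect;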
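
One observation on the new retry call in sub main: the wait callback passes 0.25 to Perl's built-in sleep, which only takes whole seconds (the fractional part is truncated), so the pause between attempts is effectively zero. If a quarter-second back-off is actually intended, Time::HiRes (core since 5.8) supports fractional sleeps; a sketch of a drop-in wait callback, offered only as a suggestion, not as part of the patch:

#!/usr/bin/env perl
use strict;
use warnings;
use Time::HiRes ();

# Core sleep() truncates its argument to an integer, so sleep 0.25 waits 0 seconds.
# Time::HiRes::sleep() honors the fraction.
my $wait = sub { Time::HiRes::sleep(0.25); return; };
$wait->();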
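
A small review note on stop_all_instances() in the new pxc.t: "my @pids = @exec_pids, map { ... }" assigns only @exec_pids, because assignment binds more tightly than the comma, so the PIDs slurped from the pidfiles never make it into @pids. This looks harmless here, since the children exec pt-heartbeat directly and the pidfiles should contain the same PIDs as @exec_pids, but if merging both lists is intended, the right-hand side needs parentheses. A minimal demonstration with stand-in values:

#!/usr/bin/env perl
use strict;
use warnings;

my @exec_pids = (101, 102);   # stand-ins for the forked pids
my @pidfiles  = (103, 104);   # stand-ins for pids read back from the pidfiles

# Assignment binds more tightly than the comma, so only @exec_pids lands in
# @pids; the rest of the list is evaluated in void context (perl warns about
# the discarded part).
my @pids = @exec_pids, @pidfiles;
print scalar(@pids), "\n";    # 2

# Parenthesizing the right-hand side keeps the whole list.
my @all = (@exec_pids, @pidfiles);
print scalar(@all), "\n";     # 4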