Merged pxc-pt-heartbeat

This commit is contained in:
Brian Fraser
2012-12-13 22:24:03 -03:00
4 changed files with 512 additions and 5 deletions

View File

@@ -20,6 +20,7 @@ BEGIN {
Daemon
Quoter
TableParser
Retry
Transformers
VersionCheck
HTTPMicro
@@ -2920,6 +2921,84 @@ sub _d {
# End TableParser package
# ###########################################################################
# ###########################################################################
# Retry package
# This package is a copy without comments from the original. The original
# with comments and its test file can be found in the Bazaar repository at,
# lib/Retry.pm
# t/lib/Retry.t
# See https://launchpad.net/percona-toolkit for more information.
# ###########################################################################
{
package Retry;

use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use constant PTDEBUG => $ENV{PTDEBUG} || 0;

# Constructor.  Every key/value pair passed in is stored verbatim in the
# object hash; nothing is validated.
sub new {
   my ( $class, %args ) = @_;
   return bless { %args }, $class;
}

# Run a piece of code, retrying it when it dies.
#
# Required arguments (all code refs):
#   try        - called with (tryno => N); its return value is returned
#                as soon as it completes without dying.
#   fail       - called with (tryno => N, error => $error) after a failed try
#                that is not the last one; a false return stops retrying.
#   final_fail - called with (error => $last_error) once no try succeeded;
#                its return value becomes the return value of retry().
# Optional arguments:
#   wait       - code ref called with (tryno => N) between tries
#                (default: sleep 1).
#   tries      - maximum number of tries (default: 3).
#
# Dies if a required argument is missing or false.
sub retry {
   my ( $self, %args ) = @_;

   for my $name ( qw(try fail final_fail) ) {
      die "I need a $name argument" unless $args{$name};
   }
   my $try_code        = $args{try};
   my $fail_code       = $args{fail};
   my $final_fail_code = $args{final_fail};
   my $wait_code       = $args{wait}  || sub { sleep 1; };
   my $max_tries       = $args{tries} || 3;

   my $last_error;

   ATTEMPT:
   for ( my $attempt = 1; $attempt <= $max_tries; $attempt++ ) {
      PTDEBUG && _d("Try", $attempt, "of", $max_tries);

      my $result;
      eval {
         $result = $try_code->(tryno=>$attempt);
      };
      if ( !$EVAL_ERROR ) {
         PTDEBUG && _d("Try code succeeded");
         return $result;
      }

      PTDEBUG && _d("Try code failed:", $EVAL_ERROR);
      $last_error = $EVAL_ERROR;

      # After the last permitted try neither the fail nor the wait
      # callback runs; control falls through to final_fail below.
      next ATTEMPT unless $attempt < $max_tries;

      my $keep_going = $fail_code->(tryno=>$attempt, error=>$last_error);
      last ATTEMPT unless $keep_going;

      PTDEBUG && _d("Calling wait code");
      $wait_code->(tryno=>$attempt);
   }

   PTDEBUG && _d('Try code did not succeed');
   return $final_fail_code->(error=>$last_error);
}

# Debug printer: writes its arguments to STDERR on one logical line, prefixed
# with the calling package, line number and PID.  Undefined arguments are
# shown as 'undef' and embedded newlines are continued with "# ".
sub _d {
   my ($package, undef, $line) = caller 0;
   my @parts;
   for my $item ( @_ ) {
      my $text = defined $item ? $item : 'undef';
      $text =~ s/\n/\n# /g;
      push @parts, $text;
   }
   print STDERR "# $package:$line $PID ", join(' ', @parts), "\n";
}

1;
}
# ###########################################################################
# End Retry package
# ###########################################################################
# ###########################################################################
# Transformers package
# This package is a copy without comments from the original. The original
@@ -4920,9 +4999,30 @@ sub main {
}
}
# Execute the statement through Retry: up to three tries, retrying only when
# the error looks like a deadlock; any other error, or a failure on the last
# try, is re-thrown by final_fail.
my $retry = Retry->new();
$retry->retry(
tries => 3,
# NOTE(review): Perl's builtin sleep() truncates fractional seconds, so this
# only pauses if a fractional-capable sleep (e.g. Time::HiRes::sleep) is in
# scope in this package -- confirm.
wait => sub { sleep 0.25; return; },
try => sub {
$sth->execute(ts(time), @vals);
PTDEBUG && _d($sth->{Statement});
$sth->finish();
},
# Decide whether a failed try is worth repeating: true means retry.
fail => sub {
my (%args) = @_;
my $error = $args{error};
if ( $error =~ m/Deadlock found/ ) {
return 1; # try again
}
else {
return 0;
}
},
# Out of tries, or fail said stop: propagate the last error to the caller.
final_fail => sub {
my (%args) = @_;
die $args{error};
}
);
return;
};
@@ -5387,6 +5487,19 @@ information from C<SHOW MASTER STATUS> and C<SHOW SLAVE STATUS>. These
columns are optional. If any are present, their corresponding information
will be saved.
=head1 Percona XtraDB Cluster
Although pt-heartbeat should work with all supported versions of Percona XtraDB
Cluster (PXC), we recommend using 5.5.28-23.7 or newer.
If you are setting up heartbeat instances between cluster nodes, keep in mind
that, since the speed of the cluster is determined by its slowest node,
pt-heartbeat will not report how fast the cluster itself is, but only how
fast events are replicating from one node to another.
You must specify L<"--master-server-id"> for L<"--monitor"> and L<"--check">
instances.
=head1 OPTIONS
Specify at least one of L<"--stop">, L<"--update">, L<"--monitor">, or L<"--check">.

View File

@@ -52,6 +52,10 @@ make_sandbox() {
if [ -n "${master_port}" ]; then
local master_listen_port=$(($master_port + 10))
cluster_address="gcomm://$ip:$master_listen_port"
local this_listen_port=$(($port + 10))
local this_cluster_address="gcomm://$ip:$this_listen_port"
sed -e "s!gcomm://\$!$this_cluster_address!g" -i.bak "/tmp/$master_port/my.sandbox.cnf"
fi
sed -e "s/ADDR/$ip/g" -i.bak "/tmp/$port/my.sandbox.cnf"

View File

@@ -299,6 +299,12 @@ case $opt in
exit_status=$((exit_status | $?))
if [ "${2:-""}" = "cluster" ]; then
# Bit of magic here. 'start-sandbox cluster new_node old_node'
# changes old_node's my.sandbox.cnf's wsrep_cluster_address to
# point to new_node. This is especially useful because otherwise,
# calling stop/start like below on 12345 would create a new cluster.
/tmp/12345/stop >/dev/null
/tmp/12345/start >/dev/null
echo -n "Checking that the cluster size is correct... "
size=$(/tmp/12345/use -ss -e "SHOW STATUS LIKE 'wsrep_cluster_size'" | awk '{print $2}')
if [ ${size:-0} -ne 3 ]; then

384
t/pt-heartbeat/pxc.t Normal file
View File

@@ -0,0 +1,384 @@
#!/usr/bin/env perl
# Refuse to run unless PERCONA_TOOLKIT_BRANCH is set and names an existing
# directory, then put that directory's lib/ first in @INC so it is searched
# first when the modules below are loaded.
BEGIN {
die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
};
use strict;
use warnings FATAL => 'all';
use English qw(-no_match_vars);
use Test::More;
use Data::Dumper;
use File::Temp qw(tempfile);
use PerconaTest;
use Sandbox;
require "$trunk/bin/pt-heartbeat";
# Do this after requiring pt-hb, since it uses Mo
require VersionParser;
# Sandbox helpers and one database handle per cluster node.
my $dp = DSNParser->new(opts => $dsn_opts);
my $sb = Sandbox->new(basedir => '/tmp', DSNParser => $dp);

my $node1 = $sb->get_dbh_for('node1');
my $node2 = $sb->get_dbh_for('node2');
my $node3 = $sb->get_dbh_for('node3');

# Skip the whole file when a node is unreachable.  Nodes are checked in
# order, so the first missing one is the one named in the skip reason.
foreach my $node ( [ node1 => $node1 ], [ node2 => $node2 ], [ node3 => $node3 ] ) {
   my ($name, $dbh) = @$node;
   plan(skip_all => "Cannot connect to cluster $name") unless $dbh;
}

# These tests only make sense against Percona XtraDB Cluster.
my $db_flavor = VersionParser->new($node1)->flavor();
plan(skip_all => "PXC tests") unless $db_flavor =~ /XtraDB Cluster/;

my $node1_dsn  = $sb->dsn_for('node1');
my $node2_dsn  = $sb->dsn_for('node2');
my $node3_dsn  = $sb->dsn_for('node3');
my $node1_port = $sb->port_for('node1');
my $node2_port = $sb->port_for('node2');
my $node3_port = $sb->port_for('node3');

my $output;
my $exit;

# Only the generated name is wanted (OPEN => 0 creates no file); it is the
# common prefix of every pidfile used by start_update_instance().
my (undef, $base_pidfile)
   = tempfile("/tmp/pt-heartbeat-test.XXXXXXXX", OPEN => 0, UNLINK => 0);
my $sample   = "t/pt-heartbeat/samples/";
my $sentinel = '/tmp/pt-heartbeat-sentinel';

# Start from a clean slate: no stale sentinel file, fresh test database.
diag(`rm -rf $sentinel >/dev/null 2>&1`);
$sb->create_dbs($node1, ['test']);

# PIDs of the forked --update processes and the pidfiles they were given.
my @exec_pids;
my @pidfiles;
# Fork and exec a "pt-heartbeat --update" process against the server on
# $port, remember its PID and pidfile for stop_all_instances(), wait for
# the pidfile and record a test asserting that it exists.
sub start_update_instance {
   my ($port) = @_;

   my $pidfile = "$base_pidfile.$port.pid";
   push @pidfiles, $pidfile;

   my $child = fork();
   if ( !defined $child ) {
      die "Cannot fork: $OS_ERROR";
   }

   if ( $child == 0 ) {
      # Child: become pt-heartbeat.  exec only returns on failure, in which
      # case the child exits non-zero instead of running the rest of the test.
      my $program = "$trunk/bin/pt-heartbeat";
      my @options = (
         qw(-h 127.0.0.1 -u msandbox -p msandbox -P), $port,
         qw(--database test --table heartbeat --create-table),
         qw(--update --interval 0.5 --pid), $pidfile,
      );
      exec { $program } $program, @options;
      exit 1;
   }

   # Parent: track the child and wait for it to come up.
   push @exec_pids, $child;
   PerconaTest::wait_for_files($pidfile);
   ok(
      -f $pidfile,
      "--update on $port started"
   );
}
# Stop every --update instance started by start_update_instance(): run
# "pt-heartbeat --stop", reap the processes, wait for their pidfiles to
# disappear and remove the sentinel file.
sub stop_all_instances {
   # Collect the PIDs before --stop runs, while the pidfiles can still be
   # read.  The parentheses are required: assignment binds tighter than the
   # comma operator, so without them only @exec_pids would be assigned and
   # the PIDs read from the pidfiles would be thrown away.  Duplicates and
   # non-numeric pidfile contents are dropped so waitpid() sees each PID once
   # and never gets a non-numeric argument.
   my %seen;
   my @pids = grep { defined $_ && /\A\d+\z/ && !$seen{$_}++ }
              ( @exec_pids, map { chomp; $_ } map { slurp_file($_) } @pidfiles );

   diag(`$trunk/bin/pt-heartbeat --stop >/dev/null`);

   # waitpid() returns at once for PIDs that are not our children.
   waitpid($_, 0) for @pids;
   PerconaTest::wait_until(sub{ !-e $_ }) for @pidfiles;
   unlink $sentinel;
}
# One --update instance per cluster node.
my @node_ports = map { $sb->port_for($_) } qw(node1 node2 node3);
foreach my $node_port ( @node_ports ) {
   start_update_instance($node_port);
}

# #############################################################################
# Basic cluster tests
# #############################################################################

# Heartbeat rows as seen from node1, keyed by server_id.
my $rows = $node1->selectall_hashref("select * from test.heartbeat", 'server_id');

is(
   scalar keys %$rows,
   3,
   "Sanity check: All nodes are in the heartbeat table"
);

# Keep only the two slave-related columns of every row.
my %slave_data_for;
foreach my $server_id ( keys %$rows ) {
   my $row = $rows->{$server_id};
   $slave_data_for{$server_id} = {
      relay_master_log_file => $row->{relay_master_log_file},
      exec_master_log_pos   => $row->{exec_master_log_pos},
   };
}

# Both columns are expected to be NULL for all three server ids.
my %no_slave_data = (
   relay_master_log_file => undef,
   exec_master_log_pos   => undef,
);

is_deeply(
   \%slave_data_for,
   {
      12345 => { %no_slave_data },
      12346 => { %no_slave_data },
      12347 => { %no_slave_data },
   },
   "Sanity check: No slave data (relay log or master pos) is stored"
);
# --check on node1 without --master-server-id: expected to complain.
my @check_no_master_id = ($node1_dsn, qw(-D test --check));
$output = output(
   sub { pt_heartbeat::main(@check_no_master_id) },
   stderr => 1,
);

like(
   $output,
   qr/\QThe --master-server-id option must be specified because the heartbeat table `test`.`heartbeat`/,
   "pt-heartbeat --check + PXC doesn't autodetect a master if there isn't any"
);

# --check on node1 with node3 named as the master.
my @check_with_master_id = (
   $node1_dsn, qw(-D test --check),
   '--master-server-id', $node3_port,
);
$output = output(
   sub { pt_heartbeat::main(@check_with_master_id) },
   stderr => 1,
);

# Normalize every N.NN value so the comparison does not depend on the lag.
$output =~ s/\d\.\d{2}/0.00/g;

is(
   $output,
   "0.00\n",
   "pt-heartbeat --check + PXC works with --master-server-id"
);

# Test --monitor
my @monitor_args = (
   $node1_dsn,
   qw(-D test --monitor --run-time 1s),
   '--master-server-id', $node3_port,
);
$output = output(
   sub { pt_heartbeat::main(@monitor_args) },
   stderr => 1,
);

$output =~ s/\d\.\d{2}/0.00/g;

is(
   $output,
   "0.00s [ 0.00s, 0.00s, 0.00s ]\n",
   "--monitor works"
);
# Try to generate some lag between cluster nodes. Rather brittle at the moment.
# Lifted from alter active table
my $pt_osc_sample = "t/pt-online-schema-change/samples";
# Control files for the background query_table.pl process, unique per test PID.
my $query_table_stop = "/tmp/query_table.$PID.stop";
my $query_table_pid = "/tmp/query_table.$PID.pid";
my $query_table_output = "/tmp/query_table.$PID.output";
# Create and populate pt_osc.t on node1.
$sb->create_dbs($node1, ['pt_osc']);
$sb->load_file('master', "$pt_osc_sample/basic_no_fks_innodb.sql");
$node1->do("USE pt_osc");
$node1->do("TRUNCATE TABLE t");
$node1->do("LOAD DATA INFILE '$trunk/$pt_osc_sample/basic_no_fks.data' INTO TABLE t");
$node1->do("ANALYZE TABLE t");
$sb->wait_for_slaves();
# Remove any stale stop file and truncate the output file before starting.
diag(`rm -rf $query_table_stop`);
diag(`echo > $query_table_output`);
# Start query_table.pl in the background against node1 and wait for the pid
# file named on its command line to appear.
my $cmd = "$trunk/$pt_osc_sample/query_table.pl";
system("$cmd 127.0.0.1 $node1_port pt_osc t id $query_table_stop $query_table_pid >$query_table_output 2>&1 &");
wait_until(sub{-e $query_table_pid});
# Reload sakila
# Also started in the background, so it runs while --monitor is sampling.
system "$trunk/sandbox/load-sakila-db $node1_port &";
# Monitor node3 for five seconds, measuring lag relative to node1.
$output = output(sub {
pt_heartbeat::main($node3_dsn,
qw(-D test --monitor --run-time 5s),
'--master-server-id', $node1_port)
},
stderr => 1,
);
# Passes if at least one output line starts with a non-zero N.NN lag value:
# either 0.01-0.99, or an integer part that contains a non-zero digit.
like(
$output,
qr/^(?:0\.(?:\d[1-9]|[1-9]\d)|\d*[1-9]\d*\.\d{2})s\s+\[/m,
"pt-heartbeat can detect replication lag between nodes"
);
# Create the stop file, then poll until signal 0 can no longer be delivered
# to the PID read from the pid file, i.e. the background process is gone.
diag(`touch $query_table_stop`);
chomp(my $p = slurp_file($query_table_pid));
wait_until(sub{!kill 0, $p});
# Clean up the scratch database.
$node1->do(q{DROP DATABASE pt_osc});
$sb->wait_for_slaves();
# #############################################################################
# cluster, node1 -> slave, run on node1
# #############################################################################

# Start a sandbox of type 'slave' (cslave1) with node1 as its master, give it
# a test database and its own --update instance, and wait until its
# heartbeat table has at least one row.
my ($slave_dbh, $slave_dsn) = $sb->start_sandbox(
   server => 'cslave1',
   type   => 'slave',
   master => 'node1',
   env    => q/BINLOG_FORMAT="ROW"/,
);

$sb->create_dbs($slave_dbh, ['test']);

start_update_instance($sb->port_for('cslave1'));
PerconaTest::wait_for_table($slave_dbh, "test.heartbeat", "1=1");

# --check on the slave, with no --master-server-id given.
$output = output(sub{
      pt_heartbeat::main($slave_dsn, qw(-D test --check)),
   },
   stderr => 1,
);

like(
   $output,
   qr/\d\.\d{2}\n/,
   "pt-heartbeat --check works on a slave of a cluster node"
);

# --monitor on the slave, again without --master-server-id.
$output = output(sub {
      pt_heartbeat::main($slave_dsn,
         qw(-D test --monitor --run-time 2s))
   },
   stderr => 1,
);

# The decimal point is escaped so that only a real N.NN lag value matches;
# an unescaped "." would accept any character in that position.
like(
   $output,
   qr/^\d\.\d{2}s\s+\[/,
   "pt-heartbeat --monitor + slave of a node1, without --master-server-id"
);

# --monitor on the slave, measuring against node3's heartbeat row.
$output = output(sub {
      pt_heartbeat::main($slave_dsn,
         qw(-D test --monitor --run-time 2s),
         '--master-server-id', $node3_port)
   },
   stderr => 1,
);

like(
   $output,
   qr/^\d\.\d{2}s\s+\[/,
   "pt-heartbeat --monitor + slave of node1, --master-server-id pointing to node3"
);
# #############################################################################
# master -> node1 in cluster
# #############################################################################

# CAREFUL! See the comments in t/pt-table-checksum/pxc.t about cmaster.
# Nearly everything applies here.

# Start a sandbox of type 'master' (cmaster), reset its binary logs and make
# node1 replicate from it.
my ($master_dbh, $master_dsn) = $sb->start_sandbox(
   server => 'cmaster',
   type   => 'master',
   env    => q/BINLOG_FORMAT="ROW"/,
);
my $cmaster_port = $sb->port_for('cmaster');

$sb->create_dbs($master_dbh, ['test']);

$master_dbh->do("FLUSH LOGS");
$master_dbh->do("RESET MASTER");

$sb->set_as_slave('node1', 'cmaster');

# Run an --update instance on cmaster and wait for its heartbeat row to show
# up on node1.
start_update_instance($sb->port_for('cmaster'));
PerconaTest::wait_for_table($node1, "test.heartbeat", "server_id=$cmaster_port");

$output = output(sub{
      pt_heartbeat::main($node1_dsn, qw(-D test --check --print-master-server-id)),
   },
   stderr => 1,
);

# Expect "<lag> <master server id>".  The decimal point is escaped so that
# only a real N.NN lag value matches; an unescaped "." would accept any
# character in that position.
like(
   $output,
   qr/^\d\.\d{2} $cmaster_port$/,
   "--print-master-id works for master -> $node1_port, when run from $node1_port"
);
# Wait until node2 & node3 get cmaster in their heartbeat tables
foreach my $peer ( qw(node2 node3) ) {
   $sb->wait_for_slaves(master => 'node1', slave => $peer);
}

# Run the same two checks from each of the nodes that do not replicate
# directly from cmaster.
my @other_nodes = (
   { port => $node2_port, dsn => $node2_dsn },
   { port => $node3_port, dsn => $node3_dsn },
);

foreach my $node ( @other_nodes ) {
   my $port = $node->{port};
   my $dsn  = $node->{dsn};

   my @autodetect_args = ($dsn, qw(-D test --check --print-master-server-id));
   $output = output(
      sub { pt_heartbeat::main(@autodetect_args) },
      stderr => 1,
   );

   # This could be made to work, see the node autodiscovery branch
   TODO: {
      local $::TODO = "cmaster -> node1, other nodes can't autodetect the master";
      like(
         $output,
         qr/$cmaster_port/,
         "--print-master-id works for master -> $node1_port, when run from $port"
      );
   }

   # With the master named explicitly, --check must work from this node too.
   my @explicit_args = ($dsn, qw(-D test --check --master-server-id), $cmaster_port);
   $output = output(
      sub { pt_heartbeat::main(@explicit_args) },
      stderr => 1,
   );

   # Normalize the lag value so the comparison does not depend on timing.
   $output =~ s/\d\.\d{2}/0.00/g;

   is(
      $output,
      "0.00\n",
      "--check + explicit --master-server-id work for master -> node1, run from $port"
   );
}
# ############################################################################
# Stop the --update instances.
# ############################################################################
stop_all_instances();
# ############################################################################
# Disconnect & stop the two servers we started
# ############################################################################
# We have to do this after the --stop, otherwise the --update processes will
# spew a bunch of warnings and clog
$slave_dbh->disconnect;
$master_dbh->disconnect;
$sb->stop_sandbox('cslave1', 'cmaster');
# node1 was made a slave of cmaster above; undo that now that cmaster is gone.
$node1->do("STOP SLAVE");
$node1->do("RESET SLAVE");
# #############################################################################
# Done.
# #############################################################################
$sb->wipe_clean($node1);
# Restart the server sandbox in /tmp/12345 before the final sandbox check.
# NOTE(review): presumably needed to return node1 to a clean cluster state
# after it acted as a replication slave -- confirm.
diag(`/tmp/12345/stop`);
diag(`/tmp/12345/start`);
# Abort the whole test run if this file left the sandbox in a bad state.
ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
done_testing;