From 41d7700aa5c62dae1f520e6c7378806c9d4c8d9a Mon Sep 17 00:00:00 2001 From: Brian Fraser Date: Fri, 23 Nov 2012 19:00:38 -0300 Subject: [PATCH] WIP ptc: Autodetect cluster nodes & recurse to slaves --- bin/pt-table-checksum | 78 +++++++++--- lib/Percona/XtraDB/Cluster.pm | 31 +++-- sandbox/servers/pxc/5.5/my.sandbox.cnf | 2 +- t/pt-table-checksum/pxc.t | 158 +++++++++++++------------ 4 files changed, 166 insertions(+), 103 deletions(-) diff --git a/bin/pt-table-checksum b/bin/pt-table-checksum index 39701ca3..8e50dc20 100755 --- a/bin/pt-table-checksum +++ b/bin/pt-table-checksum @@ -3425,8 +3425,7 @@ sub find_cluster_nodes { my $dsn = $args{dsn}; my $dp = $args{DSNParser}; my $make_cxn = $args{make_cxn}; - - + my $sql = q{SHOW STATUS LIKE 'wsrep_incoming_addresses'}; PTDEBUG && _d($sql); my (undef, $addresses) = $dbh->selectrow_array($sql); @@ -3442,16 +3441,18 @@ sub find_cluster_nodes { my $spec = "h=$host" . ($port ? ",P=$port" : ""); my $node_dsn = $dp->parse($spec, $dsn); - my $node_dbh = eval { - $dp->get_dbh( - $dp->get_cxn_params($node_dsn), { AutoCommit => 1 }); - PTDEBUG && _d('Connected to', $dp->as_string($node_dsn)); - }; + my $node_dbh = eval { $dp->get_dbh( + $dp->get_cxn_params($node_dsn), { AutoCommit => 1 }) }; if ( $EVAL_ERROR ) { print STDERR "Cannot connect to ", $dp->as_string($node_dsn), ", discovered through $sql: $EVAL_ERROR\n"; + if ( !$port && $dsn->{P} != 3306 ) { + $address .= ":3306"; + redo; + } next; } + PTDEBUG && _d('Connected to', $dp->as_string($node_dsn)); $node_dbh->disconnect(); push @nodes, $make_cxn->(dsn => $node_dsn); @@ -3468,7 +3469,7 @@ sub remove_duplicate_cxns { my @unique_cxns; CXN: foreach my $cxn ( @cxns ) { - if ( !$self->cluster_node($cxn) ) { + if ( !$self->is_cluster_node($cxn) ) { push @unique_cxns, $cxn; next CXN; } @@ -3476,7 +3477,7 @@ sub remove_duplicate_cxns { my $dbh = $cxn->dbh(); my $sql = q{SHOW VARIABLES LIKE 'wsrep_sst_receive_address'}; PTDEBUG && _d($dbh, $sql); - my (undef, $receive_addr) = $dbh->selectrow_array(); + my (undef, $receive_addr) = $dbh->selectrow_array($sql); if ( !$receive_addr ) { PTDEBUG && _d(q{Query returned nothing, assuming that it's }, @@ -3492,8 +3493,6 @@ sub remove_duplicate_cxns { } } - warn "<@cxns>"; - warn "<@unique_cxns>"; return @unique_cxns; } @@ -8767,14 +8766,57 @@ sub main { make_cxn => $make_cxn_cluster, ); - if ( $cluster_name_for{$master_cxn} ) { - push @$slaves, $cluster->find_cluster_nodes( - dbh => $master_dbh, - dsn => $master_dsn, - make_cxn => $make_cxn_cluster, - DSNParser => $dp, - ); + if ( $o->get('recursion-method') !~ /^dsn/i ) { + my %seen; + my @new_slaves; + for my $slave ( @$slaves ) { + next unless $cluster->is_cluster_node($slave); + my @nodes = $cluster->find_cluster_nodes( + dbh => $slave->dbh(), + dsn => $slave->dsn(), + make_cxn => $make_cxn_cluster, + DSNParser => $dp, + ); + @nodes = grep { !$seen{$dp->as_string($_->dsn)}++ } + grep { !$cluster->same_node($slave, $_) } @nodes; + push @new_slaves, @nodes; + foreach my $node (@nodes) { + my $node_slaves = $ms->get_slaves( + dbh => $node->dbh(), + dsn => $node->dsn(), + make_cxn => $make_cxn_cluster, + ); + push @new_slaves, @$node_slaves; + } + ($master_cxn, @new_slaves) = + $cluster->remove_duplicate_cxns($master_cxn, @new_slaves); + } + push @$slaves, @new_slaves; + } + + if ( $cluster_name_for{$master_cxn} ) { + if ( $o->get('recursion-method') !~ /^dsn/i ) { + my @nodes = $cluster->find_cluster_nodes( + dbh => $master_dbh, + dsn => $master_dsn, + make_cxn => $make_cxn_cluster, + DSNParser => $dp, + ); + + @nodes = grep { !$cluster->same_node($master_cxn, $_) } @nodes; + push @$slaves, @nodes; + + foreach my $node (@nodes) { + my $node_slaves = $ms->get_slaves( + dbh => $node->dbh(), + dsn => $node->dsn(), + make_cxn => $make_cxn_cluster, + ); + push @$slaves, @$node_slaves; + } + } + my @pruned_slaves; ($master_cxn, @pruned_slaves) = $cluster->remove_duplicate_cxns($master_cxn, @$slaves); diff --git a/lib/Percona/XtraDB/Cluster.pm b/lib/Percona/XtraDB/Cluster.pm index 81d6f3e4..092f3984 100644 --- a/lib/Percona/XtraDB/Cluster.pm +++ b/lib/Percona/XtraDB/Cluster.pm @@ -64,6 +64,8 @@ sub same_node { } # TODO: Check that the PXC version supports wsrep_incoming_addresses +# Not really necessary, actually. But in case it's needed, +# wsrep_provider_version =~ /[0-9]+\.[0-9]+\(r([0-9]+)\)/ && $1 >= 137 sub find_cluster_nodes { my ($self, %args) = @_; @@ -72,7 +74,10 @@ sub find_cluster_nodes { my $dp = $args{DSNParser}; my $make_cxn = $args{make_cxn}; - + # Ostensibly the caller should've done this already, but + # useful for safety. + $dp->fill_in_dsn($dbh, $dsn); + my $sql = q{SHOW STATUS LIKE 'wsrep_incoming_addresses'}; PTDEBUG && _d($sql); my (undef, $addresses) = $dbh->selectrow_array($sql); @@ -88,16 +93,24 @@ sub find_cluster_nodes { my $spec = "h=$host" . ($port ? ",P=$port" : ""); my $node_dsn = $dp->parse($spec, $dsn); - my $node_dbh = eval { - $dp->get_dbh( - $dp->get_cxn_params($node_dsn), { AutoCommit => 1 }); - PTDEBUG && _d('Connected to', $dp->as_string($node_dsn)); - }; + my $node_dbh = eval { $dp->get_dbh( + $dp->get_cxn_params($node_dsn), { AutoCommit => 1 }) }; if ( $EVAL_ERROR ) { print STDERR "Cannot connect to ", $dp->as_string($node_dsn), ", discovered through $sql: $EVAL_ERROR\n"; + # This is a bit strange, so an explanation is called for. + # If there wasn't a port, that means that this bug + # https://bugs.launchpad.net/percona-toolkit/+bug/1082406 + # isn't fixed on this version of PXC. We tried using the + # master's port, but that didn't work. So try again, using + # the default port. + if ( !$port && $dsn->{P} != 3306 ) { + $address .= ":3306"; + redo; + } next; } + PTDEBUG && _d('Connected to', $dp->as_string($node_dsn)); $node_dbh->disconnect(); push @nodes, $make_cxn->(dsn => $node_dsn); @@ -125,7 +138,7 @@ sub remove_duplicate_cxns { CXN: foreach my $cxn ( @cxns ) { # If not a cluster node, assume that it's unique - if ( !$self->cluster_node($cxn) ) { + if ( !$self->is_cluster_node($cxn) ) { push @unique_cxns, $cxn; next CXN; } @@ -134,7 +147,7 @@ sub remove_duplicate_cxns { my $dbh = $cxn->dbh(); my $sql = q{SHOW VARIABLES LIKE 'wsrep_sst_receive_address'}; PTDEBUG && _d($dbh, $sql); - my (undef, $receive_addr) = $dbh->selectrow_array(); + my (undef, $receive_addr) = $dbh->selectrow_array($sql); if ( !$receive_addr ) { PTDEBUG && _d(q{Query returned nothing, assuming that it's }, @@ -150,8 +163,6 @@ sub remove_duplicate_cxns { } } - warn "<@cxns>"; - warn "<@unique_cxns>"; return @unique_cxns; } diff --git a/sandbox/servers/pxc/5.5/my.sandbox.cnf b/sandbox/servers/pxc/5.5/my.sandbox.cnf index 8bf4a692..ad2673d9 100644 --- a/sandbox/servers/pxc/5.5/my.sandbox.cnf +++ b/sandbox/servers/pxc/5.5/my.sandbox.cnf @@ -31,7 +31,7 @@ binlog_format = ROW wsrep_provider = LIBGALERA wsrep_cluster_address = CLUSTER_AD wsrep_sst_receive_address = ADDR:RECEIVE_PRT -wsrep_node_incoming_address= ADDR +wsrep_node_incoming_address= ADDR:PORT wsrep_slave_threads = 2 wsrep_cluster_name = CLUSTER_NAME wsrep_provider_options = "gmcast.listen_addr=tcp://ADDR:LISTEN_PRT;" diff --git a/t/pt-table-checksum/pxc.t b/t/pt-table-checksum/pxc.t index df833958..31c3bced 100644 --- a/t/pt-table-checksum/pxc.t +++ b/t/pt-table-checksum/pxc.t @@ -272,6 +272,7 @@ is( "Slave is changed" ); +for my $output = output( sub { pt_table_checksum::main(@args, '--recursion-method', "dsn=$node1_dsn,D=dsns,t=dsns", @@ -445,92 +446,101 @@ like( "Warns that direct replica of the master isn't found or specified", ); +# Originally, these tested a dsn table with all nodes. # Use the other DSN table with all three nodes. Now the tool should # give a more specific warning than that ^. -$output = output( - sub { pt_table_checksum::main($master_dsn, - '--recursion-method', "dsn=$node1_dsn,D=dsns,t=dsns", - qw(-d test)) - }, - stderr => 1, -); +for my $args ( + ["using recusion-method", '--recursion-method', "dsn=$node1_dsn,D=dsns,t=dsns"], + ["autodetecting everything"] + ) +{ + my $test = shift @$args; + $output = output( + sub { pt_table_checksum::main($master_dsn, + @$args, + qw(-d test)) + }, + stderr => 1, + ); -is( - PerconaTest::count_checksum_results($output, 'diffs'), - 1, - "...check all nodes: 1 diff" -) or diag($output); + is( + PerconaTest::count_checksum_results($output, 'diffs'), + 1, + "...check all nodes: 1 diff ($test)" + ) or diag($output); -# 11-17T13:02:54 0 1 26 1 0 0.021 test.t -like( - $output, - qr/^\S+\s+ # ts - 0\s+ # errors - 1\s+ # diffs - 26\s+ # rows - \d+\s+ # chunks - 0\s+ # skipped - \S+\s+ # time - test.t$ # table - /xm, - "...check all nodes: it's in test.t" -); + # 11-17T13:02:54 0 1 26 1 0 0.021 test.t + like( + $output, + qr/^\S+\s+ # ts + 0\s+ # errors + 1\s+ # diffs + 26\s+ # rows + \d+\s+ # chunks + 0\s+ # skipped + \S+\s+ # time + test.t$ # table + /xm, + "...check all nodes: it's in test.t ($test)" + ); -like( - $output, - qr/Diffs will only be detected if the cluster is consistent with h=127.1,P=12345 because h=127.1,P=12349/, - "Warns that diffs only detected if cluster consistent with direct replica", -); + like( + $output, + qr/Diffs will only be detected if the cluster is consistent with h=127.1,P=12345 because h=127.1,P=12349/, + "Warns that diffs only detected if cluster consistent with direct replica ($test)", + ); -# Restore node1 so the cluster is consistent, but then make node2 differ. -# ptc should NOT detect this diff because the checksum query will replicate -# to node1, node1 isn't different, so it broadcasts the result in ROW format -# that all is ok, which node2 gets and thus false reports. This is why -# those ^ warnings exist. -$node1->do("set sql_log_bin=0"); -$node1->do("update test.t set c='z' where c='zebra'"); -$node1->do("set sql_log_bin=1"); + # Restore node1 so the cluster is consistent, but then make node2 differ. + # ptc should NOT detect this diff because the checksum query will replicate + # to node1, node1 isn't different, so it broadcasts the result in ROW format + # that all is ok, which node2 gets and thus false reports. This is why + # those ^ warnings exist. + $node1->do("set sql_log_bin=0"); + $node1->do("update test.t set c='z' where c='zebra'"); + $node1->do("set sql_log_bin=1"); -$node2->do("set sql_log_bin=0"); -$node2->do("update test.t set c='zebra' where c='z'"); -$node2->do("set sql_log_bin=1"); + $node2->do("set sql_log_bin=0"); + $node2->do("update test.t set c='zebra' where c='z'"); + $node2->do("set sql_log_bin=1"); -($row) = $node2->selectrow_array("select c from test.t order by c desc limit 1"); -is( - $row, - "zebra", - "Node2 is changed again" -); + ($row) = $node2->selectrow_array("select c from test.t order by c desc limit 1"); + is( + $row, + "zebra", + "Node2 is changed again ($test)" + ); -($row) = $node1->selectrow_array("select c from test.t order by c desc limit 1"); -is( - $row, - "z", - "Node1 not changed again" -); + ($row) = $node1->selectrow_array("select c from test.t order by c desc limit 1"); + is( + $row, + "z", + "Node1 not changed again ($test)" + ); -($row) = $node3->selectrow_array("select c from test.t order by c desc limit 1"); -is( - $row, - "z", - "Node3 not changed again" -); + ($row) = $node3->selectrow_array("select c from test.t order by c desc limit 1"); + is( + $row, + "z", + "Node3 not changed again ($test)" + ); -# the other DSN table with all three nodes, but it won't matter because -# node1 is going to broadcast the false-positive that there are no diffs. -$output = output( - sub { pt_table_checksum::main($master_dsn, - '--recursion-method', "dsn=$node1_dsn,D=dsns,t=dsns", - qw(-d test)) - }, - stderr => 1, -); + # the other DSN table with all three nodes, but it won't matter because + # node1 is going to broadcast the false-positive that there are no diffs. + $output = output( + sub { pt_table_checksum::main($master_dsn, + @$args, + qw(-d test)) + }, + stderr => 1, + ); -is( - PerconaTest::count_checksum_results($output, 'diffs'), - 0, - "Limitation: diff not on direct replica not detected" -) or diag($output); + is( + PerconaTest::count_checksum_results($output, 'diffs'), + 0, + "Limitation: diff not on direct replica not detected ($test)" + ) or diag($output); + +} # ########################################################################### # Be sure to stop the slave on node1, else further test will die with: