PT-2168 pt-osc shouldn't fail while unable to monitor a replica node (#676)

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Proof of concept
- Fixed regular expression in lib/TableParser.pm mistakenly changed in the tool's code

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added basic test case for PT-2168
- Added more details for replica lag information
- Disconnecting replica if lag is not checked. This prevents the "Too many
  connections" error

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Implemented option --wait-lost-replicas for pt-osc, added test case

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added more tests for situations where connection to the replica can
  fail

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Removed extra checks for wait_no_die variable
- Added test cases for SQL queries that pt-osc sends to replicas

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Allow reloading the dsns table while waiting for a missing replica if
  --recursion-method is dsn
- Fixed logic in replica rediscovery, so it works with replicas on the
  same host but with different ports
- Renamed option wait-lost-replicas to fail-on-stopped-replication, so
  it is in line with pt-table-checksum
- Adjusted tests
- Removed debug code for PT-1760
- Added test case for PT-1760
- Added exception for variable Open_tables_with_triggers in
  lib/bash/collect.sh due to failed test in Percona Server 8.0.34+
- Updated pt-stalk

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Updated modules
- Fixed typo in t/pt-table-sync/bidirectional.t
- Removed trailing whitespaces in lib/MasterSlave.pm

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Help for option --fail-on-stopped-replication

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added check for availability of the simple_rewrite_plugin in t/pt-online-schema-change/pt-2168.t

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added link to the simple_rewrite_plugin source code
- Removed tests for code that runs only at the beginning of the pt-osc
  action, so it should not be affected by the option fail-on-stopped-replication
This commit is contained in:
Sveta Smirnova
2023-09-19 23:31:33 +03:00
committed by GitHub
parent 36a183e2aa
commit f8c43118bd
19 changed files with 1287 additions and 202 deletions

View File

@@ -2734,6 +2734,7 @@ sub get_slaves {
push @$slaves, $make_cxn->(dsn => $slave_dsn, dbh => $dbh, parent => $parent);
return;
},
wait_no_die => $args{'wait_no_die'},
}
);
} elsif ( $methods->[0] =~ m/^dsn=/i ) {
@@ -2741,6 +2742,7 @@ sub get_slaves {
$slaves = $self->get_cxn_from_dsn_table(
%args,
dsn_table_dsn => $dsn_table_dsn,
wait_no_die => $args{'wait_no_die'},
);
}
elsif ( $methods->[0] =~ m/none/i ) {
@@ -2796,6 +2798,20 @@ sub recurse_to_slaves {
my $dbh = $args->{dbh};
my $get_dbh = sub {
eval {
$dbh = $dp->get_dbh(
$dp->get_cxn_params($slave_dsn), { AutoCommit => 1 }
);
PTDEBUG && _d('Connected to', $dp->as_string($slave_dsn));
};
if ( $EVAL_ERROR ) {
print STDERR "Cannot connect to ", $dp->as_string($slave_dsn), ": ", $EVAL_ERROR, "\n"
or die "Cannot print: $OS_ERROR";
return;
}
};
DBH: {
if ( !defined $dbh ) {
foreach my $known_slave ( @{$args->{slaves}} ) {
@@ -2805,23 +2821,29 @@ sub recurse_to_slaves {
last DBH;
}
}
eval {
$dbh = $dp->get_dbh(
$dp->get_cxn_params($slave_dsn), { AutoCommit => 1 });
PTDEBUG && _d('Connected to', $dp->as_string($slave_dsn));
};
if ( $EVAL_ERROR ) {
print STDERR "Cannot connect to ", $dp->as_string($slave_dsn), ": ", $EVAL_ERROR, "\n"
or die "Cannot print: $OS_ERROR";
return;
}
$get_dbh->();
}
}
my $sql = 'SELECT @@SERVER_ID';
PTDEBUG && _d($sql);
my ($id) = $dbh->selectrow_array($sql);
my $id = undef;
do {
eval {
($id) = $dbh->selectrow_array($sql);
};
if ( $EVAL_ERROR ) {
if ( $args->{wait_no_die} ) {
print STDERR "Error getting server id: ", $EVAL_ERROR,
"\nRetrying query for server ", $slave_dsn->{h}, ":", $slave_dsn->{P}, "\n";
sleep 1;
$dbh->disconnect();
$get_dbh->();
} else {
die $EVAL_ERROR;
}
}
} until ($id);
PTDEBUG && _d('Working on server ID', $id);
my $master_thinks_i_am = $dsn->{server_id};
if ( !defined $id
@@ -3443,18 +3465,39 @@ sub get_cxn_from_dsn_table {
. "or a database-qualified table (t)";
}
my $done = 0;
my $dsn_tbl_cxn = $make_cxn->(dsn => $dsn);
my $dbh = $dsn_tbl_cxn->connect();
my $sql = "SELECT dsn FROM $dsn_table ORDER BY id";
PTDEBUG && _d($sql);
my $dsn_strings = $dbh->selectcol_arrayref($sql);
my @cxn;
if ( $dsn_strings ) {
foreach my $dsn_string ( @$dsn_strings ) {
PTDEBUG && _d('DSN from DSN table:', $dsn_string);
push @cxn, $make_cxn->(dsn_string => $dsn_string);
use Data::Dumper;
DSN:
do {
@cxn = ();
my $dsn_strings = $dbh->selectcol_arrayref($sql);
if ( $dsn_strings ) {
foreach my $dsn_string ( @$dsn_strings ) {
PTDEBUG && _d('DSN from DSN table:', $dsn_string);
if ($args{wait_no_die}) {
my $lcxn;
eval {
$lcxn = $make_cxn->(dsn_string => $dsn_string);
};
if ( $EVAL_ERROR && ($dsn_tbl_cxn->lost_connection($EVAL_ERROR)
|| $EVAL_ERROR =~ m/Can't connect to MySQL server/)) {
PTDEBUG && _d("Server is not accessible, waiting when it is online again");
sleep(1);
goto DSN;
}
push @cxn, $lcxn;
} else {
push @cxn, $make_cxn->(dsn_string => $dsn_string);
}
}
}
}
$done = 1;
} until $done;
return \@cxn;
}