PT-2168 pt-osc shouldnt fail while unable to monitor a replica node (#676)

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Proof of concept
- Fixed regular expression in lib/TableParser.pm mistakenly chaged in the tool's code

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added basic test case for PT-2168
- Added more details for replica lag information
- Disconnecting replica if lag is not checked. This prevents "Too many
  connections" error

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Implemented option --wait-lost-replicas for pt-osc, added test case

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added more tests for situations where connection to the replica can
  fail

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Removed extra checks for wait_no_die variable
- Added test cases for SQL queries that pt-osc sends to replicas

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Allow to reload dsns table while waiting for missed replica if
  --recursion-method is dsn
- Fixed logic in replica rediscovery, so it works with replicas on the
  same host but with different ports
- Renamed option wait-lost-replicas to fail-on-stopped-replication, so
  it is in line with pt-table-checksum
- Adjusted tests
- Removed debug code for PT-1760
- Added test case for PT-1760
- Added exception for variable Open_tables_with_triggers in
  lib/bash/collect.sh due to failed test in Percona Server 8.0.34+
- Updated pt-stalk

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Updated modules
- Fixed typo in t/pt-table-sync/bidirectional.t
- Removed trailing whitespaces in lib/MasterSlave.pm

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Help for option --fail-on-stopped-replication

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added check for availability of the simple_rewrite_plugin in t/pt-online-schema-change/pt-2168.t

* PT-2168 - PT-OSC shouldn't fail while unable to monitor a replica node

- Added link to the simple_rewrite_plugin source code
- Removed tests for code that runs only in the beginning of pt-osc
  action, so should not be affected by the option fail-on-stopped-replication
This commit is contained in:
Sveta Smirnova
2023-09-19 23:31:33 +03:00
committed by GitHub
parent 36a183e2aa
commit f8c43118bd
19 changed files with 1287 additions and 202 deletions

View File

@@ -104,6 +104,7 @@ sub get_slaves {
push @$slaves, $make_cxn->(dsn => $slave_dsn, dbh => $dbh, parent => $parent);
return;
},
wait_no_die => $args{'wait_no_die'},
}
);
} elsif ( $methods->[0] =~ m/^dsn=/i ) {
@@ -111,6 +112,7 @@ sub get_slaves {
$slaves = $self->get_cxn_from_dsn_table(
%args,
dsn_table_dsn => $dsn_table_dsn,
wait_no_die => $args{'wait_no_die'},
);
}
elsif ( $methods->[0] =~ m/none/i ) {
@@ -188,6 +190,20 @@ sub recurse_to_slaves {
my $dbh = $args->{dbh};
my $get_dbh = sub {
eval {
$dbh = $dp->get_dbh(
$dp->get_cxn_params($slave_dsn), { AutoCommit => 1 }
);
PTDEBUG && _d('Connected to', $dp->as_string($slave_dsn));
};
if ( $EVAL_ERROR ) {
print STDERR "Cannot connect to ", $dp->as_string($slave_dsn), ": ", $EVAL_ERROR, "\n"
or die "Cannot print: $OS_ERROR";
return;
}
};
DBH: {
if ( !defined $dbh ) {
foreach my $known_slave ( @{$args->{slaves}} ) {
@@ -197,23 +213,29 @@ sub recurse_to_slaves {
last DBH;
}
}
eval {
$dbh = $dp->get_dbh(
$dp->get_cxn_params($slave_dsn), { AutoCommit => 1 });
PTDEBUG && _d('Connected to', $dp->as_string($slave_dsn));
};
if ( $EVAL_ERROR ) {
print STDERR "Cannot connect to ", $dp->as_string($slave_dsn), ": ", $EVAL_ERROR, "\n"
or die "Cannot print: $OS_ERROR";
return;
}
$get_dbh->();
}
}
my $sql = 'SELECT @@SERVER_ID';
PTDEBUG && _d($sql);
my ($id) = $dbh->selectrow_array($sql);
my $id = undef;
do {
eval {
($id) = $dbh->selectrow_array($sql);
};
if ( $EVAL_ERROR ) {
if ( $args->{wait_no_die} ) {
print STDERR "Error getting server id: ", $EVAL_ERROR,
"\nRetrying query for server ", $slave_dsn->{h}, ":", $slave_dsn->{P}, "\n";
sleep 1;
$dbh->disconnect();
$get_dbh->();
} else {
die $EVAL_ERROR;
}
}
} until ($id);
PTDEBUG && _d('Working on server ID', $id);
my $master_thinks_i_am = $dsn->{server_id};
if ( !defined $id
@@ -994,18 +1016,39 @@ sub get_cxn_from_dsn_table {
. "or a database-qualified table (t)";
}
my $done = 0;
my $dsn_tbl_cxn = $make_cxn->(dsn => $dsn);
my $dbh = $dsn_tbl_cxn->connect();
my $sql = "SELECT dsn FROM $dsn_table ORDER BY id";
PTDEBUG && _d($sql);
my $dsn_strings = $dbh->selectcol_arrayref($sql);
my @cxn;
if ( $dsn_strings ) {
foreach my $dsn_string ( @$dsn_strings ) {
PTDEBUG && _d('DSN from DSN table:', $dsn_string);
push @cxn, $make_cxn->(dsn_string => $dsn_string);
use Data::Dumper;
DSN:
do {
@cxn = ();
my $dsn_strings = $dbh->selectcol_arrayref($sql);
if ( $dsn_strings ) {
foreach my $dsn_string ( @$dsn_strings ) {
PTDEBUG && _d('DSN from DSN table:', $dsn_string);
if ($args{wait_no_die}) {
my $lcxn;
eval {
$lcxn = $make_cxn->(dsn_string => $dsn_string);
};
if ( $EVAL_ERROR && ($dsn_tbl_cxn->lost_connection($EVAL_ERROR)
|| $EVAL_ERROR =~ m/Can't connect to MySQL server/)) {
PTDEBUG && _d("Server is not accessible, waiting when it is online again");
sleep(1);
goto DSN;
}
push @cxn, $lcxn;
} else {
push @cxn, $make_cxn->(dsn_string => $dsn_string);
}
}
}
}
$done = 1;
} until $done;
return \@cxn;
}

View File

@@ -87,9 +87,9 @@ sub wait {
my ($self) = @_;
my ($slaves, $refresher) = ($self->{slaves}, $self->{get_slaves_cb});
return $slaves if ( not defined $refresher );
my $before = join ' ', sort map {$_->name()} @$slaves;
my $before = join ' ', sort map {$_->description()} @$slaves;
$slaves = $refresher->();
my $after = join ' ', sort map {$_->name()} @$slaves;
my $after = join ' ', sort map {$_->description()} @$slaves;
if ($before ne $after) {
$self->{slaves} = $slaves;
printf STDERR "Slave set to watch has changed\n Was: %s\n Now: %s\n",
@@ -106,9 +106,10 @@ sub wait {
$pr_callback = sub {
my ($fraction, $elapsed, $remaining, $eta, $completed) = @_;
my $dsn_name = $worst->{cxn}->name();
my $dsn_description = $worst->{cxn}->description();
if ( defined $worst->{lag} ) {
print STDERR "Replica lag is " . ($worst->{lag} || '?')
. " seconds on $dsn_name. Waiting.\n";
. " seconds on $dsn_description. Waiting.\n";
}
else {
if ($self->{fail_on_stopped_replication}) {

View File

@@ -392,7 +392,7 @@ collect_mysql_data_two() {
}
open_tables() {
local open_tables=$($CMD_MYSQLADMIN $EXT_ARGV ext | grep "Open_tables" | awk '{print $4}')
local open_tables=$($CMD_MYSQLADMIN $EXT_ARGV ext | grep "Open_tables" | grep -v "Open_tables_with_triggers" | awk '{print $4}')
if [ -n "$open_tables" -a $open_tables -le 1000 ]; then
$CMD_MYSQL $EXT_ARGV -e 'SHOW OPEN TABLES' &
else