PT-1869: Enable slave list reloading (#456)

* PT-1869: Enable slave list reloading

* PT-1869: Fix pt-osc/slave_lag sample sizes for more consistent testing results

* PT-1869: Move slaves_to_skip to get_slaves_cb
This commit is contained in:
Mateus Dubiela Oliveira
2020-08-12 11:30:56 -03:00
committed by GitHub
parent f9b510e22f
commit d6ada6a7bf
4 changed files with 213 additions and 84 deletions

View File

@@ -5015,10 +5015,32 @@ sub wait {
my $worst; # most lagging slave my $worst; # most lagging slave
my $pr_callback; my $pr_callback;
my $pr_first_report; my $pr_first_report;
### Refresh the list of slaves being watched.
### In: self passed to wait()
### Returns: the slave list stored in $self->{slaves} after the refresh.
### If no get_slaves_cb callback is defined the current list is returned
### untouched. Otherwise the callback is invoked (with no arguments) and the
### stored list is replaced only when the set of slave names differs; in that
### case a notice showing the old and new names is printed to STDERR.
my $pr_refresh_slave_list = sub {
my ($self) = @_;
my ($slaves, $refresher) = ($self->{slaves}, $self->{get_slaves_cb});
# No callback configured: the slave list is static.
return $slaves if ( not defined $refresher );
# Compare the sets by their sorted, space-joined names so that ordering
# differences alone do not count as a change.
my $before = join ' ', sort map {$_->name()} @$slaves;
$slaves = $refresher->();
my $after = join ' ', sort map {$_->name()} @$slaves;
if ($before ne $after) {
$self->{slaves} = $slaves;
printf STDERR "Slave set to watch has changed\n Was: %s\n Now: %s\n",
$before, $after;
}
# When the names are unchanged the previously stored list is returned,
# not the freshly fetched one.
return($self->{slaves});
};
$slaves = $pr_refresh_slave_list->($self);
if ( $pr ) { if ( $pr ) {
# If you use the default Progress report callback, you'll need to
# add Transformers.pm to this tool.
$pr_callback = sub { $pr_callback = sub {
my ($fraction, $elapsed, $remaining, $eta, $completed) = @_; my ($fraction, $elapsed, $remaining, $eta, $completed) = @_;
my $dsn_name = $worst->{cxn}->{dsn_name}; my $dsn_name = $worst->{cxn}->name();
if ( defined $worst->{lag} ) { if ( defined $worst->{lag} ) {
print STDERR "Replica lag is " . ($worst->{lag} || '?') print STDERR "Replica lag is " . ($worst->{lag} || '?')
. " seconds on $dsn_name. Waiting.\n"; . " seconds on $dsn_name. Waiting.\n";
@@ -5033,21 +5055,34 @@ sub wait {
}; };
$pr->set_callback($pr_callback); $pr->set_callback($pr_callback);
# If a replica is stopped, don't wait 30s (or whatever interval)
# to report this. Instead, report it once, immediately, then
# keep reporting it every interval.
$pr_first_report = sub { $pr_first_report = sub {
my $dsn_name = $worst->{cxn}->{dsn_name}; my $dsn_name = $worst->{cxn}->name();
if ( !defined $worst->{lag} ) { if ( !defined $worst->{lag} ) {
if ($self->{fail_on_stopped_replication}) { if ($self->{fail_on_stopped_replication}) {
die 'replication is stopped'; die 'replication is stopped';
} }
print STDERR "(2) Replica $dsn_name is stopped. Waiting.\n"; print STDERR "(2) Replica '$dsn_name' is stopped. Waiting.\n";
} }
return; return;
}; };
} }
# First check all slaves.
my @lagged_slaves = map { {cxn=>$_, lag=>undef} } @$slaves; my @lagged_slaves = map { {cxn=>$_, lag=>undef} } @$slaves;
while ( $oktorun->() && @lagged_slaves ) { while ( $oktorun->() && @lagged_slaves ) {
PTDEBUG && _d('Checking slave lag'); PTDEBUG && _d('Checking slave lag');
### while we were waiting our list of slaves may have changed
$slaves = $pr_refresh_slave_list->($self);
my $watched = 0;
@lagged_slaves = grep {
my $slave_name = $_->{cxn}->name();
grep {$slave_name eq $_->name()} @{$slaves // []}
} @lagged_slaves;
for my $i ( 0..$#lagged_slaves ) { for my $i ( 0..$#lagged_slaves ) {
my $lag; my $lag;
eval { eval {
@@ -5066,8 +5101,10 @@ sub wait {
} }
} }
# Remove slaves that aren't lagging.
@lagged_slaves = grep { defined $_ } @lagged_slaves; @lagged_slaves = grep { defined $_ } @lagged_slaves;
if ( @lagged_slaves ) { if ( @lagged_slaves ) {
# Sort lag, undef is highest because it means the slave is stopped.
@lagged_slaves = reverse sort { @lagged_slaves = reverse sort {
defined $a->{lag} && defined $b->{lag} ? $a->{lag} <=> $b->{lag} defined $a->{lag} && defined $b->{lag} ? $a->{lag} <=> $b->{lag}
: defined $a->{lag} ? -1 : defined $a->{lag} ? -1
@@ -5078,6 +5115,10 @@ sub wait {
$worst->{lag}, 'on', Dumper($worst->{cxn}->dsn())); $worst->{lag}, 'on', Dumper($worst->{cxn}->dsn()));
if ( $pr ) { if ( $pr ) {
# There's no real progress because we can't estimate how long
# it will take all slaves to catch up. The progress reports
# are just to inform the user every 30s which slave is still
# lagging the most.
$pr->update( $pr->update(
sub { return 0; }, sub { return 0; },
first_report => $pr_first_report, first_report => $pr_first_report,
@@ -8759,13 +8800,42 @@ sub main {
channel => $o->get('channel'), channel => $o->get('channel'),
); );
$slaves = $ms->get_slaves( my $slaves_to_skip = $o->get('skip-check-slave-lag');
dbh => $cxn->dbh(),
dsn => $cxn->dsn(), my $get_slaves_cb = sub {
make_cxn => sub { my ($intolerant) = @_;
return $make_cxn->(@_, prev_dsn => $cxn->dsn()); my $slaves =$ms->get_slaves(
}, dbh => $cxn->dbh(),
); dsn => $cxn->dsn(),
make_cxn => sub {
return $make_cxn->(
@_,
prev_dsn => $cxn->dsn(),
errok => (not $intolerant)
);
},
);
if ($slaves_to_skip) {
my $filtered_slaves = [];
for my $slave (@$slaves) {
for my $slave_to_skip (@$slaves_to_skip) {
if ($slave->{dsn}->{h} eq $slave_to_skip->{h} && $slave->{dsn}->{P} eq $slave_to_skip->{P}) {
print "Skipping slave " . $slave->description() . "\n";
} else {
push @$filtered_slaves, $slave;
}
}
}
$slaves = $filtered_slaves;
}
return $slaves;
};
### first ever call only: do not tolerate connection errors
$slaves = $get_slaves_cb->('intolerant');
PTDEBUG && _d(scalar @$slaves, 'slaves found'); PTDEBUG && _d(scalar @$slaves, 'slaves found');
if ( scalar @$slaves ) { if ( scalar @$slaves ) {
print "Found " . scalar(@$slaves) . " slaves:\n"; print "Found " . scalar(@$slaves) . " slaves:\n";
@@ -8789,6 +8859,7 @@ sub main {
#prev_dsn => $cxn->dsn(), #prev_dsn => $cxn->dsn(),
); );
$slave_lag_cxns = [ $cxn ]; $slave_lag_cxns = [ $cxn ];
$get_slaves_cb = undef;
} }
else { else {
PTDEBUG && _d('Will check slave lag on all slaves'); PTDEBUG && _d('Will check slave lag on all slaves');
@@ -8796,31 +8867,9 @@ sub main {
} }
if ( $slave_lag_cxns && scalar @$slave_lag_cxns ) { if ( $slave_lag_cxns && scalar @$slave_lag_cxns ) {
if ($o->get('skip-check-slave-lag')) { print "Will check slave lag on:\n";
my $slaves_to_skip = $o->get('skip-check-slave-lag'); foreach my $cxn ( @$slave_lag_cxns ) {
my $filtered_slaves = []; print $cxn->description()."\n";
for my $slave (@$slave_lag_cxns) {
my $found=0;
for my $slave_to_skip (@$slaves_to_skip) {
if ($slave->{dsn}->{h} eq $slave_to_skip->{h} && $slave->{dsn}->{P} eq $slave_to_skip->{P}) {
$found=1;
}
}
if ($found) {
print "Skipping slave ". $slave->description()."\n";
} else {
push @$filtered_slaves, $slave;
}
}
$slave_lag_cxns = $filtered_slaves;
}
if (!scalar @$slave_lag_cxns) {
print "Not checking slave lag because all slaves were skipped\n";
} else{
print "Will check slave lag on:\n";
foreach my $cxn ( @$slave_lag_cxns ) {
print $cxn->description()."\n";
}
} }
} }
else { else {
@@ -8931,11 +8980,12 @@ sub main {
} }
$replica_lag = new ReplicaLagWaiter( $replica_lag = new ReplicaLagWaiter(
slaves => $slave_lag_cxns, slaves => $slave_lag_cxns,
max_lag => $o->get('max-lag'), get_slaves_cb => $get_slaves_cb,
oktorun => sub { return $oktorun }, max_lag => $o->get('max-lag'),
get_lag => $get_lag, oktorun => sub { return $oktorun },
sleep => $sleep, get_lag => $get_lag,
sleep => $sleep,
); );
my $get_status; my $get_status;

View File

@@ -80,6 +80,26 @@ sub wait {
my $worst; # most lagging slave my $worst; # most lagging slave
my $pr_callback; my $pr_callback;
my $pr_first_report; my $pr_first_report;
### Refresh the list of slaves being watched.
### In: self passed to wait()
### Returns: the slave list stored in $self->{slaves} after the refresh.
### If no get_slaves_cb callback is defined the current list is returned
### untouched. Otherwise the callback is invoked (with no arguments) and the
### stored list is replaced only when the set of slave names differs; in that
### case a notice showing the old and new names is printed to STDERR.
my $pr_refresh_slave_list = sub {
my ($self) = @_;
my ($slaves, $refresher) = ($self->{slaves}, $self->{get_slaves_cb});
# No callback configured: the slave list is static.
return $slaves if ( not defined $refresher );
# Compare the sets by their sorted, space-joined names so that ordering
# differences alone do not count as a change.
my $before = join ' ', sort map {$_->name()} @$slaves;
$slaves = $refresher->();
my $after = join ' ', sort map {$_->name()} @$slaves;
if ($before ne $after) {
$self->{slaves} = $slaves;
printf STDERR "Slave set to watch has changed\n Was: %s\n Now: %s\n",
$before, $after;
}
# When the names are unchanged the previously stored list is returned,
# not the freshly fetched one.
return($self->{slaves});
};
$slaves = $pr_refresh_slave_list->($self);
if ( $pr ) { if ( $pr ) {
# If you use the default Progress report callback, you'll need to # If you use the default Progress report callback, you'll need to
# to add Transformers.pm to this tool. # to add Transformers.pm to this tool.
@@ -119,8 +139,23 @@ sub wait {
my @lagged_slaves = map { {cxn=>$_, lag=>undef} } @$slaves; my @lagged_slaves = map { {cxn=>$_, lag=>undef} } @$slaves;
while ( $oktorun->() && @lagged_slaves ) { while ( $oktorun->() && @lagged_slaves ) {
PTDEBUG && _d('Checking slave lag'); PTDEBUG && _d('Checking slave lag');
### while we were waiting our list of slaves may have changed
$slaves = $pr_refresh_slave_list->($self);
my $watched = 0;
@lagged_slaves = grep {
my $slave_name = $_->{cxn}->name();
grep {$slave_name eq $_->name()} @{$slaves // []}
} @lagged_slaves;
for my $i ( 0..$#lagged_slaves ) { for my $i ( 0..$#lagged_slaves ) {
my $lag = $get_lag->($lagged_slaves[$i]->{cxn}); my $lag;
eval {
$lag = $get_lag->($lagged_slaves[$i]->{cxn});
};
if ($EVAL_ERROR) {
die $EVAL_ERROR;
}
PTDEBUG && _d($lagged_slaves[$i]->{cxn}->name(), PTDEBUG && _d($lagged_slaves[$i]->{cxn}->name(),
'slave lag:', $lag); 'slave lag:', $lag);
if ( !defined $lag || $lag > $max_lag ) { if ( !defined $lag || $lag > $max_lag ) {

View File

@@ -17,12 +17,12 @@ use Data::Dumper;
use PerconaTest; use PerconaTest;
use Sandbox; use Sandbox;
use SqlModes; use SqlModes;
use File::Temp qw/ tempdir /; use File::Temp qw/ tempdir tempfile /;
if ($ENV{PERCONA_SLOW_BOX}) { if ($ENV{PERCONA_SLOW_BOX}) {
plan skip_all => 'This test needs a fast machine'; plan skip_all => 'This test needs a fast machine';
} else { } else {
plan tests => 4; plan tests => 6;
} }
our $delay = 30; our $delay = 30;
@@ -37,6 +37,7 @@ my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
my $master_dbh = $sb->get_dbh_for('master'); my $master_dbh = $sb->get_dbh_for('master');
my $slave_dbh = $sb->get_dbh_for('slave1'); my $slave_dbh = $sb->get_dbh_for('slave1');
my $master_dsn = 'h=127.0.0.1,P=12345,u=msandbox,p=msandbox'; my $master_dsn = 'h=127.0.0.1,P=12345,u=msandbox,p=msandbox';
my $slave_dsn = 'h=127.0.0.1,P=12346,u=msandbox,p=msandbox';
if ( !$master_dbh ) { if ( !$master_dbh ) {
plan skip_all => 'Cannot connect to sandbox master'; plan skip_all => 'Cannot connect to sandbox master';
@@ -58,19 +59,19 @@ $slave_dbh->do('STOP SLAVE');
$slave_dbh->do('RESET SLAVE'); $slave_dbh->do('RESET SLAVE');
$slave_dbh->do('START SLAVE'); $slave_dbh->do('START SLAVE');
diag('Loading test data');
$sb->load_file('master', "t/pt-online-schema-change/samples/slave_lag.sql");
my $num_rows = 5000;
diag("Loading $num_rows into the table. This might take some time.");
diag(`util/mysql_random_data_load --host=127.0.0.1 --port=12345 --user=msandbox --password=msandbox test pt178 --bulk-size=1 --max-threads=1 $num_rows`);
diag("Setting slave delay to $delay seconds"); diag("Setting slave delay to $delay seconds");
$slave_dbh->do('STOP SLAVE'); $slave_dbh->do('STOP SLAVE');
$slave_dbh->do("CHANGE MASTER TO MASTER_DELAY=$delay"); $slave_dbh->do("CHANGE MASTER TO MASTER_DELAY=$delay");
$slave_dbh->do('START SLAVE'); $slave_dbh->do('START SLAVE');
diag('Loading test data');
$sb->load_file('master', "t/pt-online-schema-change/samples/slave_lag.sql");
my $num_rows = 10000;
diag("Loading $num_rows into the table. This might take some time.");
diag(`util/mysql_random_data_load --host=127.0.0.1 --port=12345 --user=msandbox --password=msandbox test pt178 $num_rows`);
# Run a full table scan query to ensure the slave is behind the master # Run a full table scan query to ensure the slave is behind the master
# There is no query cache in MySQL 8.0+ # There is no query cache in MySQL 8.0+
reset_query_cache($master_dbh, $master_dbh); reset_query_cache($master_dbh, $master_dbh);
@@ -80,7 +81,7 @@ $master_dbh->do('UPDATE `test`.`pt178` SET f2 = f2 + 1 WHERE f1 = ""');
# pt-online-schema-change will wait on the slave at port 12346 # pt-online-schema-change will wait on the slave at port 12346
my $max_lag = $delay / 2; my $max_lag = $delay / 2;
my $args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 1 --max-lag 5 --alter 'ENGINE=InnoDB' --pid $tmp_file_name"; my $args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 10 --max-lag $max_lag --alter 'ENGINE=InnoDB' --pid $tmp_file_name";
diag("Starting base test. This is going to take some time due to the delay in the slave"); diag("Starting base test. This is going to take some time due to the delay in the slave");
diag("pid: $tmp_file_name"); diag("pid: $tmp_file_name");
my $output = `$trunk/bin/pt-online-schema-change $args 2>&1`; my $output = `$trunk/bin/pt-online-schema-change $args 2>&1`;
@@ -92,8 +93,8 @@ like(
); );
# Repeat the test now using --check-slave-lag # Repeat the test now using --check-slave-lag
$args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 1 --max-lag 5 --alter 'ENGINE=InnoDB' " $args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 1 --max-lag $max_lag --alter 'ENGINE=InnoDB' "
. "--check-slave-lag h=127.0.0.1,P=12346,u=msandbox,p=msandbox,D=test,t=sbtest"; . "--check-slave-lag h=127.0.0.1,P=12346,u=msandbox,p=msandbox,D=test,t=sbtest --pid $tmp_file_name";
# Run a full table scan query to ensure the slave is behind the master # Run a full table scan query to ensure the slave is behind the master
reset_query_cache($master_dbh, $master_dbh); reset_query_cache($master_dbh, $master_dbh);
@@ -108,13 +109,56 @@ like(
"--check-slave-lag waits on the correct slave", "--check-slave-lag waits on the correct slave",
); );
# Repeat the test, now adding and removing a slave during the process
$args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 1 --max-lag $max_lag --alter 'ENGINE=InnoDB' "
. "--recursion-method=dsn=D=test,t=dynamic_replicas --recurse 0 --pid $tmp_file_name";
$master_dbh->do('CREATE TABLE `test`.`dynamic_replicas` (id INTEGER PRIMARY KEY, dsn VARCHAR(255) )');
$master_dbh->do("INSERT INTO `test`.`dynamic_replicas` (id, dsn) VALUES (1, '$slave_dsn')");
# Run a full table scan query to ensure the slave is behind the master
reset_query_cache($master_dbh, $master_dbh);
$master_dbh->do('UPDATE `test`.`pt178` SET f2 = f2 + 1 WHERE f1 = ""');
diag("Starting --recursion-method with changes during the process");
my ($fh, $filename) = tempfile();
my $pid = fork();
if (!$pid) {
open(STDERR, '>', $filename);
open(STDOUT, '>', $filename);
exec("$trunk/bin/pt-online-schema-change $args");
}
sleep(60);
$master_dbh->do("DELETE FROM `test`.`dynamic_replicas` WHERE id = 1;");
waitpid($pid, 0);
$output = do {
local $/ = undef;
<$fh>;
};
unlink $filename;
like(
$output,
qr/Slave set to watch has changed/s,
"--recursion-method=dsn updates the slave list",
);
like(
$output,
qr/Replica lag is \d+ seconds on .* Waiting/s,
"--recursion-method waits on a replica",
);
# Repeat the test now using --skip-check-slave-lag # Repeat the test now using --skip-check-slave-lag
# Run a full table scan query to ensure the slave is behind the master # Run a full table scan query to ensure the slave is behind the master
reset_query_cache($master_dbh, $master_dbh); reset_query_cache($master_dbh, $master_dbh);
$master_dbh->do('UPDATE `test`.`pt178` SET f2 = f2 + 1 WHERE f1 = ""'); $master_dbh->do('UPDATE `test`.`pt178` SET f2 = f2 + 1 WHERE f1 = ""');
$args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 1 --max-lag 5 --alter 'ENGINE=InnoDB' " $args = "$master_dsn,D=test,t=pt178 --execute --chunk-size 1 --max-lag $max_lag --alter 'ENGINE=InnoDB' "
. "--skip-check-slave-lag h=127.0.0.1,P=12346,u=msandbox,p=msandbox,D=test,t=sbtest"; . "--skip-check-slave-lag h=127.0.0.1,P=12346,u=msandbox,p=msandbox,D=test,t=sbtest --pid $tmp_file_name";
diag("Starting --skip-check-slave-lag test. This is going to take some time due to the delay in the slave"); diag("Starting --skip-check-slave-lag test. This is going to take some time due to the delay in the slave");
$output = `$trunk/bin/pt-online-schema-change $args 2>&1`; $output = `$trunk/bin/pt-online-schema-change $args 2>&1`;