Compare commits

...

21 Commits

Author SHA1 Message Date
Aly Kafoury
c07d9e9f9e update pod selection and confirming crashed state 2020-10-01 14:57:58 +02:00
Aly Kafoury
9fbb1c0352 updating error messages 2020-09-15 19:31:07 +02:00
Aly Kafoury
e2dab351c6 update get pods command 2020-09-15 19:22:30 +02:00
AlyHKafoury
20bacb32f4 Update src/go/pt-k8s-pxc-recovery/recover/recover.go
Co-authored-by: Andrew Pogrebnoy <absourd.noise@gmail.com>
2020-09-15 18:23:11 +03:00
Aly Kafoury
e0ae594493 adding cluster verification step 2020-08-24 19:27:56 +02:00
Aly Kafoury
7fb4cfaa6c adding custom debug image 2020-08-24 14:58:59 +02:00
Aly Kafoury
5379899d8c fix pr comments Aug-24-2020 2020-08-24 14:14:34 +02:00
Aly Kafoury
7c07edbd13 finishing remaining steps 2020-08-17 18:23:24 +02:00
Aly Kafoury
c635b3eff7 add cluster struct 2020-07-19 19:07:39 +02:00
AlyHKafoury
e5df960bf4 minor bug fixes 2020-07-13 13:05:39 +02:00
AlyHKafoury
15f400bd52 exec commands 2020-07-10 22:11:32 +02:00
AlyHKafoury
2a9f4e4cda find most recent pod 2020-07-09 20:04:06 +02:00
AlyHKafoury
6803ed064e waiting for all pods to be ready 2020-07-07 21:20:49 +02:00
AlyHKafoury
e475428acf wait for podzero to be ready 2020-07-06 22:12:21 +02:00
AlyHKafoury
e6dc63c68b prepartion steps 2020-07-06 21:25:47 +02:00
AlyHKafoury
7016982726 init recovery tool 2020-07-05 20:12:15 +02:00
Carlos Salguero
de27179da8 Merge pull request #453 from percona/PT-1853_self_ref_fks
PT-1853 Added --no-check-foreing-keys to pt-osc
2020-06-30 21:50:50 -03:00
Carlos
8ff3451362 PT-1853 Changed wording 2020-06-30 20:54:08 -03:00
Carlos
9f2b72e0df PT-1853 Added disable fk checks in MySQL 2020-06-30 20:09:39 -03:00
Carlos
2e62d07ba0 PT-1853 Disabled FK checks in MySQL 2020-06-30 10:12:27 -03:00
Carlos Salguero
c6b4bd747e PT-1852 Added --no-check-foreing-keys to pt-osc 2020-06-21 18:53:47 -03:00
8 changed files with 597 additions and 13 deletions

View File

@@ -8594,6 +8594,12 @@ sub main {
# ########################################################################
# Connection hook: when --no-check-foreign-keys is in effect, disable
# foreign_key_checks on every new connection so operations on tables
# with (self-referencing) FKs do not error out.
my $set_on_connect = sub {
my ($dbh) = @_;
if (!$o->get('check-foreign-keys')) {
my $sql = "SET foreign_key_checks=0";
PTDEBUG && _d($sql);
# Echo the statement when --print was given.
print $sql, "\n" if $o->get('print');
$dbh->do($sql);
}
return;
};
@@ -9102,6 +9108,15 @@ sub main {
$child_table->{name},
$child_table->{row_est} || '?';
}
# TODO: Fix self referencing foreign keys handling.
# See: https://jira.percona.com/browse/PT-1802
# https://jira.percona.com/browse/PT-1853
if (_has_self_ref_fks($orig_tbl->{db}, $orig_tbl->{tbl}, $child_tables) && $o->get('check-foreign-keys')) {
print "The table has self-referencing foreign keys and that might lead to errors.\n";
print "Use --no-check-foreign-keys to disable this check.\n";
return 1;
}
if ( $alter_fk_method ) {
# Let the user know how we're going to update the child table
@@ -10396,6 +10411,20 @@ sub check_alter {
return;
}
# Returns 1 when any entry in $child_tables refers back to the original
# table itself (i.e. the table has a self-referencing foreign key),
# 0 otherwise. $child_tables is an arrayref of hashrefs whose {name}
# holds the quoted `db`.`tbl` identifier.
sub _has_self_ref_fks {
    my ($orig_db, $orig_table, $child_tables) = @_;
    my $db_tbl = sprintf('`%s`.`%s`', $orig_db, $orig_table);
    return ( grep { $_->{name} eq $db_tbl } @$child_tables ) ? 1 : 0;
}
# This function tries to detect if the --alter param is adding unique indexes.
# It returns an array of arrays, having a list of fields for each unique index
# found.
@@ -12168,6 +12197,15 @@ L<"--print"> and verify that the triggers are correct.
=back
=item --[no]check-foreign-keys
default: yes
Check for self-referencing foreign keys. Currently self-referencing FKs are
not fully supported, so, to prevent errors, this program won't run if the table
has self-referencing foreign keys. Use this parameter to disable self-referencing
FK checks.
=item --check-interval
type: time; default: 1

View File

@@ -0,0 +1,27 @@
package kubectl
import (
"errors"
"os/exec"
"runtime"
)
func getKubectl() string {
switch runtime.GOOS {
case "windows":
return "kubectl.exe"
default:
return "kubectl"
}
}
// RunCmd invokes kubectl against the given namespace with the supplied
// arguments and returns its combined stdout/stderr output. On a non-zero
// exit, the combined output is returned wrapped in an error instead.
func RunCmd(namespace string, args ...string) (string, error) {
	fullArgs := append([]string{"-v=0", "--namespace", namespace}, args...)
	out, err := exec.Command(getKubectl(), fullArgs...).CombinedOutput()
	if err != nil {
		return "", errors.New(string(out))
	}
	return string(out), nil
}

View File

@@ -0,0 +1,92 @@
package main
import (
"flag"
"fmt"
"log"
"time"
"github.com/percona/percona-toolkit/src/go/pt-k8s-pxc-recovery/recover"
)
// stepOrError aborts the recovery run via log.Fatal when a step failed;
// a nil error is a no-op.
func stepOrError(err error) {
	if err == nil {
		return
	}
	log.Fatal("Error:", err)
}
func main() {
namespace, clusterName, debugImage := "", "", ""
flag.StringVar(&namespace, "namespace", "default", "Select the namespace in which the cluster is deployed in")
flag.StringVar(&clusterName, "cluster", "test-cluster", "Select the cluster to recover")
flag.StringVar(&debugImage, "debug-image", "percona/percona-xtradb-cluster:8.0.19-10.1-debug", "Name and version of the debug image to use")
flag.Parse()
c := recover.Cluster{Namespace: namespace, Name: clusterName}
log.SetPrefix("\n" + log.Prefix())
log.Printf("Starting recovery process")
go func() {
for {
time.Sleep(300 * time.Millisecond)
fmt.Print(".")
}
}()
log.Printf("Getting cluster size")
stepOrError(c.SetClusterSize())
log.Printf("Getting cluster image")
clusterImage, err := c.GetClusterImage()
stepOrError(err)
log.Printf("Confirming crashed status")
stepOrError(c.ConfirmCrashedStatus())
log.Printf("Patching cluster image")
stepOrError(c.PatchClusterImage(debugImage))
log.Printf("Restarting pods")
stepOrError(c.RestartPods())
log.Printf("Make sure pod zero is ready")
stepOrError(c.PodZeroReady())
log.Printf("Make sure all pods are running")
stepOrError(c.AllPodsRunning())
log.Print("Set SST in progress")
stepOrError(c.SetSSTInProgress())
log.Print("Waiting for all pods to be ready")
stepOrError(c.AllPodsReady())
log.Printf("Finding the most recent pod")
stepOrError(c.FindMostRecentPod())
log.Printf("Recovering most recent pod")
go func() {
err := c.RecoverMostRecentPod()
if err != nil {
log.Printf("Recovering most recent pod still in progress")
}
}()
time.Sleep(10 * time.Second)
log.Printf("Patching cluster image")
stepOrError(c.PatchClusterImage(clusterImage))
log.Printf("Restart all pods execpt most recent pod")
stepOrError(c.RestartAllPodsExceptMostRecent())
log.Printf("Make sure all pods are running")
stepOrError(c.AllPodsRunning())
log.Printf("Restart Most Recent Pod")
stepOrError(c.RestartMostRecentPod())
log.Print("Waiting for all pods to be ready")
stepOrError(c.AllPodsReady())
log.Printf("Completed the restore process")
}

View File

@@ -0,0 +1,326 @@
package recover
import (
"fmt"
"regexp"
"strconv"
"strings"
"time"
"github.com/percona/percona-toolkit/src/go/pt-k8s-pxc-recovery/kubectl"
)
// Cluster identifies the PXC cluster being recovered and carries the
// state that the recovery steps share.
type Cluster struct {
// Name is the pxc custom-resource name; pod names derive from it.
Name string
// Size is the declared cluster size (spec.pxc.size).
Size int
// MostRecentPod is the pod with the highest grastate.dat seqno,
// filled in by FindMostRecentPod.
MostRecentPod string
// Namespace is the Kubernetes namespace the cluster runs in.
Namespace string
}
// SetClusterSize reads spec.pxc.size from the pxc resource via kubectl
// and stores the parsed value in c.Size.
func (c *Cluster) SetClusterSize() error {
	out, err := kubectl.RunCmd(c.Namespace,
		"get", "pxc", c.Name, "-o", "jsonpath='{.spec.pxc.size}'")
	if err != nil {
		return err
	}
	// jsonpath output arrives wrapped in single quotes.
	size, err := strconv.Atoi(strings.Trim(out, "'"))
	if err != nil {
		return fmt.Errorf("error getting cluster size, %s", err)
	}
	c.Size = size
	return nil
}
// GetClusterImage returns the image of the first container of pod
// <cluster>-pxc-0, with the surrounding jsonpath quotes stripped.
func (c *Cluster) GetClusterImage() (string, error) {
	out, err := kubectl.RunCmd(c.Namespace,
		"get", "pod", c.Name+"-pxc-0", "-o", "jsonpath='{.spec.containers[0].image}'")
	if err != nil {
		return "", fmt.Errorf("Error getting cluster image %s", err)
	}
	return strings.Trim(out, "'"), nil
}
// getPods lists all pod names in the namespace and keeps only those that
// look like PXC pods of this cluster (name contains both the cluster
// name and "pxc").
func (c *Cluster) getPods() ([]string, error) {
	out, err := kubectl.RunCmd(c.Namespace,
		"get", "pods", "--no-headers", "-o", "custom-columns=:metadata.name")
	if err != nil {
		return []string{}, err
	}
	pods := []string{}
	for _, name := range strings.Split(out, "\n") {
		if strings.Contains(name, c.Name) && strings.Contains(name, "pxc") {
			pods = append(pods, name)
		}
	}
	return pods, nil
}
// ConfirmCrashedStatus scans the logs of every PXC pod and aborts the
// recovery unless each pod shows evidence of an unsafe-to-bootstrap
// crash.
// NOTE(review): a pod is treated as healthy only when its logs contain
// NONE of the three crash markers (AND of negations). Confirm that is
// intended, as opposed to requiring all three markers to be present.
func (c *Cluster) ConfirmCrashedStatus() error {
podNames, err := c.getPods()
if err != nil {
return fmt.Errorf("Error getting pods : %s", err)
}
for _, pod := range podNames {
logs, err := kubectl.RunCmd(c.Namespace, "logs", pod)
if err != nil {
return fmt.Errorf("error confirming crashed cluster status %s", err)
}
if !strings.Contains(logs, "grastate.dat") && !strings.Contains(logs, "safe_to_bootstrap") &&
!strings.Contains(logs, "It may not be safe to bootstrap the cluster from this node") {
return fmt.Errorf("found one or more pods in healthy state, can't use recovery tool, please restart failed pods manually")
}
}
return nil
}
// PatchClusterImage merge-patches spec.pxc.image on the pxc resource to
// the given image.
//
// Bug fix: the previous version returned fmt.Errorf unconditionally, so
// the method reported an error even when the patch succeeded (wrapping a
// nil err). It now returns nil on success.
func (c *Cluster) PatchClusterImage(image string) error {
	args := []string{
		"patch",
		"pxc",
		c.Name,
		"--type=merge",
		`--patch={"spec":{"pxc":{"image":"` + image + `"}}}`,
	}
	if _, err := kubectl.RunCmd(c.Namespace, args...); err != nil {
		return fmt.Errorf("error patching cluster image: %s", err)
	}
	return nil
}
// RestartPods force-deletes every PXC pod so the stateful set recreates
// them. Errors mentioning "pods" ... "not found" are tolerated, since a
// pod may already be gone by the time the delete is issued.
func (c *Cluster) RestartPods() error {
	pods, err := c.getPods()
	if err != nil {
		return fmt.Errorf("error getting pods to restart pods: %s", err)
	}
	for _, pod := range pods {
		_, delErr := kubectl.RunCmd(c.Namespace,
			"delete", "pod", pod, "--force", "--grace-period=0")
		if delErr != nil && !strings.Contains(delErr.Error(), "pods") && !strings.Contains(delErr.Error(), "not found") {
			return fmt.Errorf("error restarting pods: %s", delErr)
		}
	}
	return nil
}
// CheckPodReady reports whether the first container of the named pod is
// marked ready in its status.
func (c *Cluster) CheckPodReady(podName string) (bool, error) {
	out, err := kubectl.RunCmd(c.Namespace,
		"get", "pod", podName, "-o", "jsonpath='{.status.containerStatuses[0].ready}'")
	if err != nil {
		return false, fmt.Errorf("error checking pod ready: %s", err)
	}
	return strings.Trim(out, "'") == "true", nil
}
// PodZeroReady blocks until the first PXC pod reports ready, polling
// every 10 seconds.
//
// Robustness fix: the previous version indexed podNames[0] without
// checking the slice length and panicked when no PXC pods were listed;
// that case now returns an explicit error.
func (c *Cluster) PodZeroReady() error {
	podNames, err := c.getPods()
	if err != nil {
		return err
	}
	if len(podNames) == 0 {
		return fmt.Errorf("error waiting for pod zero: no pxc pods found")
	}
	podZeroStatus := false
	for !podZeroStatus {
		time.Sleep(time.Second * 10)
		podZeroStatus, err = c.CheckPodReady(podNames[0])
		if err != nil {
			return err
		}
	}
	return nil
}
// CheckPodPhase reports whether the named pod's status.phase equals the
// given phase string (e.g. "Running").
func (c *Cluster) CheckPodPhase(podName string, phase string) (bool, error) {
	out, err := kubectl.RunCmd(c.Namespace,
		"get", "pod", podName, "-o", "jsonpath='{.status.phase}'")
	if err != nil {
		return false, fmt.Errorf("error checking pod phase: %s", err)
	}
	return strings.Trim(out, "'") == phase, nil
}
// AllPodsRunning waits until every PXC pod reports phase "Running",
// polling each pod every 10 seconds. "NotFound" errors are retried,
// because a deleted pod may not have been recreated yet.
func (c *Cluster) AllPodsRunning() error {
	pods, err := c.getPods()
	if err != nil {
		return err
	}
	for _, pod := range pods {
		for running := false; !running; {
			time.Sleep(time.Second * 10)
			var phaseErr error
			running, phaseErr = c.CheckPodPhase(pod, "Running")
			if phaseErr != nil && !strings.Contains(phaseErr.Error(), "NotFound") {
				return phaseErr
			}
		}
	}
	return nil
}
// RunCommandInPod executes the given command inside the named pod via
// "kubectl exec" and returns the command's output.
func (c *Cluster) RunCommandInPod(podName string, cmd ...string) (string, error) {
	execArgs := append([]string{"exec", podName, "--"}, cmd...)
	out, err := kubectl.RunCmd(c.Namespace, execArgs...)
	if err != nil {
		return "", err
	}
	return out, nil
}
// SetSSTInProgress creates the /var/lib/mysql/sst_in_progress marker
// file inside every PXC pod.
//
// Bug fix: the error wrapping previously passed err to fmt.Errorf with
// no formatting verb (a go vet error that also dropped the cause from
// the message); it is now formatted with %s.
func (c *Cluster) SetSSTInProgress() error {
	podNames, err := c.getPods()
	if err != nil {
		return err
	}
	for _, podName := range podNames {
		_, err := c.RunCommandInPod(podName, "touch", "/var/lib/mysql/sst_in_progress")
		if err != nil {
			return fmt.Errorf("error setting sst in progress: %s", err)
		}
	}
	return nil
}
// AllPodsReady blocks until every PXC pod reports ready, polling each
// one every 10 seconds.
func (c *Cluster) AllPodsReady() error {
	pods, err := c.getPods()
	if err != nil {
		return err
	}
	for _, pod := range pods {
		for ready := false; !ready; {
			time.Sleep(time.Second * 10)
			ready, err = c.CheckPodReady(pod)
			if err != nil {
				return err
			}
		}
	}
	return nil
}
// FindMostRecentPod reads /var/lib/mysql/grastate.dat in every PXC pod,
// extracts the "seqno:" value, and records the pod with the highest
// seqno in c.MostRecentPod.
//
// Idiom fix: match[1] is already a string; the redundant string()
// conversion is dropped.
// NOTE(review): seqNo starts at 0, so a pod is only selected when some
// seqno is > 0 (the \d pattern also cannot match a negative seqno);
// if none qualifies, MostRecentPod is left empty — confirm the callers
// tolerate that.
func (c *Cluster) FindMostRecentPod() error {
	podNames, err := c.getPods()
	if err != nil {
		return err
	}
	var recentPodName string
	seqNo := 0
	re := regexp.MustCompile(`(?m)seqno:\s*(\d*)`)
	for _, podName := range podNames {
		output, err := c.RunCommandInPod(podName, "cat", "/var/lib/mysql/grastate.dat")
		if err != nil {
			return err
		}
		match := re.FindStringSubmatch(output)
		if len(match) < 2 {
			return fmt.Errorf("error finding the most recent pod : unable to get seqno")
		}
		currentSeqNo, err := strconv.Atoi(match[1])
		if err != nil {
			return err
		}
		if currentSeqNo > seqNo {
			seqNo = currentSeqNo
			recentPodName = podName
		}
	}
	c.MostRecentPod = recentPodName
	return nil
}
// RecoverMostRecentPod bootstraps the cluster from the pod recorded in
// c.MostRecentPod: it runs wsrep recovery, marks the node safe to
// bootstrap, points wsrep_cluster_address at an empty gcomm:// list,
// and finally starts mysqld.
// NOTE(review): the final mysqld invocation blocks while the server
// runs, which is why main() invokes this method in a goroutine.
func (c *Cluster) RecoverMostRecentPod() error {
_, err := c.RunCommandInPod(c.MostRecentPod, "mysqld", "--wsrep_recover")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
// Flip safe_to_bootstrap so this node may form a new cluster.
_, err = c.RunCommandInPod(c.MostRecentPod, "bash", "-c", "sed -i 's/safe_to_bootstrap: 0/safe_to_bootstrap: 1/g' /var/lib/mysql/grastate.dat")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
// Empty gcomm:// address makes this node bootstrap rather than join.
_, err = c.RunCommandInPod(c.MostRecentPod, "bash", "-c", "sed -i 's/wsrep_cluster_address=.*/wsrep_cluster_address=gcomm:\\/\\//g' /etc/mysql/node.cnf")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
// Start the server; this call does not return while mysqld runs.
_, err = c.RunCommandInPod(c.MostRecentPod, "mysqld")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
return nil
}
// RestartAllPodsExceptMostRecent force-deletes every PXC pod other than
// the bootstrap pod recorded in c.MostRecentPod.
func (c *Cluster) RestartAllPodsExceptMostRecent() error {
	pods, err := c.getPods()
	if err != nil {
		return err
	}
	for _, pod := range pods {
		if pod == c.MostRecentPod {
			continue
		}
		_, delErr := kubectl.RunCmd(c.Namespace,
			"delete", "pod", pod, "--force", "--grace-period=0")
		if delErr != nil {
			return fmt.Errorf("error restarting pods : %s", delErr)
		}
	}
	return nil
}
// RestartMostRecentPod force-deletes the pod recorded in c.MostRecentPod
// so the stateful set recreates it.
func (c *Cluster) RestartMostRecentPod() error {
	_, err := kubectl.RunCmd(c.Namespace,
		"delete", "pod", c.MostRecentPod, "--force", "--grace-period=0")
	if err != nil {
		return fmt.Errorf("error restarting most recent pod : %s", err)
	}
	return nil
}

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env perl

# PT-1853: pt-online-schema-change must refuse to run against a table
# with a self-referencing foreign key unless --no-check-foreign-keys is
# given, in which case it must succeed.
#
# Fixes in this revision:
#  - The second assertion used isnt(), contradicting its own description
#    ("-> exit status = 0"); with --no-check-foreign-keys the tool must
#    exit 0, so it is now is().
#  - done_testing() was removed: it conflicts with the explicit
#    "plan tests => 3" declared below.

BEGIN {
    die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
        unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
    unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
};

use strict;
use warnings FATAL => 'all';
use threads;
use threads::shared;
use Thread::Semaphore;
use English qw(-no_match_vars);
use Test::More;
use Data::Dumper;

use PerconaTest;
use Sandbox;
use SqlModes;
use File::Temp qw/ tempdir /;

require "$trunk/bin/pt-online-schema-change";

plan tests => 3;

my $dp         = new DSNParser(opts=>$dsn_opts);
my $sb         = new Sandbox(basedir => '/tmp', DSNParser => $dp);
my $master_dbh = $sb->get_dbh_for("master");
my $master_dsn = $sb->dsn_for("master");

# The sandbox servers run with lock_wait_timeout=3 and it's not dynamic
# so we need to specify --set-vars innodb_lock_wait_timeout=3 else the
# tool will die.
my @args = (qw(--set-vars innodb_lock_wait_timeout=3));
my $output;
my $exit_status;

$sb->load_file('master', "t/pt-online-schema-change/samples/pt-1853.sql");

# Without --no-check-foreign-keys the tool must detect the
# self-referencing FK and refuse to run.
($output, $exit_status) = full_output(
    sub { pt_online_schema_change::main(@args, "$master_dsn,D=test,t=jointit",
        '--execute',
        '--alter', "engine=innodb",
        '--alter-foreign-keys-method', 'rebuild_constraints'
        ),
    },
    stderr => 1,
);

isnt(
    $exit_status,
    0,
    "PT-1853, there are self-referencing FKs -> exit status != 0",
);

# With --no-check-foreign-keys the tool must run to completion.
($output, $exit_status) = full_output(
    sub { pt_online_schema_change::main(@args, "$master_dsn,D=test,t=jointit",
        '--execute',
        '--alter', "engine=innodb",
        '--alter-foreign-keys-method', 'rebuild_constraints',
        '--no-check-foreign-keys'
        ),
    },
    stderr => 1,
);

is(
    $exit_status,
    0,
    "PT-1853, there are self-referencing FKs but --no-check-foreign-keys was specified -> exit status = 0",
);

# #############################################################################
# Done.
# #############################################################################
$sb->wipe_clean($master_dbh);
ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");

View File

@@ -64,9 +64,9 @@ my $constraints = $master_dbh->selectall_arrayref($query);
is_deeply(
$constraints,
[
['person', '_fk_testId'],
['test_table', '_fk_person'],
['test_table', '__fk_refId'],
['person', 'fk_testId'],
['test_table', 'fk_person'],
['test_table', 'fk_refId'],
],
"First run adds or removes underscore from constraint names, accordingly"
);
@@ -94,9 +94,9 @@ $constraints = $master_dbh->selectall_arrayref($query);
is_deeply(
$constraints,
[
['person', '__fk_testId'],
['test_table', '_fk_refId'],
['test_table', '__fk_person'],
['person', 'fk_testId'],
['test_table', 'fk_person'],
['test_table', 'fk_refId'],
],
"Second run self-referencing will be one due to rebuild_constraints"
);

View File

@@ -60,13 +60,14 @@ my $query = <<"END";
ORDER BY TABLE_NAME, CONSTRAINT_NAME
END
my $constraints = $master_dbh->selectall_arrayref($query);
my @constraints = sort { @$a[0].@$a[1] cmp @$b[0].@$b[1] } @$constraints;
is_deeply(
$constraints,
[
['person', '_fk_testId'],
['test_table', '_fk_person'],
['test_table', '__fk_refId'],
['person', 'fk_testId'],
['test_table', 'fk_person'],
['test_table', 'fk_refId'],
],
"First run adds or removes underscore from constraint names, accordingly"
);
@@ -90,13 +91,14 @@ ORDER BY TABLE_NAME, CONSTRAINT_NAME
END
$constraints = $master_dbh->selectall_arrayref($query);
@constraints = sort { @$a[0].@$a[1] cmp @$b[0].@$b[1] } @$constraints;
is_deeply(
$constraints,
\@constraints,
[
['person', '__fk_testId'],
['test_table', '_fk_refId'],
['test_table', '__fk_person'],
['person', 'fk_testId'],
['test_table', 'fk_person'],
['test_table', 'fk_refId'],
],
"Second run self-referencing will be one due to rebuild_constraints"
);

View File

@@ -0,0 +1,19 @@
DROP DATABASE IF EXISTS test;
CREATE DATABASE test;
USE test;
CREATE TABLE t1 (
id int,
f1 int
);
CREATE TABLE `joinit` (
`i` int(11) NOT NULL AUTO_INCREMENT,
`s` varchar(64) DEFAULT NULL,
`t` time NOT NULL,
`g` int(11) NOT NULL,
`j` int(11) NOT NULL DEFAULT 1,
PRIMARY KEY (`i`))
ENGINE=InnoDB;
ALTER TABLE joinit ADD FOREIGN KEY i_fk (j) REFERENCES joinit (i) ON UPDATE cascade ON DELETE restrict;