mirror of
https://github.com/percona/percona-toolkit.git
synced 2025-09-03 19:15:54 +00:00
Compare commits
16 Commits
pmm-2.18.0
...
PT-1867
Author | SHA1 | Date | |
---|---|---|---|
![]() |
c07d9e9f9e | ||
![]() |
9fbb1c0352 | ||
![]() |
e2dab351c6 | ||
![]() |
20bacb32f4 | ||
![]() |
e0ae594493 | ||
![]() |
7fb4cfaa6c | ||
![]() |
5379899d8c | ||
![]() |
7c07edbd13 | ||
![]() |
c635b3eff7 | ||
![]() |
e5df960bf4 | ||
![]() |
15f400bd52 | ||
![]() |
2a9f4e4cda | ||
![]() |
6803ed064e | ||
![]() |
e475428acf | ||
![]() |
e6dc63c68b | ||
![]() |
7016982726 |
27
src/go/pt-k8s-pxc-recovery/kubectl/kubectl.go
Normal file
27
src/go/pt-k8s-pxc-recovery/kubectl/kubectl.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package kubectl
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
)
|
||||
|
||||
func getKubectl() string {
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
return "kubectl.exe"
|
||||
default:
|
||||
return "kubectl"
|
||||
}
|
||||
}
|
||||
|
||||
func RunCmd(namespace string, args ...string) (string, error) {
|
||||
args = append([]string{"-v=0", "--namespace", namespace}, args...)
|
||||
cmd := exec.Command(getKubectl(), args...)
|
||||
stdouterr, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return "", errors.New(string(stdouterr))
|
||||
}
|
||||
output := string(stdouterr)
|
||||
return output, nil
|
||||
}
|
92
src/go/pt-k8s-pxc-recovery/main.go
Normal file
92
src/go/pt-k8s-pxc-recovery/main.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/percona/percona-toolkit/src/go/pt-k8s-pxc-recovery/recover"
|
||||
)
|
||||
|
||||
func stepOrError(err error) {
|
||||
if err != nil {
|
||||
log.Fatal("Error:", err)
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
namespace, clusterName, debugImage := "", "", ""
|
||||
flag.StringVar(&namespace, "namespace", "default", "Select the namespace in which the cluster is deployed in")
|
||||
flag.StringVar(&clusterName, "cluster", "test-cluster", "Select the cluster to recover")
|
||||
flag.StringVar(&debugImage, "debug-image", "percona/percona-xtradb-cluster:8.0.19-10.1-debug", "Name and version of the debug image to use")
|
||||
flag.Parse()
|
||||
c := recover.Cluster{Namespace: namespace, Name: clusterName}
|
||||
log.SetPrefix("\n" + log.Prefix())
|
||||
|
||||
log.Printf("Starting recovery process")
|
||||
go func() {
|
||||
for {
|
||||
time.Sleep(300 * time.Millisecond)
|
||||
fmt.Print(".")
|
||||
}
|
||||
}()
|
||||
|
||||
log.Printf("Getting cluster size")
|
||||
stepOrError(c.SetClusterSize())
|
||||
|
||||
log.Printf("Getting cluster image")
|
||||
clusterImage, err := c.GetClusterImage()
|
||||
stepOrError(err)
|
||||
|
||||
log.Printf("Confirming crashed status")
|
||||
stepOrError(c.ConfirmCrashedStatus())
|
||||
|
||||
log.Printf("Patching cluster image")
|
||||
stepOrError(c.PatchClusterImage(debugImage))
|
||||
|
||||
log.Printf("Restarting pods")
|
||||
stepOrError(c.RestartPods())
|
||||
|
||||
log.Printf("Make sure pod zero is ready")
|
||||
stepOrError(c.PodZeroReady())
|
||||
|
||||
log.Printf("Make sure all pods are running")
|
||||
stepOrError(c.AllPodsRunning())
|
||||
|
||||
log.Print("Set SST in progress")
|
||||
stepOrError(c.SetSSTInProgress())
|
||||
|
||||
log.Print("Waiting for all pods to be ready")
|
||||
stepOrError(c.AllPodsReady())
|
||||
|
||||
log.Printf("Finding the most recent pod")
|
||||
stepOrError(c.FindMostRecentPod())
|
||||
|
||||
log.Printf("Recovering most recent pod")
|
||||
go func() {
|
||||
err := c.RecoverMostRecentPod()
|
||||
if err != nil {
|
||||
log.Printf("Recovering most recent pod still in progress")
|
||||
}
|
||||
}()
|
||||
|
||||
time.Sleep(10 * time.Second)
|
||||
|
||||
log.Printf("Patching cluster image")
|
||||
stepOrError(c.PatchClusterImage(clusterImage))
|
||||
|
||||
log.Printf("Restart all pods execpt most recent pod")
|
||||
stepOrError(c.RestartAllPodsExceptMostRecent())
|
||||
|
||||
log.Printf("Make sure all pods are running")
|
||||
stepOrError(c.AllPodsRunning())
|
||||
|
||||
log.Printf("Restart Most Recent Pod")
|
||||
stepOrError(c.RestartMostRecentPod())
|
||||
|
||||
log.Print("Waiting for all pods to be ready")
|
||||
stepOrError(c.AllPodsReady())
|
||||
|
||||
log.Printf("Completed the restore process")
|
||||
}
|
326
src/go/pt-k8s-pxc-recovery/recover/recover.go
Normal file
326
src/go/pt-k8s-pxc-recovery/recover/recover.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package recover
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/percona/percona-toolkit/src/go/pt-k8s-pxc-recovery/kubectl"
|
||||
)
|
||||
|
||||
type Cluster struct {
|
||||
Name string
|
||||
Size int
|
||||
MostRecentPod string
|
||||
Namespace string
|
||||
}
|
||||
|
||||
func (c *Cluster) SetClusterSize() error {
|
||||
args := []string{
|
||||
"get",
|
||||
"pxc",
|
||||
c.Name,
|
||||
"-o",
|
||||
"jsonpath='{.spec.pxc.size}'",
|
||||
}
|
||||
strSize, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
strSize = strings.Trim(strSize, "'")
|
||||
c.Size, err = strconv.Atoi(strSize)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting cluster size, %s", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) GetClusterImage() (string, error) {
|
||||
args := []string{
|
||||
"get",
|
||||
"pod",
|
||||
c.Name + "-pxc-0",
|
||||
"-o",
|
||||
"jsonpath='{.spec.containers[0].image}'",
|
||||
}
|
||||
clusterImage, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("Error getting cluster image %s", err)
|
||||
}
|
||||
clusterImage = strings.Trim(clusterImage, "'")
|
||||
return clusterImage, nil
|
||||
}
|
||||
|
||||
func (c *Cluster) getPods() ([]string, error) {
|
||||
args := []string{
|
||||
"get",
|
||||
"pods",
|
||||
"--no-headers",
|
||||
"-o",
|
||||
"custom-columns=:metadata.name",
|
||||
}
|
||||
out, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return []string{}, err
|
||||
}
|
||||
formatedOutput := strings.Split(out, "\n")
|
||||
podNames := []string{}
|
||||
for _, podName := range formatedOutput {
|
||||
if strings.Contains(podName, c.Name) && strings.Contains(podName, "pxc") {
|
||||
podNames = append(podNames, podName)
|
||||
}
|
||||
}
|
||||
return podNames, nil
|
||||
}
|
||||
|
||||
func (c *Cluster) ConfirmCrashedStatus() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return fmt.Errorf("Error getting pods : %s", err)
|
||||
}
|
||||
for _, pod := range podNames {
|
||||
logs, err := kubectl.RunCmd(c.Namespace, "logs", pod)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error confirming crashed cluster status %s", err)
|
||||
}
|
||||
if !strings.Contains(logs, "grastate.dat") && !strings.Contains(logs, "safe_to_bootstrap") &&
|
||||
!strings.Contains(logs, "It may not be safe to bootstrap the cluster from this node") {
|
||||
return fmt.Errorf("found one or more pods in healthy state, can't use recovery tool, please restart failed pods manually")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) PatchClusterImage(image string) error {
|
||||
args := []string{
|
||||
"patch",
|
||||
"pxc",
|
||||
c.Name,
|
||||
"--type=merge",
|
||||
`--patch={"spec":{"pxc":{"image":"` + image + `"}}}`,
|
||||
}
|
||||
_, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
return fmt.Errorf("error patching cluster image: %s", err)
|
||||
}
|
||||
|
||||
func (c *Cluster) RestartPods() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting pods to restart pods: %s", err)
|
||||
}
|
||||
for _, podName := range podNames {
|
||||
args := []string{
|
||||
"delete",
|
||||
"pod",
|
||||
podName,
|
||||
"--force",
|
||||
"--grace-period=0",
|
||||
}
|
||||
_, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil && !strings.Contains(err.Error(), "pods") && !strings.Contains(err.Error(), "not found") {
|
||||
return fmt.Errorf("error restarting pods: %s", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) CheckPodReady(podName string) (bool, error) {
|
||||
args := []string{
|
||||
"get",
|
||||
"pod",
|
||||
podName,
|
||||
"-o",
|
||||
"jsonpath='{.status.containerStatuses[0].ready}'",
|
||||
}
|
||||
output, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("error checking pod ready: %s", err)
|
||||
}
|
||||
return strings.Trim(output, "'") == "true", nil
|
||||
}
|
||||
|
||||
func (c *Cluster) PodZeroReady() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
podZeroStatus := false
|
||||
for !podZeroStatus {
|
||||
time.Sleep(time.Second * 10)
|
||||
podZeroStatus, err = c.CheckPodReady(podNames[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) CheckPodPhase(podName string, phase string) (bool, error) {
|
||||
args := []string{
|
||||
"get",
|
||||
"pod",
|
||||
podName,
|
||||
"-o",
|
||||
"jsonpath='{.status.phase}'",
|
||||
}
|
||||
output, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("error checking pod phase: %s", err)
|
||||
}
|
||||
return strings.Trim(output, "'") == phase, nil
|
||||
}
|
||||
|
||||
func (c *Cluster) AllPodsRunning() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, podName := range podNames {
|
||||
running := false
|
||||
var err error
|
||||
for !running {
|
||||
time.Sleep(time.Second * 10)
|
||||
running, err = c.CheckPodPhase(podName, "Running")
|
||||
if err != nil && !strings.Contains(err.Error(), "NotFound") {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) RunCommandInPod(podName string, cmd ...string) (string, error) {
|
||||
args := []string{
|
||||
"exec",
|
||||
podName,
|
||||
"--",
|
||||
}
|
||||
args = append(args, cmd...)
|
||||
output, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return output, nil
|
||||
}
|
||||
|
||||
func (c *Cluster) SetSSTInProgress() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, podName := range podNames {
|
||||
_, err := c.RunCommandInPod(podName, "touch", "/var/lib/mysql/sst_in_progress")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error setting sst in progress", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) AllPodsReady() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, podName := range podNames {
|
||||
podReadyStatus := false
|
||||
for !podReadyStatus {
|
||||
time.Sleep(time.Second * 10)
|
||||
podReadyStatus, err = c.CheckPodReady(podName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) FindMostRecentPod() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var recentPodName string
|
||||
seqNo := 0
|
||||
re := regexp.MustCompile(`(?m)seqno:\s*(\d*)`)
|
||||
for _, podName := range podNames {
|
||||
output, err := c.RunCommandInPod(podName, "cat", "/var/lib/mysql/grastate.dat")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
match := re.FindStringSubmatch(output)
|
||||
if len(match) < 2 {
|
||||
return fmt.Errorf("error finding the most recent pod : unable to get seqno")
|
||||
}
|
||||
currentSeqNo, err := strconv.Atoi(string(match[1]))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if currentSeqNo > seqNo {
|
||||
seqNo = currentSeqNo
|
||||
recentPodName = podName
|
||||
}
|
||||
}
|
||||
c.MostRecentPod = recentPodName
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) RecoverMostRecentPod() error {
|
||||
_, err := c.RunCommandInPod(c.MostRecentPod, "mysqld", "--wsrep_recover")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error recovering most recent pod: %s", err)
|
||||
}
|
||||
_, err = c.RunCommandInPod(c.MostRecentPod, "bash", "-c", "sed -i 's/safe_to_bootstrap: 0/safe_to_bootstrap: 1/g' /var/lib/mysql/grastate.dat")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error recovering most recent pod: %s", err)
|
||||
}
|
||||
_, err = c.RunCommandInPod(c.MostRecentPod, "bash", "-c", "sed -i 's/wsrep_cluster_address=.*/wsrep_cluster_address=gcomm:\\/\\//g' /etc/mysql/node.cnf")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error recovering most recent pod: %s", err)
|
||||
}
|
||||
_, err = c.RunCommandInPod(c.MostRecentPod, "mysqld")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error recovering most recent pod: %s", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) RestartAllPodsExceptMostRecent() error {
|
||||
podNames, err := c.getPods()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, podName := range podNames {
|
||||
if podName != c.MostRecentPod {
|
||||
args := []string{
|
||||
"delete",
|
||||
"pod",
|
||||
podName,
|
||||
"--force",
|
||||
"--grace-period=0",
|
||||
}
|
||||
_, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error restarting pods : %s", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) RestartMostRecentPod() error {
|
||||
args := []string{
|
||||
"delete",
|
||||
"pod",
|
||||
c.MostRecentPod,
|
||||
"--force",
|
||||
"--grace-period=0",
|
||||
}
|
||||
_, err := kubectl.RunCmd(c.Namespace, args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error restarting most recent pod : %s", err)
|
||||
}
|
||||
return nil
|
||||
}
|
Reference in New Issue
Block a user