Compare commits

...

16 Commits

Author SHA1 Message Date
Aly Kafoury
c07d9e9f9e update pod selection and confirming crashed state 2020-10-01 14:57:58 +02:00
Aly Kafoury
9fbb1c0352 updating error messages 2020-09-15 19:31:07 +02:00
Aly Kafoury
e2dab351c6 update get pods command 2020-09-15 19:22:30 +02:00
AlyHKafoury
20bacb32f4 Update src/go/pt-k8s-pxc-recovery/recover/recover.go
Co-authored-by: Andrew Pogrebnoy <absourd.noise@gmail.com>
2020-09-15 18:23:11 +03:00
Aly Kafoury
e0ae594493 adding cluster verification step 2020-08-24 19:27:56 +02:00
Aly Kafoury
7fb4cfaa6c adding custom debug image 2020-08-24 14:58:59 +02:00
Aly Kafoury
5379899d8c fix pr comments Aug-24-2020 2020-08-24 14:14:34 +02:00
Aly Kafoury
7c07edbd13 finishing remaining steps 2020-08-17 18:23:24 +02:00
Aly Kafoury
c635b3eff7 add cluster struct 2020-07-19 19:07:39 +02:00
AlyHKafoury
e5df960bf4 minor bug fixes 2020-07-13 13:05:39 +02:00
AlyHKafoury
15f400bd52 exec commands 2020-07-10 22:11:32 +02:00
AlyHKafoury
2a9f4e4cda find most recent pod 2020-07-09 20:04:06 +02:00
AlyHKafoury
6803ed064e waiting for all pods to be ready 2020-07-07 21:20:49 +02:00
AlyHKafoury
e475428acf wait for podzero to be ready 2020-07-06 22:12:21 +02:00
AlyHKafoury
e6dc63c68b prepartion steps 2020-07-06 21:25:47 +02:00
AlyHKafoury
7016982726 init recovery tool 2020-07-05 20:12:15 +02:00
3 changed files with 445 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
package kubectl
import (
"errors"
"os/exec"
"runtime"
)
func getKubectl() string {
switch runtime.GOOS {
case "windows":
return "kubectl.exe"
default:
return "kubectl"
}
}
func RunCmd(namespace string, args ...string) (string, error) {
args = append([]string{"-v=0", "--namespace", namespace}, args...)
cmd := exec.Command(getKubectl(), args...)
stdouterr, err := cmd.CombinedOutput()
if err != nil {
return "", errors.New(string(stdouterr))
}
output := string(stdouterr)
return output, nil
}

View File

@@ -0,0 +1,92 @@
package main
import (
"flag"
"fmt"
"log"
"time"
"github.com/percona/percona-toolkit/src/go/pt-k8s-pxc-recovery/recover"
)
func stepOrError(err error) {
if err != nil {
log.Fatal("Error:", err)
}
}
func main() {
namespace, clusterName, debugImage := "", "", ""
flag.StringVar(&namespace, "namespace", "default", "Select the namespace in which the cluster is deployed in")
flag.StringVar(&clusterName, "cluster", "test-cluster", "Select the cluster to recover")
flag.StringVar(&debugImage, "debug-image", "percona/percona-xtradb-cluster:8.0.19-10.1-debug", "Name and version of the debug image to use")
flag.Parse()
c := recover.Cluster{Namespace: namespace, Name: clusterName}
log.SetPrefix("\n" + log.Prefix())
log.Printf("Starting recovery process")
go func() {
for {
time.Sleep(300 * time.Millisecond)
fmt.Print(".")
}
}()
log.Printf("Getting cluster size")
stepOrError(c.SetClusterSize())
log.Printf("Getting cluster image")
clusterImage, err := c.GetClusterImage()
stepOrError(err)
log.Printf("Confirming crashed status")
stepOrError(c.ConfirmCrashedStatus())
log.Printf("Patching cluster image")
stepOrError(c.PatchClusterImage(debugImage))
log.Printf("Restarting pods")
stepOrError(c.RestartPods())
log.Printf("Make sure pod zero is ready")
stepOrError(c.PodZeroReady())
log.Printf("Make sure all pods are running")
stepOrError(c.AllPodsRunning())
log.Print("Set SST in progress")
stepOrError(c.SetSSTInProgress())
log.Print("Waiting for all pods to be ready")
stepOrError(c.AllPodsReady())
log.Printf("Finding the most recent pod")
stepOrError(c.FindMostRecentPod())
log.Printf("Recovering most recent pod")
go func() {
err := c.RecoverMostRecentPod()
if err != nil {
log.Printf("Recovering most recent pod still in progress")
}
}()
time.Sleep(10 * time.Second)
log.Printf("Patching cluster image")
stepOrError(c.PatchClusterImage(clusterImage))
log.Printf("Restart all pods execpt most recent pod")
stepOrError(c.RestartAllPodsExceptMostRecent())
log.Printf("Make sure all pods are running")
stepOrError(c.AllPodsRunning())
log.Printf("Restart Most Recent Pod")
stepOrError(c.RestartMostRecentPod())
log.Print("Waiting for all pods to be ready")
stepOrError(c.AllPodsReady())
log.Printf("Completed the restore process")
}

View File

@@ -0,0 +1,326 @@
package recover
import (
"fmt"
"regexp"
"strconv"
"strings"
"time"
"github.com/percona/percona-toolkit/src/go/pt-k8s-pxc-recovery/kubectl"
)
type Cluster struct {
Name string
Size int
MostRecentPod string
Namespace string
}
func (c *Cluster) SetClusterSize() error {
args := []string{
"get",
"pxc",
c.Name,
"-o",
"jsonpath='{.spec.pxc.size}'",
}
strSize, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return err
}
strSize = strings.Trim(strSize, "'")
c.Size, err = strconv.Atoi(strSize)
if err != nil {
return fmt.Errorf("error getting cluster size, %s", err)
}
return nil
}
func (c *Cluster) GetClusterImage() (string, error) {
args := []string{
"get",
"pod",
c.Name + "-pxc-0",
"-o",
"jsonpath='{.spec.containers[0].image}'",
}
clusterImage, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return "", fmt.Errorf("Error getting cluster image %s", err)
}
clusterImage = strings.Trim(clusterImage, "'")
return clusterImage, nil
}
func (c *Cluster) getPods() ([]string, error) {
args := []string{
"get",
"pods",
"--no-headers",
"-o",
"custom-columns=:metadata.name",
}
out, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return []string{}, err
}
formatedOutput := strings.Split(out, "\n")
podNames := []string{}
for _, podName := range formatedOutput {
if strings.Contains(podName, c.Name) && strings.Contains(podName, "pxc") {
podNames = append(podNames, podName)
}
}
return podNames, nil
}
func (c *Cluster) ConfirmCrashedStatus() error {
podNames, err := c.getPods()
if err != nil {
return fmt.Errorf("Error getting pods : %s", err)
}
for _, pod := range podNames {
logs, err := kubectl.RunCmd(c.Namespace, "logs", pod)
if err != nil {
return fmt.Errorf("error confirming crashed cluster status %s", err)
}
if !strings.Contains(logs, "grastate.dat") && !strings.Contains(logs, "safe_to_bootstrap") &&
!strings.Contains(logs, "It may not be safe to bootstrap the cluster from this node") {
return fmt.Errorf("found one or more pods in healthy state, can't use recovery tool, please restart failed pods manually")
}
}
return nil
}
func (c *Cluster) PatchClusterImage(image string) error {
args := []string{
"patch",
"pxc",
c.Name,
"--type=merge",
`--patch={"spec":{"pxc":{"image":"` + image + `"}}}`,
}
_, err := kubectl.RunCmd(c.Namespace, args...)
return fmt.Errorf("error patching cluster image: %s", err)
}
func (c *Cluster) RestartPods() error {
podNames, err := c.getPods()
if err != nil {
return fmt.Errorf("error getting pods to restart pods: %s", err)
}
for _, podName := range podNames {
args := []string{
"delete",
"pod",
podName,
"--force",
"--grace-period=0",
}
_, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil && !strings.Contains(err.Error(), "pods") && !strings.Contains(err.Error(), "not found") {
return fmt.Errorf("error restarting pods: %s", err)
}
}
return nil
}
func (c *Cluster) CheckPodReady(podName string) (bool, error) {
args := []string{
"get",
"pod",
podName,
"-o",
"jsonpath='{.status.containerStatuses[0].ready}'",
}
output, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return false, fmt.Errorf("error checking pod ready: %s", err)
}
return strings.Trim(output, "'") == "true", nil
}
func (c *Cluster) PodZeroReady() error {
podNames, err := c.getPods()
if err != nil {
return err
}
podZeroStatus := false
for !podZeroStatus {
time.Sleep(time.Second * 10)
podZeroStatus, err = c.CheckPodReady(podNames[0])
if err != nil {
return err
}
}
return nil
}
func (c *Cluster) CheckPodPhase(podName string, phase string) (bool, error) {
args := []string{
"get",
"pod",
podName,
"-o",
"jsonpath='{.status.phase}'",
}
output, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return false, fmt.Errorf("error checking pod phase: %s", err)
}
return strings.Trim(output, "'") == phase, nil
}
func (c *Cluster) AllPodsRunning() error {
podNames, err := c.getPods()
if err != nil {
return err
}
for _, podName := range podNames {
running := false
var err error
for !running {
time.Sleep(time.Second * 10)
running, err = c.CheckPodPhase(podName, "Running")
if err != nil && !strings.Contains(err.Error(), "NotFound") {
return err
}
}
}
return nil
}
func (c *Cluster) RunCommandInPod(podName string, cmd ...string) (string, error) {
args := []string{
"exec",
podName,
"--",
}
args = append(args, cmd...)
output, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return "", err
}
return output, nil
}
func (c *Cluster) SetSSTInProgress() error {
podNames, err := c.getPods()
if err != nil {
return err
}
for _, podName := range podNames {
_, err := c.RunCommandInPod(podName, "touch", "/var/lib/mysql/sst_in_progress")
if err != nil {
return fmt.Errorf("error setting sst in progress", err)
}
}
return nil
}
func (c *Cluster) AllPodsReady() error {
podNames, err := c.getPods()
if err != nil {
return err
}
for _, podName := range podNames {
podReadyStatus := false
for !podReadyStatus {
time.Sleep(time.Second * 10)
podReadyStatus, err = c.CheckPodReady(podName)
if err != nil {
return err
}
}
}
return nil
}
func (c *Cluster) FindMostRecentPod() error {
podNames, err := c.getPods()
if err != nil {
return err
}
var recentPodName string
seqNo := 0
re := regexp.MustCompile(`(?m)seqno:\s*(\d*)`)
for _, podName := range podNames {
output, err := c.RunCommandInPod(podName, "cat", "/var/lib/mysql/grastate.dat")
if err != nil {
return err
}
match := re.FindStringSubmatch(output)
if len(match) < 2 {
return fmt.Errorf("error finding the most recent pod : unable to get seqno")
}
currentSeqNo, err := strconv.Atoi(string(match[1]))
if err != nil {
return err
}
if currentSeqNo > seqNo {
seqNo = currentSeqNo
recentPodName = podName
}
}
c.MostRecentPod = recentPodName
return nil
}
func (c *Cluster) RecoverMostRecentPod() error {
_, err := c.RunCommandInPod(c.MostRecentPod, "mysqld", "--wsrep_recover")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
_, err = c.RunCommandInPod(c.MostRecentPod, "bash", "-c", "sed -i 's/safe_to_bootstrap: 0/safe_to_bootstrap: 1/g' /var/lib/mysql/grastate.dat")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
_, err = c.RunCommandInPod(c.MostRecentPod, "bash", "-c", "sed -i 's/wsrep_cluster_address=.*/wsrep_cluster_address=gcomm:\\/\\//g' /etc/mysql/node.cnf")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
_, err = c.RunCommandInPod(c.MostRecentPod, "mysqld")
if err != nil {
return fmt.Errorf("error recovering most recent pod: %s", err)
}
return nil
}
func (c *Cluster) RestartAllPodsExceptMostRecent() error {
podNames, err := c.getPods()
if err != nil {
return err
}
for _, podName := range podNames {
if podName != c.MostRecentPod {
args := []string{
"delete",
"pod",
podName,
"--force",
"--grace-period=0",
}
_, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return fmt.Errorf("error restarting pods : %s", err)
}
}
}
return nil
}
func (c *Cluster) RestartMostRecentPod() error {
args := []string{
"delete",
"pod",
c.MostRecentPod,
"--force",
"--grace-period=0",
}
_, err := kubectl.RunCmd(c.Namespace, args...)
if err != nil {
return fmt.Errorf("error restarting most recent pod : %s", err)
}
return nil
}