Add debug with breakpoint onFailure to TaskRun Spec

Based on TEP-0042, this commit adds the ability for the user to debug TaskRuns with the spec.debug declaration where the user can provide breakpoints in spec.debug.breakpoint. With this commit the only breakpoint supported right now is the "onFailure" breakpoint which pauses the TaskRun during a step failure so the user can get shell access to the failed step container and debug. The following additions have been done. - Add -breakpoint_on_failure to entrypointer which disables exit of container upon failure - Add debug helper scripts to /tekton/debug/scripts which contains scripts for continuing the TaskRun and marking the step as a success or a failure by writing <n>.breakpointexit file to /tekton/tools - Add /tekton/debug/info/<n> mount which is used by helper scripts to understand which step they are running in where <n> denotes the step number eg: First step = 0, Second step = 1 and so on. - Exit code is propagated from .breakpointexit to end of TaskRun - Disable step skipError if breakpoint enabled - Add webhook validation for debug - Add Debug as an alpha feature Signed-off-by: Vincent Demeester <vdemeest@redhat.com> Signed-off-by: Vibhav Bobade <vibhav.bobde@gmail.com>
tektoncd · Jun 28, 2021 · 067aed7 · 067aed7
1 parent 005c31e
commit 067aed7
Show file tree

Hide file tree

Showing 20 changed files with 975 additions and 65 deletions.
diff --git a/cmd/entrypoint/main.go b/cmd/entrypoint/main.go
@@ -34,16 +34,36 @@ import (
 )
 
 var (
-	ep              = flag.String("entrypoint", "", "Original specified entrypoint to execute")
-	waitFiles       = flag.String("wait_file", "", "Comma-separated list of paths to wait for")
-	waitFileContent = flag.Bool("wait_file_content", false, "If specified, expect wait_file to have content")
-	postFile        = flag.String("post_file", "", "If specified, file to write upon completion")
-	terminationPath = flag.String("termination_path", "/tekton/termination", "If specified, file to write upon termination")
-	results         = flag.String("results", "", "If specified, list of file names that might contain task results")
-	timeout         = flag.Duration("timeout", time.Duration(0), "If specified, sets timeout for step")
+	ep                  = flag.String("entrypoint", "", "Original specified entrypoint to execute")
+	waitFiles           = flag.String("wait_file", "", "Comma-separated list of paths to wait for")
+	waitFileContent     = flag.Bool("wait_file_content", false, "If specified, expect wait_file to have content")
+	postFile            = flag.String("post_file", "", "If specified, file to write upon completion")
+	terminationPath     = flag.String("termination_path", "/tekton/termination", "If specified, file to write upon termination")
+	results             = flag.String("results", "", "If specified, list of file names that might contain task results")
+	timeout             = flag.Duration("timeout", time.Duration(0), "If specified, sets timeout for step")
+	breakpointOnFailure = flag.Bool("breakpoint_on_failure", false, "If specified, expect steps to not skip on failure")
 )
 
-const defaultWaitPollingInterval = time.Second
+const (
+	defaultWaitPollingInterval = time.Second
+	breakpointExitSuffix       = ".breakpointexit"
+)
+
+func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) {
+	if e.BreakpointOnFailure {
+		if waitErr := e.Waiter.Wait(breakpointExitPostFile, false, false); waitErr != nil {
+			log.Println("error occurred while waiting for " + breakpointExitPostFile)
+		}
+		// get exitcode from .breakpointexit
+		exitCode, readErr := e.BreakpointExitCode(breakpointExitPostFile)
+		// if readErr exists, the exitcode with default to 0 as we would like
+		// to encourage to continue running the next steps in the taskRun
+		if readErr != nil {
+			log.Println("error occurred while reading breakpoint exit code : " + readErr.Error())
+		}
+		os.Exit(exitCode)
+	}
+}
 
 func main() {
 	// Add credential flags originally introduced with our legacy credentials helper
@@ -75,17 +95,18 @@ func main() {
 	}
 
 	e := entrypoint.Entrypointer{
-		Entrypoint:      *ep,
-		WaitFiles:       strings.Split(*waitFiles, ","),
-		WaitFileContent: *waitFileContent,
-		PostFile:        *postFile,
-		TerminationPath: *terminationPath,
-		Args:            flag.Args(),
-		Waiter:          &realWaiter{waitPollingInterval: defaultWaitPollingInterval},
-		Runner:          &realRunner{},
-		PostWriter:      &realPostWriter{},
-		Results:         strings.Split(*results, ","),
-		Timeout:         timeout,
+		Entrypoint:          *ep,
+		WaitFiles:           strings.Split(*waitFiles, ","),
+		WaitFileContent:     *waitFileContent,
+		PostFile:            *postFile,
+		TerminationPath:     *terminationPath,
+		Args:                flag.Args(),
+		Waiter:              &realWaiter{waitPollingInterval: defaultWaitPollingInterval, breakpointOnFailure: *breakpointOnFailure},
+		Runner:              &realRunner{},
+		PostWriter:          &realPostWriter{},
+		Results:             strings.Split(*results, ","),
+		Timeout:             timeout,
+		BreakpointOnFailure: *breakpointOnFailure,
 	}
 
 	// Copy any creds injected by the controller into the $HOME directory of the current
@@ -95,6 +116,7 @@ func main() {
 	}
 
 	if err := e.Go(); err != nil {
+		breakpointExitPostFile := e.PostFile + breakpointExitSuffix
 		switch t := err.(type) {
 		case skipError:
 			log.Print("Skipping step because a previous step failed")
@@ -110,10 +132,12 @@ func main() {
 			// in both cases has an ExitStatus() method with the
 			// same signature.
 			if status, ok := t.Sys().(syscall.WaitStatus); ok {
+				checkForBreakpointOnFailure(e, breakpointExitPostFile)
 				os.Exit(status.ExitStatus())
 			}
 			log.Fatalf("Error executing command (ExitError): %v", err)
 		default:
+			checkForBreakpointOnFailure(e, breakpointExitPostFile)
 			log.Fatalf("Error executing command: %v", err)
 		}
 	}

diff --git a/cmd/entrypoint/waiter.go b/cmd/entrypoint/waiter.go
@@ -11,6 +11,7 @@ import (
 // realWaiter actually waits for files, by polling.
 type realWaiter struct {
 	waitPollingInterval time.Duration
+	breakpointOnFailure bool
 }
 
 var _ entrypoint.Waiter = (*realWaiter)(nil)
@@ -30,7 +31,7 @@ func (rw *realWaiter) setWaitPollingInterval(pollingInterval time.Duration) *rea
 //
 // If a file of the same name with a ".err" extension exists then this Wait
 // will end with a skipError.
-func (rw *realWaiter) Wait(file string, expectContent bool) error {
+func (rw *realWaiter) Wait(file string, expectContent bool, breakpointOnFailure bool) error {
 	if file == "" {
 		return nil
 	}
@@ -42,7 +43,16 @@ func (rw *realWaiter) Wait(file string, expectContent bool) error {
 		} else if !os.IsNotExist(err) {
 			return fmt.Errorf("waiting for %q: %w", file, err)
 		}
+		// When a .err file is read by this step, it means that a previous step has failed
+		// We wouldn't want this step to stop executing because the previous step failed during debug
+		// That is counterproductive to debugging
+		// Hence we disable skipError here so that the other steps in the failed taskRun can continue
+		// executing if breakpointOnFailure is enabled for the taskRun
+		// TLDR: Do not return skipError when breakpointOnFailure is enabled as it breaks execution of the TaskRun
 		if _, err := os.Stat(file + ".err"); err == nil {
+			if breakpointOnFailure {
+				return nil
+			}
 			return skipError("error file present, bail and skip the step")
 		}
 	}

diff --git a/cmd/entrypoint/waiter_test.go b/cmd/entrypoint/waiter_test.go
@@ -19,6 +19,7 @@ package main
 import (
 	"io/ioutil"
 	"os"
+	"strings"
 	"testing"
 	"time"
 )
@@ -37,7 +38,7 @@ func TestRealWaiterWaitMissingFile(t *testing.T) {
 	rw := realWaiter{}
 	doneCh := make(chan struct{})
 	go func() {
-		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false)
+		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false, false)
 		if err != nil {
 			t.Errorf("error waiting on tmp file %q", tmp.Name())
 		}
@@ -60,7 +61,7 @@ func TestRealWaiterWaitWithFile(t *testing.T) {
 	rw := realWaiter{}
 	doneCh := make(chan struct{})
 	go func() {
-		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false)
+		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), false, false)
 		if err != nil {
 			t.Errorf("error waiting on tmp file %q", tmp.Name())
 		}
@@ -83,7 +84,7 @@ func TestRealWaiterWaitMissingContent(t *testing.T) {
 	rw := realWaiter{}
 	doneCh := make(chan struct{})
 	go func() {
-		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true)
+		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true, false)
 		if err != nil {
 			t.Errorf("error waiting on tmp file %q", tmp.Name())
 		}
@@ -106,7 +107,7 @@ func TestRealWaiterWaitWithContent(t *testing.T) {
 	rw := realWaiter{}
 	doneCh := make(chan struct{})
 	go func() {
-		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true)
+		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmp.Name(), true, false)
 		if err != nil {
 			t.Errorf("error waiting on tmp file %q", tmp.Name())
 		}
@@ -122,3 +123,58 @@ func TestRealWaiterWaitWithContent(t *testing.T) {
 		t.Errorf("expected Wait() to have detected a non-zero file size by now")
 	}
 }
+
+func TestRealWaiterWaitWithErrorWaitfile(t *testing.T) {
+	tmp, err := ioutil.TempFile("", "real_waiter_test_file*.err")
+	if err != nil {
+		t.Errorf("error creating temp file: %v", err)
+	}
+	tmpFileName := strings.Replace(tmp.Name(), ".err", "", 1)
+	defer os.Remove(tmp.Name())
+	rw := realWaiter{}
+	doneCh := make(chan struct{})
+	go func() {
+		// error of type skipError is returned after encountering a error waitfile
+		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmpFileName, false, false)
+		if err == nil {
+			t.Errorf("expected skipError upon encounter error waitfile")
+		}
+		switch typ := err.(type) {
+		case skipError:
+			close(doneCh)
+		default:
+			t.Errorf("unexpected error type %T", typ)
+		}
+	}()
+	select {
+	case <-doneCh:
+		// Success
+	case <-time.After(2 * testWaitPollingInterval):
+		t.Errorf("expected Wait() to have detected a non-zero file size by now")
+	}
+}
+
+func TestRealWaiterWaitWithBreakpointOnFailure(t *testing.T) {
+	tmp, err := ioutil.TempFile("", "real_waiter_test_file*.err")
+	if err != nil {
+		t.Errorf("error creating temp file: %v", err)
+	}
+	tmpFileName := strings.Replace(tmp.Name(), ".err", "", 1)
+	defer os.Remove(tmp.Name())
+	rw := realWaiter{}
+	doneCh := make(chan struct{})
+	go func() {
+		// When breakpoint on failure is enabled skipError shouldn't be returned for a error waitfile
+		err := rw.setWaitPollingInterval(testWaitPollingInterval).Wait(tmpFileName, false, true)
+		if err != nil {
+			t.Errorf("error waiting on tmp file %q", tmp.Name())
+		}
+		close(doneCh)
+	}()
+	select {
+	case <-doneCh:
+		// Success
+	case <-time.After(2 * testWaitPollingInterval):
+		t.Errorf("expected Wait() to have detected a non-zero file size by now")
+	}
+}
diff --git a/docs/debug.md b/docs/debug.md
@@ -0,0 +1,82 @@
+<!--
+---
+linkTitle: "Debug"
+weight: 11
+---
+-->
+# Debug
+
+- [Overview](#overview)
+- [Debugging TaskRuns](#debugging-taskruns)
+  - [Adding Breakpoints](#adding-breakpoints)
+    - [Breakpoint on Failure](#breakpoint-on-failure)
+      - [Failure of a Step](#failure-of-a-step)
+      - [Halting a Step on failure](#halting-a-step-on-failure)
+      - [Exiting breakpoint](#exiting-breakpoint)
+- [Debug Environment](#debug-environment)
+  - [Mounts](#mounts)
+  - [Debug Scripts](#debug-scripts)
+
+
+## Overview
+
+`Debug` spec is used for troubleshooting and breakpointing runtime resources. This doc helps understand the inner 
+workings of debug in Tekton. Currently only the `TaskRun` resource is supported. 
+
+## Debugging TaskRuns
+
+This section explains how debugging TaskRuns works in Tekton. To understand how to use
+the debug spec for TaskRuns follow the [TaskRun Debugging Documentation](taskruns.md#debugging-a-taskrun).
+
+### Breakpoint on Failure
+
+Halting a TaskRun execution on Failure of a step.
+
+#### Failure of a Step
+
+The entrypoint binary is used to manage the lifecycle of a step. Steps are aligned beforehand by the TaskRun controller
+allowing each step to run in a particular order. This is done using `-wait_file` and the `-post_file` flags. The former 
+lets the entrypoint binary know that it has to wait on creation of a particular file before starting execution of the step.
+And the latter provides information on the step number and signals the next step on completion of the step.
+
+On success of a step, the `-post_file` is written as is, signalling the next step which would have the same argument given
+for `-wait_file` to resume the entrypoint process and move ahead with the step. 
+
+On failure of a step, the `-post_file` is written with `.err` appended to it, denoting that the previous step has failed with
+an error. The subsequent steps are skipped in this case as well, marking the TaskRun as a failure.
+
+#### Halting a Step on failure
+
+The failed step writes `<step-no>.err` to `/tekton/tools` and stops running completely. To be able to debug a step we would
+need it to continue running (not exit), not skip the next steps and signal health of the step. By disabling step skipping, 
+stopping write of the `<step-no>.err` file and waiting on a signal by the user to disable the halt, we would be simulating a 
+"breakpoint".
+
+In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step 
+environment using a CLI or an IDE. 
+
+#### Exiting breakpoint
+
+To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which 
+would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/tools`
+would unpause and exit the step container.
+
+## Debug Environment 
+
+Additional environment augmentations made available to the TaskRun Pod to aid in troubleshooting and managing step lifecycle.
+
+### Mounts
+
+`/tekton/debug/scripts` : Contains scripts which the user can run to mark the step as a success, failure or exit the breakpoint.
+Shared between all the containers.
+
+`/tekton/debug/info/<n>` : Contains information about the step. Single EmptyDir shared between all step containers, but renamed 
+to reflect step number. eg: Step 0 will have `/tekton/debug/info/0`, Step 1 will have `/tekton/debug/info/1` etc.
+
+### Debug Scripts
+
+`/tekton/debug/scripts/debug-continue` : Mark the step as completed with success by writing to `/tekton/tools`. eg: User wants to exit
+breakpoint for failed step 0. Running this script would create `/tekton/tools/0` and `/tekton/tools/0.breakpointexit`.
+
+`/tekton/debug/scripts/debug-fail-continue` : Mark the step as completed with failure by writing to `/tekton/tools`. eg: User wants to exit
+breakpoint for failed step 0. Running this script would create `/tekton/tools/0.err` and `/tekton/tools/0.breakpointexit`.
diff --git a/docs/taskruns.md b/docs/taskruns.md
@@ -23,13 +23,15 @@ weight: 300
   - [Monitoring `Steps`](#monitoring-steps)
   - [Monitoring `Results`](#monitoring-results)
 - [Cancelling a `TaskRun`](#cancelling-a-taskrun)
+- [Debugging a `TaskRun`](#debugging-a-taskrun)
 - [Events](events.md#taskruns)
 - [Running a TaskRun Hermetically](hermetic.md)
 - [Code examples](#code-examples)
   - [Example `TaskRun` with a referenced `Task`](#example-taskrun-with-a-referenced-task)
   - [Example `TaskRun` with an embedded `Task`](#example-taskrun-with-an-embedded-task)
   - [Reusing a `Task`](#reusing-a-task)
   - [Using custom `ServiceAccount` credentials](#using-custom-serviceaccount-credentials)
+  - [Running step containers as a non-root user](#running-step-containers-as-a-non-root-user)
 
 # Overview
 
@@ -447,6 +449,41 @@ spec:
   status: "TaskRunCancelled"
 ```
 
+
+### Debugging a `TaskRun`
+
+#### Breakpoint on Failure
+
+TaskRuns can be halted on failure for troubleshooting by adding the following to the TaskRun spec.
+
+```yaml
+spec:
+  debug:
+    breakpoint: ["onFailure"]
+```
+
+Upon failure of a step, the TaskRun Pod execution is halted. During this time, the user/client can get remote
+shell access to the step container with a command such as the following.
+
+```bash
+kubectl exec -it print-date-d7tj5-pod-w5qrn -c step-print-date-human-readable -- /bin/sh
+```
+
+#### Debug Environment
+
+After the user/client has access to the container environment, they can inspect it to find whatever is missing or
+misconfigured that might have caused their step to fail.
+
+To control the lifecycle of the step to mark it as a success or a failure or close the breakpoint, there are scripts
+provided in the `/tekton/debug/scripts` directory in the container. The following are the scripts and the tasks they
+perform:
+
+`debug-continue`: Mark the step as a success and exit the breakpoint.
+
+`debug-fail-continue`: Mark the step as a failure and exit the breakpoint.
+
+*More information on the inner workings of debug can be found in the [Debug documentation](debug.md)*
+
 ## Code examples
 
 To better understand `TaskRuns`, study the following code examples: