From 92c82d4ac1651e50ad8acee1674570439adeaf22 Mon Sep 17 00:00:00 2001 From: Oleg Avdeev Date: Thu, 16 Sep 2021 17:14:00 -0700 Subject: [PATCH] better error handling for evicted pods --- metaflow/plugins/aws/eks/kubernetes_client.py | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/metaflow/plugins/aws/eks/kubernetes_client.py b/metaflow/plugins/aws/eks/kubernetes_client.py index 2e7cd0a3e96..0ab7dcbbb8b 100644 --- a/metaflow/plugins/aws/eks/kubernetes_client.py +++ b/metaflow/plugins/aws/eks/kubernetes_client.py @@ -579,19 +579,35 @@ def _done(): if not _done(): # If pod status is dirty, check for newer status self._pod = self._fetch_pod() - if self._pod: - for k, v in ( - self._pod["status"] - .get("container_statuses", [{}])[0] - .get("state", {}) - .items() - ): - if v is not None: - return v.get("exit_code"), ": ".join( - filter( - None, - [v.get("reason"), v.get("message")], + try: + if self._pod: + pod_status = self._pod["status"] + if pod_status.get("container_statuses") is None: + # We're done, but no container_statuses is set + # This can happen when the pod is evicted + return None, ": ".join( + filter( + None, + [pod_status.get("reason"), pod_status.get("message")], + ) ) - ) + + for k, v in ( + pod_status + .get("container_statuses", [{}])[0] + .get("state", {}) + .items() + ): + if v is not None: + return v.get("exit_code"), ": ".join( + filter( + None, + [v.get("reason"), v.get("message")], + ) + ) + except TypeError: + import sys + print("self._pod", self._pod, file=sys.stderr) + raise return None, None