From 967edd8ee3c3c048b4d3bfa4d5c13578225eee05 Mon Sep 17 00:00:00 2001
From: AntreasAntoniou
Date: Mon, 30 Oct 2023 16:29:30 +0000
Subject: [PATCH] * feat(web_pod_info.py): add GPU utilization details and
 refresh functionality

* refactor(web_pod_info.py): replace rich library with tqdm for progress
  tracking
* refactor(web_pod_info.py): move utility functions to top of the file
* refactor(web_pod_info.py): replace console print with streamlit dataframe
  for displaying data
* refactor(web_pod_info.py): handle command errors in run_command function
* refactor(web_pod_info.py): change default values for cpu_request and
  gpu_limit from -1 to 0
---
 kubejobs/web_pod_info.py | 169 +++++++++++++++++++++++++--------------
 1 file changed, 111 insertions(+), 58 deletions(-)

diff --git a/kubejobs/web_pod_info.py b/kubejobs/web_pod_info.py
index 68579ef..2f27f55 100644
--- a/kubejobs/web_pod_info.py
+++ b/kubejobs/web_pod_info.py
@@ -1,14 +1,14 @@
 import json
 import subprocess
+import time
 from datetime import datetime, timezone
 
 import fire
+import numpy as np
 import pandas as pd
 import rich
 import streamlit as st
-from rich.console import Console
-from rich.progress import Progress
-from rich.table import Table
+from tqdm import tqdm
 
 
 def parse_iso_time(time_str: str) -> datetime:
@@ -68,46 +68,71 @@ def convert_to_gigabytes(value: str) -> float:
     )
 
 
-def fetch_and_render_pod_info(namespace="informatics"):
-    get_pods_cmd = f"kubectl get pods -n {namespace} -o json"
-    pods_output = run_command(get_pods_cmd)
-    pod_data = json.loads(pods_output)
-
-    console = Console()
-
-    columns = [
-        "Name",
-        "Namespace",
-        "Username",
-        "UID",
-        "Status",
-        "Node",
-        "Image",
-        "CPU Request",
-        "Memory Request",
-        "GPU Type",
-        "GPU Limit",
-        "Creation Time",
-        "Age",
-    ]
-    data = []
-
-    table = Table(
-        show_header=True, header_style="bold magenta", box=rich.box.SQUARE
+def parse_iso_time(time_str: str) -> datetime:
+    return datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%SZ").replace(
+        tzinfo=timezone.utc
     )
-    for col in columns:
-        table.add_column(col)
 
-    current_time = datetime.now(timezone.utc)
 
-    with Progress() as progress:
-        task = progress.add_task(
-            "[cyan]Processing Pods...", total=len(pod_data["items"])
-        )
+def time_diff_to_human_readable(start: datetime, end: datetime) -> str:
+    diff = end - start
+    minutes, seconds = divmod(diff.seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{diff.days}d {hours}h {minutes}m {seconds}s"
 
-        for pod in pod_data["items"]:
-            progress.update(task, advance=1)
 
+def run_command(command: str) -> tuple:
+    result = subprocess.run(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        shell=True,
+    )
+    return result.stdout, result.stderr
+
+
+def ssh_into_pod_and_run_command(
+    pod_name: str, namespace: str, command: str
+) -> str:
+    ssh_command = f"kubectl exec -n {namespace} {pod_name} -- {command}"
+    stdout, stderr = run_command(ssh_command)
+    if stderr:
+        print(f"Error executing command in pod {pod_name}: {stderr}")
+    return stdout
+
+
+def fetch_and_render_pod_info(
+    namespace="informatics", loop=True, refresh_interval=10
+):
+    while True:
+        get_pods_cmd = f"kubectl get pods -n {namespace} -o json"
+        pods_output, _ = run_command(get_pods_cmd)
+        pod_data = json.loads(pods_output)
+
+        columns = [
+            "Name",
+            "Namespace",
+            "Username",
+            "UID",
+            "Status",
+            "Node",
+            "Image",
+            "CPU Request",
+            "Memory Request",
+            "GPU Type",
+            "GPU Limit",
+            "GPU Memory Used",
+            "GPU Memory Total",
+            "GPU Utilization",
+            "Creation Time",
+            "Age",
+        ]
+        data = []
+
+        current_time = datetime.now(timezone.utc)
+
+        for pod in tqdm(pod_data["items"]):
             metadata = pod["metadata"]
             spec = pod.get("spec", {})
             status = pod["status"]
@@ -125,16 +150,52 @@ def fetch_and_render_pod_info(namespace="informatics"):
             image = container.get("image", "N/A")
 
             resources = container.get("resources", {})
-            cpu_request = resources.get("requests", {}).get("cpu", "-1")
+            cpu_request = resources.get("requests", {}).get("cpu", "0")
             memory_request = resources.get("requests", {}).get("memory", "N/A")
             gpu_type = spec.get("nodeSelector", {}).get(
                 "nvidia.com/gpu.product", "N/A"
             )
-            gpu_limit = resources.get("limits", {}).get("nvidia.com/gpu", "-1")
+            gpu_limit = resources.get("limits", {}).get("nvidia.com/gpu", "0")
 
             creation_time = parse_iso_time(metadata["creationTimestamp"])
             age = time_diff_to_human_readable(creation_time, current_time)
 
+            # Exec into the pod and get per-GPU utilization details
+            gpu_usage_output = ssh_into_pod_and_run_command(
+                name,
+                namespace,
+                "nvidia-smi --query-gpu=memory.total,memory.used,utilization.gpu --format=csv,noheader,nounits",
+            )
+
+            gpu_memory_total_list = []
+            gpu_memory_used_list = []
+            gpu_utilization_list = []
+            for line in gpu_usage_output.splitlines():
+                (
+                    gpu_memory_total,
+                    gpu_memory_used,
+                    gpu_utilization,
+                ) = line.split(",")
+                gpu_memory_total_list.append(float(gpu_memory_total))
+                gpu_memory_used_list.append(float(gpu_memory_used))
+                gpu_utilization_list.append(float(gpu_utilization))
+
+            gpu_memory_total = (
+                np.mean(gpu_memory_total_list)
+                if len(gpu_memory_total_list) > 0
+                else -1
+            )
+            gpu_memory_used = (
+                np.mean(gpu_memory_used_list)
+                if len(gpu_memory_used_list) > 0
+                else -1
+            )
+            gpu_utilization = (
+                np.mean(gpu_utilization_list)
+                if len(gpu_utilization_list) > 0
+                else -1
+            )
+
             data.append(
                 [
                     str(name),
@@ -148,30 +209,22 @@ def fetch_and_render_pod_info(namespace="informatics"):
                     convert_to_gigabytes(memory_request),
                     gpu_type,
                     int(gpu_limit),
+                    gpu_memory_used,
+                    gpu_memory_total,
+                    gpu_utilization,
                     str(creation_time),
                     age,
                 ]
             )
 
-            table.add_row(
-                str(name),
-                str(namespace),
-                str(username),
-                str(uid),
-                pod_status,
-                node,
-                image,
-                cpu_request,
-                str(convert_to_gigabytes(memory_request)),
-                gpu_type,
-                str(gpu_limit),
-                str(creation_time),
-                age,
-            )
-    console.print(table)
+        df = pd.DataFrame(data, columns=columns)
+        st.dataframe(df)
 
-    df = pd.DataFrame(data, columns=columns)
-    st.dataframe(df)
+        if not loop:
+            break
+        time.sleep(
+            refresh_interval
+        )  # Refresh every specified number of seconds
 
 
 if __name__ == "__main__":
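
Note (not part of the commit): the heart of this change is the per-pod GPU
probe: kubectl exec into each pod, query nvidia-smi in CSV mode, and average
the per-GPU numbers, falling back to -1 when a pod has no GPUs or the exec
fails. Below is a minimal standalone sketch of that probe; the helper name
gpu_stats_for_pod is illustrative and does not appear in the commit.

    import subprocess

    import numpy as np


    def gpu_stats_for_pod(pod_name: str, namespace: str) -> tuple:
        """Average memory.total, memory.used and utilization.gpu over a pod's
        GPUs; return (-1, -1, -1) if the exec fails or no GPUs are visible."""
        cmd = (
            f"kubectl exec -n {namespace} {pod_name} -- "
            "nvidia-smi --query-gpu=memory.total,memory.used,utilization.gpu "
            "--format=csv,noheader,nounits"
        )
        result = subprocess.run(
            cmd,
            shell=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # One CSV row per GPU, e.g. "81920, 12345, 87" (MiB, MiB, %)
        rows = [
            line.split(",")
            for line in result.stdout.splitlines()
            if line.strip()
        ]
        if result.stderr or not rows:
            return -1.0, -1.0, -1.0
        total, used, util = (
            np.mean([float(r[i]) for r in rows]) for i in range(3)
        )
        return total, used, util

One exec per pod runs serially inside the refresh loop, so on a namespace with
many pods the probe, rather than the kubectl get, dominates each refresh; the
tqdm bar the patch adds makes that visible.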