Skip to content

Commit

Permalink
* feat(web_pod_info.py): add exponential moving average function for …
Browse files Browse the repository at this point in the history
…GPU metrics

* refactor(web_pod_info.py): change refresh_interval and samples_per_gpu default values
* refactor(web_pod_info.py): move columns definition outside the loop
* refactor(web_pod_info.py): replace np.mean with exponential_moving_average_efficient for GPU metrics
* refactor(web_pod_info.py): replace st_table.dataframe with st.dataframe
  • Loading branch information
AntreasAntoniou committed Oct 30, 2023
1 parent 95897b3 commit 022bee1
Showing 1 changed file with 82 additions and 35 deletions.
117 changes: 82 additions & 35 deletions kubejobs/web_pod_info.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import subprocess
import time
from collections import defaultdict
from datetime import datetime, timezone

import fire
Expand All @@ -11,6 +12,34 @@
from tqdm import tqdm


def exponential_moving_average_efficient(data, N):
"""
Compute the Exponential Moving Average (EMA) for a list of values.
Args:
data (list): The list of data points.
span (int): The span N for the EMA calculation.
Returns:
float: The EMA of the data.
"""
# Convert list to numpy array for vectorized operations
data_array = np.array(data)

# Calculate the smoothing factor alpha
alpha = 2 / (N + 1)

# Initialize the EMA array
ema = np.zeros_like(data_array)
ema[0] = data_array[0]

# Calculate the EMA
for i in range(1, len(data_array)):
ema[i] = (alpha * data_array[i]) + ((1 - alpha) * ema[i - 1])

return ema[-1]


def convert_to_gigabytes(value: str) -> float:
"""
Convert the given storage/memory value to base Gigabytes (GB).
Expand Down Expand Up @@ -81,8 +110,8 @@ def ssh_into_pod_and_run_command(
def fetch_and_render_pod_info(
namespace="informatics",
loop=True,
refresh_interval=60,
samples_per_gpu=3,
refresh_interval=0,
samples_per_gpu=1,
):
"""
Fetches information about Kubernetes pods and renders it in a Streamlit table.
Expand All @@ -96,32 +125,36 @@ def fetch_and_render_pod_info(
# Outside of your loop, before you start refreshing data
st_table = st.empty()

columns = [
"Name",
"Namespace",
"Username",
"UID",
"Status",
"Node",
"Image",
"CPU Request",
"Memory Request",
"GPU Type",
"GPU Limit",
"GPU Memory Used",
"GPU Memory Total",
"GPU Utilization",
"Creation Time",
"Age",
]

per_user_total_gpu_memory = defaultdict(list)
per_user_total_gpu_utilization = defaultdict(list)
per_user_total_gpu_memory_used = defaultdict(list)

while True:
get_pods_cmd = f"kubectl get pods -n {namespace} -o json"
pods_output, _ = run_command(get_pods_cmd)
pod_data = json.loads(pods_output)

columns = [
"Name",
"Namespace",
"Username",
"UID",
"Status",
"Node",
"Image",
"CPU Request",
"Memory Request",
"GPU Type",
"GPU Limit",
"GPU Memory Used",
"GPU Memory Total",
"GPU Utilization",
"Creation Time",
"Age",
]
data = []

current_time = datetime.now(timezone.utc)
data = []

for pod in tqdm(pod_data["items"]):
metadata = pod["metadata"]
Expand Down Expand Up @@ -152,9 +185,6 @@ def fetch_and_render_pod_info(
age = time_diff_to_human_readable(creation_time, current_time)

# SSH into the pod and get GPU utilization details
gpu_memory_total_list = []
gpu_memory_used_list = []
gpu_utilization_list = []

for _ in range(samples_per_gpu):
gpu_usage_output = ssh_into_pod_and_run_command(
Expand All @@ -169,23 +199,39 @@ def fetch_and_render_pod_info(
gpu_memory_used,
gpu_utilization,
) = line.split(",")
gpu_memory_total_list.append(float(gpu_memory_total))
gpu_memory_used_list.append(float(gpu_memory_used))
gpu_utilization_list.append(float(gpu_utilization))

per_user_total_gpu_memory[username].append(
float(gpu_memory_total)
)
per_user_total_gpu_memory_used[username].append(
float(gpu_memory_used)
)
per_user_total_gpu_utilization[username].append(
float(gpu_utilization)
)

gpu_memory_total = (
np.mean(gpu_memory_total_list)
if len(gpu_memory_total_list) > 0
exponential_moving_average_efficient(
data=per_user_total_gpu_memory[username],
N=min(25, len(per_user_total_gpu_memory[username])),
)
if len(per_user_total_gpu_memory[username]) > 0
else -1
)
gpu_memory_used = (
np.mean(gpu_memory_used_list)
if len(gpu_memory_used_list) > 0
exponential_moving_average_efficient(
data=per_user_total_gpu_memory_used[username],
N=min(25, len(per_user_total_gpu_memory_used[username])),
)
if len(per_user_total_gpu_memory_used[username]) > 0
else -1
)
gpu_utilization = (
np.mean(gpu_utilization_list)
if len(gpu_utilization_list) > 0
exponential_moving_average_efficient(
data=per_user_total_gpu_utilization[username],
N=min(25, len(per_user_total_gpu_utilization[username])),
)
if len(per_user_total_gpu_utilization[username]) > 0
else -1
)

Expand All @@ -212,7 +258,8 @@ def fetch_and_render_pod_info(

df = pd.DataFrame(data, columns=columns)
# Inside your loop, when you update the DataFrame
st_table.dataframe(

st.dataframe(
df
) # This will update the existing table instead of creating a new one

Expand Down

0 comments on commit 022bee1

Please sign in to comment.