Skip to content

Commit

Permalink
* feat(count_gpu_usage_general.py): add INFORMATICS_GPU_ALLOWANCE constant and calculation
Browse files Browse the repository at this point in the history

* refactor(wandb_pod_injection.py): replace wandb_project with infinite_loop, add error handling
* refactor(web_pod_info.py): change convert_to_gigabytes to return -1 for unknown units
  • Loading branch information
AntreasAntoniou committed Nov 1, 2023
1 parent 7bdc713 commit 0f977fa
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 22 deletions.
20 changes: 20 additions & 0 deletions kubejobs/useful_single_liners/count_gpu_usage_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
"NVIDIA-A100-SXM4-40GB-MIG-1g.5gb": 140,
}

INFORMATICS_GPU_ALLOWANCE = 60


# 🚀 Execute the shell command and get the output
def run_command(command: str) -> str:
Expand Down Expand Up @@ -110,6 +112,24 @@ def count_gpu_usage():
k: v - gpu_usage.get("Running", {}).get(k, 0)
for k, v in GPU_DETAIL_DICT.items()
}
used_gpus_total = sum(
[
gpu_usage.get("Running", {}).get(k, 0)
for k, v in GPU_DETAIL_DICT.items()
]
)
gpu_usage["Informatics Allowance Available"] = {
k: min(
v - gpu_usage.get("Running", {}).get(k, 0),
INFORMATICS_GPU_ALLOWANCE - used_gpus_total,
)
for k, v in GPU_DETAIL_DICT.items()
}

gpu_usage["Total Available"] = {
k: v - gpu_usage.get("Running", {}).get(k, 0)
for k, v in GPU_DETAIL_DICT.items()
}

return gpu_usage

Expand Down
43 changes: 24 additions & 19 deletions kubejobs/wandb_pod_injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,27 +95,32 @@ def create_and_copy_wandb_script(pod_name, namespace, metadata):
def fetch_and_render_pod_info(
    namespace="informatics",
    refresh_interval=10,
    infinite_loop=True,
):
    """Poll a namespace for pods and run the W&B script injection on new ones.

    Each pass lists the pods with ``kubectl get pods -n <namespace> -o json``
    and calls ``create_and_copy_wandb_script`` once for every pod name that
    has not been seen in an earlier pass.

    Args:
        namespace: Kubernetes namespace passed to ``kubectl -n``.
        refresh_interval: Seconds to sleep between passes.
        infinite_loop: When true, keep polling forever. When false, perform a
            single pass and return.

    Raises:
        json.JSONDecodeError: If the kubectl output is not valid JSON.
        KeyError: If the pod listing lacks ``items`` or a pod lacks the
            expected ``metadata`` fields.
    """
    # Names are recorded before the injection attempt, so a pod whose
    # injection fails is not retried on later passes.
    name_set = set()

    # Built once: the namespace being polled does not change between passes.
    get_pods_cmd = f"kubectl get pods -n {namespace} -o json"

    while True:
        # run_command returns a pair here; the second element is not used.
        pods_output, _ = run_command(get_pods_cmd)
        pod_data = json.loads(pods_output)

        for pod in tqdm(pod_data["items"]):
            metadata = pod["metadata"]
            name = metadata["name"]
            # Kept under a separate name so the polled namespace (the
            # function parameter) is never overwritten by a pod's own value.
            pod_namespace = metadata["namespace"]

            if name in name_set:
                continue
            name_set.add(name)

            # Best effort: one failing pod must not stop the whole pass.
            try:
                create_and_copy_wandb_script(name, pod_namespace, pod)
            except Exception as e:
                print(f"Error on {pod}, {e}")

        # Honour the flag: a single pass was requested, so stop here
        # without sleeping.
        if not infinite_loop:
            break
        time.sleep(refresh_interval)


if __name__ == "__main__":
Expand Down
4 changes: 1 addition & 3 deletions kubejobs/web_pod_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,7 @@ def convert_to_gigabytes(value: str) -> float:
elif value == "N/A":
return -1
else:
raise ValueError(
f"Unknown unit {unit_part}. Supported units are {list(factor_gb.keys())}."
)
return -1


def parse_iso_time(time_str: str) -> datetime:
Expand Down

0 comments on commit 0f977fa

Please sign in to comment.