
Commit

fix: Update Phi 3 Requirements (#550)
**Reason for Change**:
Used the `monitor_gpu_memory.sh` script on Phi-3 tuning jobs to collect updated
metrics on the GPU memory required for running inference and tuning on Phi-3.
ishaansehgal99 authored Aug 6, 2024
1 parent 7899717 commit 5bde8f4
Showing 2 changed files with 40 additions and 10 deletions.
30 changes: 30 additions & 0 deletions hack/monitor_gpu_memory.sh
@@ -0,0 +1,30 @@
#!/bin/bash

if [ -z "$1" ]; then
    echo "Usage: $0 <device_number>"
    exit 1
fi

device_number=$1

max_memory_usage=0

# Log file to store the memory usage, including the device number
log_file="gpu_memory_usage_${device_number}.log"

update_max_memory_usage() {
    current_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $device_number | tr -d '[:space:]')

    # Update the maximum memory usage if the current usage is higher
    if (( current_memory_usage > max_memory_usage )); then
        max_memory_usage=$current_memory_usage
        # Log the new maximum memory usage
        echo "$(date): New max GPU memory usage on GPU $device_number: ${max_memory_usage}MiB" | tee -a $log_file
    fi
}

# Monitor GPU memory usage every second
while true; do
    update_max_memory_usage
    sleep 1
done
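
The script polls `nvidia-smi` once per second for the given device index and appends a line to `gpu_memory_usage_<device_number>.log` whenever a new peak is observed. A minimal usage sketch, assuming GPU index 0 and a placeholder tuning command, might look like this:

```bash
# Start the monitor in the background for GPU 0 (example device index).
./hack/monitor_gpu_memory.sh 0 &
monitor_pid=$!

# Run the workload whose peak GPU memory usage we want to measure.
# "python tune_phi3.py" is a hypothetical stand-in for the actual tuning job.
python tune_phi3.py

# Stop the monitor and read the last (highest) recorded value.
kill "$monitor_pid"
tail -n 1 gpu_memory_usage_0.log
```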
20 changes: 10 additions & 10 deletions presets/models/phi3/model.go
@@ -76,8 +76,8 @@ func (*phi3Mini4KInst) GetTuningParameters() *model.PresetParam {
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "16Gi",
PerGPUMemoryRequirement: "16Gi",
TotalGPUMemoryRequirement: "72Gi",
PerGPUMemoryRequirement: "72Gi",
// TorchRunParams: inference.DefaultAccelerateParams,
// ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
@@ -115,8 +115,8 @@ func (*phi3Mini128KInst) GetTuningParameters() *model.PresetParam {
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "16Gi",
PerGPUMemoryRequirement: "16Gi",
TotalGPUMemoryRequirement: "72Gi",
PerGPUMemoryRequirement: "72Gi",
// TorchRunParams: inference.DefaultAccelerateParams,
// ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
@@ -139,7 +139,7 @@ func (*Phi3Medium4kInstruct) GetInferenceParameters() *model.PresetParam {
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "16Gi",
TotalGPUMemoryRequirement: "28Gi",
PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
TorchRunParams: inference.DefaultAccelerateParams,
ModelRunParams: phiRunParams,
@@ -154,8 +154,8 @@ func (*Phi3Medium4kInstruct) GetTuningParameters() *model.PresetParam {
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "13Gi",
PerGPUMemoryRequirement: "13Gi",
TotalGPUMemoryRequirement: "80Gi",
PerGPUMemoryRequirement: "80Gi",
// TorchRunParams: inference.DefaultAccelerateParams,
// ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
@@ -178,7 +178,7 @@ func (*Phi3Medium128kInstruct) GetInferenceParameters() *model.PresetParam {
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "16Gi",
TotalGPUMemoryRequirement: "28Gi",
PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
TorchRunParams: inference.DefaultAccelerateParams,
ModelRunParams: phiRunParams,
@@ -193,8 +193,8 @@ func (*Phi3Medium128kInstruct) GetTuningParameters() *model.PresetParam {
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "13Gi",
PerGPUMemoryRequirement: "13Gi",
TotalGPUMemoryRequirement: "80Gi",
PerGPUMemoryRequirement: "80Gi",
// TorchRunParams: inference.DefaultAccelerateParams,
// ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
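
With these changes, the mini tuning presets require 72Gi per GPU, the medium tuning presets require 80Gi per GPU, and the medium inference presets move from 16Gi to 28Gi total. As a rough pre-flight check, a GPU's available memory can be compared against the new per-GPU tuning requirement with `nvidia-smi`; this is a sketch only, and the 80 GiB threshold and device index 0 are example values:

```bash
# Verify that GPU 0 exposes at least 80 GiB (81920 MiB), the new per-GPU
# requirement for the phi-3-medium tuning presets. Use 73728 MiB (72 GiB)
# when checking against the phi-3-mini tuning presets instead.
required_mib=81920
total_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i 0 | tr -d '[:space:]')

if (( total_mib >= required_mib )); then
    echo "GPU 0: ${total_mib}MiB available, meets the ${required_mib}MiB requirement."
else
    echo "GPU 0: only ${total_mib}MiB available, below the ${required_mib}MiB requirement."
fi
```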
