Skip to content

Commit

Permalink
[serving] Ignore CUDA OOM when collecting metrics (#1581)
Browse files: browse the repository at this point in the history
  • Loading branch information
frankfliu authored Feb 28, 2024
1 parent 3440f12 commit 71477b2
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
12 changes: 9 additions & 3 deletions serving/src/main/java/ai/djl/serving/ModelServer.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,7 @@

import ai.djl.Device;
import ai.djl.engine.Engine;
import ai.djl.engine.EngineException;
import ai.djl.metric.Dimension;
import ai.djl.metric.Metric;
import ai.djl.metric.Unit;
Expand Down Expand Up @@ -224,9 +225,14 @@ public List<ChannelFuture> start()
Metric metric = new Metric("ServerStartup", duration, Unit.MICROSECONDS);
SERVER_METRIC.info("{}", metric);
for (int i = 0; i < CudaUtils.getGpuCount(); ++i) {
Device device = Device.gpu(i);
MemoryUsage mem = CudaUtils.getGpuMemory(device);
SERVER_METRIC.info("{}", new Metric("GPU-" + i, mem.getCommitted(), Unit.BYTES));
try {
Device device = Device.gpu(i);
MemoryUsage mem = CudaUtils.getGpuMemory(device);
SERVER_METRIC.info("{}", new Metric("GPU-" + i, mem.getCommitted(), Unit.BYTES));
} catch (IllegalArgumentException | EngineException e) {
logger.warn("Failed get GPU memory", e);
break;
}
}

if (stopped.get()) {
Expand Down
8 changes: 7 additions & 1 deletion wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -872,7 +872,13 @@ void checkAvailableMemory(Device device) throws IOException {
}

if (device.isGpu()) {
MemoryUsage usage = CudaUtils.getGpuMemory(device);
MemoryUsage usage;
try {
usage = CudaUtils.getGpuMemory(device);
} catch (IllegalArgumentException | EngineException e) {
logger.warn("Failed to get GPU memory", e);
throw new WlmOutOfMemoryException("No enough memory to load the model."); // NOPMD
}
free = usage.getMax() - usage.getCommitted();
long gpuMem = intValue(prop, "gpu.reserved_memory_mb", -1) * 1024L * 1024;
if (gpuMem > 0) {
Expand Down

0 comments on commit 71477b2

Please sign in to comment.