From e9ad45ee659174d5c32055949953b52ff41476a6 Mon Sep 17 00:00:00 2001
From: Yang Gu
Date: Tue, 5 Nov 2024 13:44:09 +0800
Subject: [PATCH 1/2] [js/webgpu] Destroy staging buffers aggressively during weights uploading

In the current implementation, all the staging buffers for weights
uploading are destroyed only after the first batch of kernel execution.
This requires a lot of memory, as none of the staging buffers can be
reused. It also hurts startup time (weights uploading only happens
during session creation), because the actual uploads are delayed until
very late.

This PR submits the queue and destroys staging buffers aggressively, so
that the related GPU memory can be reused as much as possible, though
the actual behavior depends on the WebGPU and driver implementations.
The aggressive queue submission also moves GPU operations to a much
earlier time, which helps startup time.

Some buffer-uploading benchmarks were written to compare multiple
solutions with regard to memory and time consumption. The benchmarks
can be found at
https://github.com/webatintel/webbench/blob/master/webgpu/buffer-upload.html,
and detailed test data can be found at
https://docs.google.com/document/d/1KgygOkb9ZNzkgzQ_tWOGlEI9ScmMBHDjDojjPFLmVXU/edit.
I also tested phi3.5 on 2 machines; first-inference time improved from
5141ms to 3579ms and from 4327ms to 2947ms, respectively.
---
 js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
index 1860870a1130b..1fc2aae6a8752 100644
--- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
+++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
@@ -191,8 +191,6 @@ class GpuDataManagerImpl implements GpuDataManager {
   // GPU Data ID => GPU Data ( storage buffer )
   private storageCache: Map<GpuDataId, StorageCacheValue>;
 
-  // pending buffers for uploading ( data is unmapped )
-  private buffersForUploadingPending: GPUBuffer[];
   // pending buffers for computing
   private buffersPending: GPUBuffer[];
 
@@ -212,7 +210,6 @@ class GpuDataManagerImpl implements GpuDataManager {
     this.storageCache = new Map();
     this.freeBuffers = new Map();
     this.freeUniformBuffers = new Map();
-    this.buffersForUploadingPending = [];
     this.buffersPending = [];
 
     this.capturedPendingBuffers = new Map();
@@ -256,9 +253,10 @@ class GpuDataManagerImpl implements GpuDataManager {
     this.backend.endComputePass();
     commandEncoder.copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size);
 
-    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.upload(id=${id})`);
+    this.backend.device.queue.submit([commandEncoder.finish()]);
+    gpuBufferForUploading.destroy();
 
-    this.buffersForUploadingPending.push(gpuBufferForUploading);
+    LOG_DEBUG('verbose', () => `[WebGPU] GpuDataManager.upload(id=${id})`);
   }
 
   memcpy(sourceId: GpuDataId, destinationId: GpuDataId): void {
@@ -395,12 +393,6 @@ class GpuDataManagerImpl implements GpuDataManager {
   }
 
   refreshPendingBuffers(): void {
-    for (const buffer of this.buffersForUploadingPending) {
-      // upload buffer is only useful in the session creation time. So we don't need to reuse them in session running.
-      buffer.destroy();
-    }
-    this.buffersForUploadingPending = [];
-
     if (this.buffersPending.length === 0) {
       return;
     }

From 62f1be0dc55c3a841245ac47d1b30dd23541e142 Mon Sep 17 00:00:00 2001
From: Yang Gu
Date: Tue, 5 Nov 2024 15:28:58 +0800
Subject: [PATCH 2/2] Use a standalone commandEncoder

---
 js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
index 1fc2aae6a8752..1c6016500e7d3 100644
--- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
+++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
@@ -249,10 +249,8 @@ class GpuDataManagerImpl implements GpuDataManager {
     gpuBufferForUploading.unmap();
 
     // GPU copy
-    const commandEncoder = this.backend.getCommandEncoder();
-    this.backend.endComputePass();
+    const commandEncoder = this.backend.device.createCommandEncoder();
     commandEncoder.copyBufferToBuffer(gpuBufferForUploading, 0, gpuDataCache.gpuData.buffer, 0, size);
-
     this.backend.device.queue.submit([commandEncoder.finish()]);
     gpuBufferForUploading.destroy();
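
Note (reviewer aside, not part of the patches): the upload pattern the two
patches converge on can be reduced to the standalone sketch below. It stages
data in a buffer created with mappedAtCreation, records the copy on a
throwaway command encoder, submits immediately, and destroys the staging
buffer right after the submit; WebGPU keeps a destroyed buffer usable by
already-submitted work, which is what lets the implementation recycle the
staging memory early. The names uploadAggressively, device, and destination
are illustrative only and do not appear in the patched code.

// Sketch only (assumes WebGPU typings, e.g. @webgpu/types). `destination`
// must be at least `alignedSize` bytes and created with GPUBufferUsage.COPY_DST.
function uploadAggressively(device: GPUDevice, destination: GPUBuffer, data: Uint8Array): void {
  // Buffer sizes and copy sizes must be multiples of 4 bytes.
  const alignedSize = Math.ceil(data.byteLength / 4) * 4;

  // Create the staging buffer already mapped, fill it on the CPU, then unmap.
  const staging = device.createBuffer({
    size: alignedSize,
    usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC,
    mappedAtCreation: true,
  });
  new Uint8Array(staging.getMappedRange()).set(data);
  staging.unmap();

  // Record the copy on a standalone encoder and submit right away, instead of
  // piggybacking on a shared compute command encoder.
  const encoder = device.createCommandEncoder();
  encoder.copyBufferToBuffer(staging, 0, destination, 0, alignedSize);
  device.queue.submit([encoder.finish()]);

  // Destroying after submit is safe: the queued copy still completes, and the
  // implementation may reuse the staging memory as soon as it does.
  staging.destroy();
}

The trade-off noted in the commit message applies here too: one queue submit
per upload, in exchange for letting the staging memory be reclaimed as early
as possible.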