NVIDIA · jbrennan333 · Dec 19, 2023 · Dec 14, 2023 · Dec 17, 2023 · Dec 18, 2023
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java
@@ -147,55 +147,61 @@ public ColumnarBatch next() {
     // Update our estimate for number of rows with the final size used to allocate the buffers.
     numRowsEstimate = (int) bufsAndNumRows._2.targetSize();
     long dataLength = calcDataLengthEstimate(numRowsEstimate);
-    try (
-        SpillableHostBuffer sdb = bufsAndNumRows._1[0];
+    int used[];
+    try (SpillableHostBuffer sdb = bufsAndNumRows._1[0];
         SpillableHostBuffer sob = bufsAndNumRows._1[1];
     ) {
-      // Fill in buffer under write lock for host buffers
-      batchAndRange = sdb.withHostBufferWriteLock( (dataBuffer) -> {
-        return sob.withHostBufferWriteLock( (offsetsBuffer) -> {
-          int[] used = fillBatch(dataBuffer, offsetsBuffer, dataLength, numRowsEstimate);
-          int dataOffset = used[0];
-          int currentRow = used[1];
-          // We don't want to loop forever trying to copy nothing
-          assert (currentRow > 0);
-          if (numInputRows != null) {
-            numInputRows.add(currentRow);
-          }
-          if (numOutputRows != null) {
-            numOutputRows.add(currentRow);
-          }
-          if (numOutputBatches != null) {
-            numOutputBatches.add(1);
-          }
-          // Now that we have filled the buffers with the data, we need to turn them into a
-          // HostColumnVector and copy them to the device so the GPU can turn it into a Table.
-          // To do this we first need to make a HostColumnCoreVector for the data, and then
-          // put that into a HostColumnVector as its child.  This the basics of building up
-          // a column of lists of bytes in CUDF but it is typically hidden behind the higer level
-          // APIs.
-          dataBuffer.incRefCount();
-          offsetsBuffer.incRefCount();
-          try (HostColumnVectorCore dataCv =
-                   new HostColumnVectorCore(DType.INT8, dataOffset, Optional.of(0L),
-                       dataBuffer, null, null, new ArrayList<>());
-               HostColumnVector hostColumn = new HostColumnVector(DType.LIST,
-                   currentRow, Optional.of(0L), null, null,
-                   offsetsBuffer, Collections.singletonList(dataCv))) {
+      HostMemoryBuffer[] hBufs = getHostBuffersWithRetry(sdb, sob);
+      try(HostMemoryBuffer dataBuffer = hBufs[0];
+          HostMemoryBuffer offsetsBuffer = hBufs[1];
+      ) {
+        used = fillBatch(dataBuffer, offsetsBuffer, dataLength, numRowsEstimate);
+      }
+      hBufs = getHostBuffersWithRetry(sdb, sob);
+      try (
+          HostMemoryBuffer dataBuffer = hBufs[0];
+          HostMemoryBuffer offsetsBuffer = hBufs[1];
+      ) {
+        int dataOffset = used[0];
+        int currentRow = used[1];
+        // We don't want to loop forever trying to copy nothing
+        assert (currentRow > 0);
+        if (numInputRows != null) {
+          numInputRows.add(currentRow);
+        }
+        if (numOutputRows != null) {
+          numOutputRows.add(currentRow);
+        }
+        if (numOutputBatches != null) {
+          numOutputBatches.add(1);
+        }
+        // Now that we have filled the buffers with the data, we need to turn them into a
+        // HostColumnVector and copy them to the device so the GPU can turn it into a Table.
+        // To do this we first need to make a HostColumnCoreVector for the data, and then
+        // put that into a HostColumnVector as its child.  This is the basics of building up
+        // a column of lists of bytes in CUDF but it is typically hidden behind the higher level
+        // APIs.
+        dataBuffer.incRefCount();
+        offsetsBuffer.incRefCount();
+        try (HostColumnVectorCore dataCv =
+                 new HostColumnVectorCore(DType.INT8, dataOffset, Optional.of(0L),
+                     dataBuffer, null, null, new ArrayList<>());
+             HostColumnVector hostColumn = new HostColumnVector(DType.LIST,
+                 currentRow, Optional.of(0L), null, null,
+                 offsetsBuffer, Collections.singletonList(dataCv))) {
 
-            long ct = System.nanoTime() - collectStart;
-            streamTime.add(ct);
+          long ct = System.nanoTime() - collectStart;
+          streamTime.add(ct);
 
-            // Grab the semaphore because we are about to put data onto the GPU.
-            GpuSemaphore$.MODULE$.acquireIfNecessary(TaskContext.get());
-            NvtxRange range = NvtxWithMetrics.apply("RowToColumnar: build", NvtxColor.GREEN,
-                Option.apply(opTime));
-            ColumnVector devColumn =
-                RmmRapidsRetryIterator.withRetryNoSplit(hostColumn::copyToDevice);
-            return Tuple2.apply(makeSpillableBatch(devColumn), range);
-          }
-        });
-      });
+          // Grab the semaphore because we are about to put data onto the GPU.
+          GpuSemaphore$.MODULE$.acquireIfNecessary(TaskContext.get());
+          NvtxRange range = NvtxWithMetrics.apply("RowToColumnar: build", NvtxColor.GREEN,
+              Option.apply(opTime));
+          ColumnVector devColumn =
+              RmmRapidsRetryIterator.withRetryNoSplit(hostColumn::copyToDevice);
+          batchAndRange = Tuple2.apply(makeSpillableBatch(devColumn), range);
+        }
+      }
     }
     try (NvtxRange ignored = batchAndRange._2;
          Table tab =
@@ -208,6 +214,23 @@ public ColumnarBatch next() {
     }
   }
 
+  /** Gets both host buffers (with retry); the caller must close each returned buffer. */
+  private HostMemoryBuffer[] getHostBuffersWithRetry(
+      SpillableHostBuffer spillableDataBuffer, SpillableHostBuffer spillableOffsetsBuffer) {
+    return RmmRapidsRetryIterator.withRetryNoSplit( () -> {
+      HostMemoryBuffer dataBuffer = spillableDataBuffer.getHostBuffer();
+      try {
+        HostMemoryBuffer offsetsBuffer = spillableOffsetsBuffer.getHostBuffer();
+        return new HostMemoryBuffer[] {dataBuffer, offsetsBuffer};
+      } catch (Throwable t) {
+        // Don't leak the data buffer if the offsets buffer could not be obtained;
+        // rethrow so the failure is not swallowed and a half-closed pair is never returned.
+        dataBuffer.close();
+        throw t;
+      }
+    });
+  }
+
   private Tuple2<SpillableHostBuffer[], AutoCloseableTargetSize>
   allocBuffers(SpillableHostBuffer[] sBufs, AutoCloseableTargetSize numRowsWrapper) {
     HostMemoryBuffer[] hBufs = new HostMemoryBuffer[]{ null, null };

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBuffer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBuffer.scala
@@ -21,7 +21,7 @@ import java.nio.channels.WritableByteChannel
 
 import scala.collection.mutable.ArrayBuffer
 
-import ai.rapids.cudf.{Cuda, DeviceMemoryBuffer, MemoryBuffer, Table}
+import ai.rapids.cudf.{Cuda, DeviceMemoryBuffer, HostMemoryBuffer, MemoryBuffer, Table}
 import com.nvidia.spark.rapids.Arm.withResource
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.StorageTier.StorageTier
@@ -320,6 +320,15 @@ trait RapidsBuffer extends AutoCloseable {
    */
   def getDeviceMemoryBuffer: DeviceMemoryBuffer
 
+  /**
+   * Get the host memory buffer from the underlying storage. If the buffer currently resides
+   * outside of host memory, a new HostMemoryBuffer is created with the data copied over.
+   * The caller must have successfully acquired the buffer beforehand.
+   * @see [[addReference]]
+   * @note It is the responsibility of the caller to close the buffer.
+   */
+  def getHostMemoryBuffer: HostMemoryBuffer
+
   /**
    * Try to add a reference to this buffer to acquire it.
    * @note The close method must be called for every successfully obtained reference.
@@ -425,6 +434,9 @@ sealed class DegenerateRapidsBuffer(
   override def getDeviceMemoryBuffer: DeviceMemoryBuffer =
     throw new UnsupportedOperationException("degenerate buffer has no device memory buffer")
 
+  override def getHostMemoryBuffer: HostMemoryBuffer =
+    throw new UnsupportedOperationException("degenerate buffer has no host memory buffer")
+
   override def addReference(): Boolean = true
 
   override def getSpillPriority: Long = Long.MaxValue

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
@@ -647,6 +647,34 @@ class RapidsBufferCatalog(
     }
   }
 
+  /**
+   * Copies `buffer` to the `hostStorage` store, registering a new `RapidsBuffer` in
+   * the process, unless the catalog already holds a host-tier buffer for the same id.
+   *
+   * @param buffer - buffer to copy
+   * @param stream - Cuda.Stream to synchronize on
+   * @return - The `RapidsBuffer` instance in the host store (existing or newly added).
+   */
+  def unspillBufferToHostStore(
+      buffer: RapidsBuffer,
+      stream: Cuda.Stream): RapidsBuffer = synchronized {
+    // try to acquire the buffer, if it's already in the store
+    // do not create a new one, else add a reference
+    acquireBuffer(buffer.id, StorageTier.HOST) match {
+      case Some(existingBuffer) => existingBuffer
+      case None =>
+        val maybeNewBuffer = hostStorage.copyBuffer(buffer, this, stream)
+        maybeNewBuffer.map { newBuffer =>
+          logDebug(s"got new RapidsHostMemoryStore buffer ${newBuffer.id}")
+          newBuffer.addReference() // add a reference since we are about to use it
+          updateTiers(BufferSpill(buffer, Some(newBuffer)))
+          buffer.safeFree()
+          newBuffer
+        }.get // the host store has to return a buffer here for now, or throw OOM
+    }
+  }
+
+
   /**
    * Remove a buffer ID from the catalog at the specified storage tier.
    * @note public for testing

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
@@ -24,7 +24,7 @@ import scala.collection.mutable
 import ai.rapids.cudf.{BaseDeviceMemoryBuffer, Cuda, DeviceMemoryBuffer, HostMemoryBuffer, MemoryBuffer, NvtxColor, NvtxRange}
 import com.nvidia.spark.rapids.Arm._
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
-import com.nvidia.spark.rapids.StorageTier.{DEVICE, StorageTier}
+import com.nvidia.spark.rapids.StorageTier.{DEVICE, HOST, StorageTier}
 import com.nvidia.spark.rapids.format.TableMeta
 
 import org.apache.spark.internal.Logging
@@ -516,6 +516,31 @@ abstract class RapidsBufferStore(val tier: StorageTier)
       }
     }
 
+    override def getHostMemoryBuffer: HostMemoryBuffer = {
+      (0 until MAX_UNSPILL_ATTEMPTS).foreach { _ => // bounded number of attempts
+        catalog.acquireBuffer(id, HOST) match {
+          case Some(buffer) => // the catalog returned a HOST-tier buffer for this id
+            withResource(buffer) { _ =>
+              return buffer.getHostMemoryBuffer
+            }
+          case _ => // nothing acquired at the HOST tier: unspill this buffer to the host store
+            try {
+              logDebug(s"Unspilling $this $id to $HOST")
+              val newBuffer = catalog.unspillBufferToHostStore(
+                this,
+                Cuda.DEFAULT_STREAM)
+              withResource(newBuffer) { _ =>
+                return newBuffer.getHostMemoryBuffer
+              }
+            } catch {
+              case _: DuplicateBufferException => // only logged, so the loop tries again
+                logDebug(s"Lost host buffer registration race for buffer $id, retrying...")
+            }
+        }
+      }
+      throw new IllegalStateException(s"Unable to get host memory buffer for ID: $id") // all attempts failed
+    }
+
     /**
      * close() is called by client code to decrease the ref count of this RapidsBufferBase.
      * In the off chance that by the time close is invoked, the buffer was freed (not valid)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDiskStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDiskStore.scala
@@ -140,62 +140,47 @@ class RapidsDiskStore(diskBlockManager: RapidsDiskBlockManager)
       meta: TableMeta,
       spillPriority: Long)
       extends RapidsBufferBase(id, meta, spillPriority) {
-    private[this] var hostBuffer: Option[HostMemoryBuffer] = None
 
     // FIXME: Need to be clean up. Tracked in https://github.com/NVIDIA/spark-rapids/issues/9496
     override val memoryUsedBytes: Long = uncompressedSize
 
     override val storageTier: StorageTier = StorageTier.DISK
 
     override def getMemoryBuffer: MemoryBuffer = synchronized {
-      if (hostBuffer.isEmpty) {
-        require(onDiskSizeInBytes > 0,
-          s"$this attempted an invalid 0-byte mmap of a file")
-        val path = id.getDiskPath(diskBlockManager)
-        val serializerManager = diskBlockManager.getSerializerManager()
-        val memBuffer = if (serializerManager.isRapidsSpill(id)) {
-          // Only go through serializerManager's stream wrapper for spill case
-          closeOnExcept(HostMemoryBuffer.allocate(uncompressedSize)) { decompressed =>
-            GpuTaskMetrics.get.readSpillFromDiskTime {
-              withResource(FileChannel.open(path.toPath, StandardOpenOption.READ)) { c =>
-                c.position(fileOffset)
-                withResource(Channels.newInputStream(c)) { compressed =>
-                  withResource(serializerManager.wrapStream(id, compressed)) { in =>
-                    withResource(new HostMemoryOutputStream(decompressed)) { out =>
-                      IOUtils.copy(in, out)
-                    }
-                    decompressed
+      require(onDiskSizeInBytes > 0,
+        s"$this attempted an invalid 0-byte mmap of a file")
+      val path = id.getDiskPath(diskBlockManager)
+      val serializerManager = diskBlockManager.getSerializerManager()
+      val memBuffer = if (serializerManager.isRapidsSpill(id)) {
+        // Only go through serializerManager's stream wrapper for spill case
+          closeOnExcept(HostAlloc.alloc(uncompressedSize)) {
+            decompressed => GpuTaskMetrics.get.readSpillFromDiskTime {
+            withResource(FileChannel.open(path.toPath, StandardOpenOption.READ)) { c =>
+              c.position(fileOffset)
+              withResource(Channels.newInputStream(c)) { compressed =>
+                withResource(serializerManager.wrapStream(id, compressed)) { in =>
+                  withResource(new HostMemoryOutputStream(decompressed)) { out =>
+                    IOUtils.copy(in, out)
                   }
+                  decompressed
                 }
               }
             }
           }
-        } else {
-          // Reserved mmap read fashion for UCX shuffle path. Also it's skipping encryption and
-          // compression.
-          HostMemoryBuffer.mapFile(path, MapMode.READ_WRITE, fileOffset, onDiskSizeInBytes)
         }
-        hostBuffer = Some(memBuffer)
+      } else {
+        // Reserved mmap read fashion for UCX shuffle path. Also it's skipping encryption and
+        // compression.
+        HostMemoryBuffer.mapFile(path, MapMode.READ_WRITE, fileOffset, onDiskSizeInBytes)
       }
-      hostBuffer.foreach(_.incRefCount())
-      hostBuffer.get
+      memBuffer
     }
 
     override def close(): Unit = synchronized {
-      if (refcount == 1) {
-        // free the memory mapping since this is the last active reader
-        hostBuffer.foreach { b =>
-          logDebug(s"closing mmap buffer $b")
-          b.close()
-        }
-        hostBuffer = None
-      }
       super.close()
     }
 
     override protected def releaseResources(): Unit = {
-      require(hostBuffer.isEmpty,
-        "Releasing a disk buffer with non-empty host buffer")
       // Buffers that share paths must be cleaned up elsewhere
       if (id.canShareDiskPaths) {
         sharedBufferFiles.remove(id)