apache · edrevo · Sep 25, 2019 · Sep 27, 2019 · Nov 14, 2019 · Nov 14, 2019
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/WriteBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/WriteBuilder.java
@@ -55,6 +55,16 @@ default WriteBuilder withInputDataSchema(StructType schema) {
     return this;
   }
 
+  /**
+   * Passes the number of partitions of the input data from Spark to data source.
+   *
+   * @param numPartitions the number of partitions of the input data
+   * @return a builder that has taken the `numPartitions`. By default it returns `this`, which
+   *         means the given `numPartitions` is ignored. Please override this method to take the
+   *         `numPartitions`.
+   */
+  default WriteBuilder withNumPartitions(int numPartitions) {
+    return this;
+  }
+
   /**
    * Returns a {@link BatchWrite} to write data to batch source. By default this method throws
    * exception, data sources must overwrite this method to provide an implementation, if the

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala
@@ -98,6 +98,9 @@ class InMemoryTable(
 
     new WriteBuilder with SupportsTruncate with SupportsOverwrite with SupportsDynamicOverwrite {
       private var writer: BatchWrite = Append
+      private var numPartitionsProvided = false
+      private var queryIdProvided = false
+      private var inputDataSchemaProvided = false
 
       override def truncate(): WriteBuilder = {
         assert(writer == Append)
@@ -117,7 +120,36 @@ class InMemoryTable(
         this
       }
 
-      override def buildForBatch(): BatchWrite = writer
+      // Records that the number of partitions was passed to this builder and asserts that it is
+      // passed at most once. The value itself is not stored; buildForBatch only checks the flag.
+      override def withNumPartitions(numPartitions: Int): WriteBuilder = {
+        assert(!numPartitionsProvided, "numPartitions provided twice")
+        numPartitionsProvided = true
+        this
+      }
+
+      // Records that a query id was passed to this builder and asserts that it is passed at most
+      // once. The id itself is not stored; buildForBatch only checks the flag.
+      override def withQueryId(queryId: String): WriteBuilder = {
+        assert(!queryIdProvided, "queryId provided twice")
+        queryIdProvided = true
+        this
+      }
+
+      // Records that the input data schema was passed to this builder and asserts that it is
+      // passed at most once. The schema itself is not stored; buildForBatch only checks the flag.
+      override def withInputDataSchema(schema: StructType): WriteBuilder = {
+        // Guard on this method's own flag: checking queryIdProvided here would miss duplicate
+        // schema calls and would wrongly fail whenever the query id happened to be set first.
+        assert(!inputDataSchemaProvided, "schema provided twice")
+        inputDataSchemaProvided = true
+        this
+      }
+
+      // Returns the currently selected BatchWrite, after asserting that the input data schema,
+      // the query id and the number of partitions were all handed to this builder beforehand.
+      override def buildForBatch(): BatchWrite = {
+        assert(
+          inputDataSchemaProvided,
+          "Input data schema wasn't provided before calling buildForBatch")
+        assert(
+          queryIdProvided,
+          "Query id wasn't provided before calling buildForBatch")
+        assert(
+          numPartitionsProvided,
+          "Number of partitions wasn't provided before calling buildForBatch")
+        writer
+      }
     }
   }
 

diff --git a/...rc/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/...rc/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala
@@ -86,6 +86,7 @@ case class CreateTableAsSelectExec(
         case table: SupportsWrite =>
           val writeBuilder = table.newWriteBuilder(writeOptions)
             .withInputDataSchema(schema)
+            .withNumPartitions(rdd.getNumPartitions)
             .withQueryId(UUID.randomUUID().toString)
 
           writeBuilder match {
@@ -181,6 +182,7 @@ case class ReplaceTableAsSelectExec(
         case table: SupportsWrite =>
           val writeBuilder = table.newWriteBuilder(writeOptions)
             .withInputDataSchema(schema)
+            .withNumPartitions(rdd.getNumPartitions)
             .withQueryId(UUID.randomUUID().toString)
 
           writeBuilder match {
@@ -332,11 +334,13 @@ case class WriteToDataSourceV2Exec(
 trait BatchWriteHelper {
   def table: SupportsWrite
   def query: SparkPlan
+  // The RDD whose partition count is reported to the write builder.
+  def rdd: RDD[InternalRow]
   def writeOptions: CaseInsensitiveStringMap
 
+  // Creates a write builder from `table` with `writeOptions`, then hands it, in this order, the
+  // schema of `query`, the number of partitions of `rdd` and a freshly generated random UUID
+  // as the query id.
   def newWriteBuilder(): WriteBuilder = {
     table.newWriteBuilder(writeOptions)
       .withInputDataSchema(query.schema)
+      .withNumPartitions(rdd.getNumPartitions)
       .withQueryId(UUID.randomUUID().toString)
   }
 }
@@ -347,34 +351,38 @@ trait BatchWriteHelper {
 trait V2TableWriteExec extends UnaryExecNode {
   def query: SparkPlan
 
+  // The rows to write: the result of executing `query`, computed once on first access.
+  lazy val rdd: RDD[InternalRow] = {
+    val executed = query.execute()
+    if (executed.partitions.nonEmpty) {
+      executed
+    } else {
+      // SPARK-23271: a zero partition rdd would set up no write task at all, so substitute a
+      // dummy single partition rdd to make sure at least one task runs and writes the metadata.
+      sparkContext.parallelize(Array.empty[InternalRow], 1)
+    }
+  }
+
+  // Initially None. NOTE(review): the code that fills this in is not visible in this diff.
   var commitProgress: Option[StreamWriterCommitProgress] = None
 
+  // The query plan is the single child of this node; the node itself exposes no output
+  // attributes.
   override def child: SparkPlan = query
   override def output: Seq[Attribute] = Nil
 
   protected def writeWithV2(batchWrite: BatchWrite): RDD[InternalRow] = {
-    val writerFactory = batchWrite.createBatchWriterFactory()
     val useCommitCoordinator = batchWrite.useCommitCoordinator
-    val rdd = query.execute()
-    // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single
-    // partition rdd to make sure we at least set up one write task to write the metadata.
-    val rddWithNonEmptyPartitions = if (rdd.partitions.length == 0) {
-      sparkContext.parallelize(Array.empty[InternalRow], 1)
-    } else {
-      rdd
-    }
-    val messages = new Array[WriterCommitMessage](rddWithNonEmptyPartitions.partitions.length)
+    val messages = new Array[WriterCommitMessage](rdd.partitions.length)
     val totalNumRowsAccumulator = new LongAccumulator()
 
+    val writerFactory = batchWrite.createBatchWriterFactory()
+
     logInfo(s"Start processing data source write support: $batchWrite. " +
       s"The input RDD has ${messages.length} partitions.")
 
     try {
       sparkContext.runJob(
-        rddWithNonEmptyPartitions,
+        rdd,
         (context: TaskContext, iter: Iterator[InternalRow]) =>
           DataWritingSparkTask.run(writerFactory, context, iter, useCommitCoordinator),
-        rddWithNonEmptyPartitions.partitions.indices,
+        rdd.partitions.indices,
         (index, result: DataWritingSparkTaskResult) => {
           val commitMessage = result.writerCommitMessage
           messages(index) = commitMessage
@@ -480,6 +488,7 @@ private[v2] trait AtomicTableWriteExec extends V2TableWriteExec with SupportsV1W
         case table: SupportsWrite =>
           val writeBuilder = table.newWriteBuilder(writeOptions)
             .withInputDataSchema(query.schema)
+            .withNumPartitions(rdd.getNumPartitions)
             .withQueryId(UUID.randomUUID().toString)
 
           val writtenRows = writeBuilder match {