apache · brkyvz · Dec 16, 2019 · Dec 16, 2019 · Dec 16, 2019 · Dec 17, 2019
diff --git a/...catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsCatalogOptions.java b/...catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsCatalogOptions.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.catalog;
+
+import org.apache.spark.annotation.Evolving;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * An interface, which TableProviders can implement, to support table existence checks and creation
+ * through a catalog, without having to use table identifiers. For example, when file based data
+ * sources use the `DataFrameWriter.save(path)` method, the option `path` can translate to a
+ * PathIdentifier. A catalog can then use this PathIdentifier to check the existence of a table, or
+ * whether a table can be created at a given directory.
+ */
+@Evolving
+public interface SupportsCatalogOptions extends TableProvider {
+  /**
+   * Return a {@link Identifier} instance that can identify a table for a DataSource given
+   * DataFrame[Reader|Writer] options.
+   *
+   * @param options the user-specified options that can identify a table, e.g. file path, Kafka
+   *                topic name, etc. It's an immutable case-insensitive string-to-string map.
+   */
+  Identifier extractIdentifier(CaseInsensitiveStringMap options);
+
+  /**
+   * Return the name of a catalog that can be used to check the existence of, load, and create
+   * a table for this DataSource given the identifier that will be extracted by
+   * {@link #extractIdentifier(CaseInsensitiveStringMap) extractIdentifier}. A `null` value can
+   * be used to defer to the V2SessionCatalog.
+   *
+   * @param options the user-specified options that can identify a table, e.g. file path, Kafka
+   *                topic name, etc. It's an immutable case-insensitive string-to-string map.
+   */
+  default String extractCatalog(CaseInsensitiveStringMap options) {
+    return null;
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, InsertIntoStatement, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect}
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
-import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, SupportsWrite, TableCatalog, TableProvider, V1Table}
+import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Catalogs, Identifier, SupportsCatalogOptions, SupportsWrite, Table, TableCatalog, TableProvider, V1Table}
 import org.apache.spark.sql.connector.catalog.TableCapability._
 import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, LiteralValue, Transform}
 import org.apache.spark.sql.execution.SQLExecution
@@ -260,24 +260,44 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
       provider.getTable(dsOptions) match {
         case table: SupportsWrite if table.supports(BATCH_WRITE) =>
-          if (partitioningColumns.nonEmpty) {
-            throw new AnalysisException("Cannot write data to TableProvider implementation " +
-              "if partition columns are specified.")
-          }
           lazy val relation = DataSourceV2Relation.create(table, dsOptions)
           mode match {
             case SaveMode.Append =>
+              verifyV2Partitioning(table)
               runCommand(df.sparkSession, "save") {
                 AppendData.byName(relation, df.logicalPlan, extraOptions.toMap)
               }
 
             case SaveMode.Overwrite if table.supportsAny(TRUNCATE, OVERWRITE_BY_FILTER) =>
+              verifyV2Partitioning(table)
               // truncate the table
               runCommand(df.sparkSession, "save") {
                 OverwriteByExpression.byName(
                   relation, df.logicalPlan, Literal(true), extraOptions.toMap)
               }
 
+            case other if classOf[SupportsCatalogOptions].isAssignableFrom(provider.getClass) =>
+              val catalogOptions = provider.asInstanceOf[SupportsCatalogOptions]
+              val ident = catalogOptions.extractIdentifier(dsOptions)
+              val sessionState = df.sparkSession.sessionState
+              val catalog = Option(catalogOptions.extractCatalog(dsOptions))
+                .map(Catalogs.load(_, sessionState.conf))
+                .getOrElse(sessionState.catalogManager.v2SessionCatalog)
+                .asInstanceOf[TableCatalog]
+
+              val location = Option(dsOptions.get("path")).map(TableCatalog.PROP_LOCATION -> _)
+
+              runCommand(df.sparkSession, "save") {
+                CreateTableAsSelect(
+                  catalog,
+                  ident,
+                  getV2Transforms,
+                  df.queryExecution.analyzed,
+                  Map(TableCatalog.PROP_PROVIDER -> source) ++ location,
+                  extraOptions.toMap,
+                  ignoreIfExists = other == SaveMode.Ignore)
+              }
+
             case other =>
               throw new AnalysisException(s"TableProvider implementation $source cannot be " +
                 s"written with $other mode, please use Append or Overwrite " +
@@ -504,14 +524,6 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
 
 
   private def saveAsTable(catalog: TableCatalog, ident: Identifier): Unit = {
-    val partitioning = partitioningColumns.map { colNames =>
-      colNames.map(name => IdentityTransform(FieldReference(name)))
-    }.getOrElse(Seq.empty[Transform])
-    val bucketing = bucketColumnNames.map { cols =>
-      Seq(BucketTransform(LiteralValue(numBuckets.get, IntegerType), cols.map(FieldReference(_))))
-    }.getOrElse(Seq.empty[Transform])
-    val partitionTransforms = partitioning ++ bucketing
-
     val tableOpt = try Option(catalog.loadTable(ident)) catch {
       case _: NoSuchTableException => None
     }
@@ -526,13 +538,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
         return saveAsTable(TableIdentifier(ident.name(), ident.namespace().headOption))
 
       case (SaveMode.Append, Some(table)) =>
+        verifyV2Partitioning(table)
         AppendData.byName(DataSourceV2Relation.create(table), df.logicalPlan, extraOptions.toMap)
 
       case (SaveMode.Overwrite, _) =>
         ReplaceTableAsSelect(
           catalog,
           ident,
-          partitionTransforms,
+          getV2Transforms,
           df.queryExecution.analyzed,
           Map(TableCatalog.PROP_PROVIDER -> source) ++ getLocationIfExists,
           extraOptions.toMap,
@@ -545,7 +558,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
         CreateTableAsSelect(
           catalog,
           ident,
-          partitionTransforms,
+          getV2Transforms,
           df.queryExecution.analyzed,
           Map(TableCatalog.PROP_PROVIDER -> source) ++ getLocationIfExists,
           extraOptions.toMap,
@@ -623,6 +636,30 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       CreateTable(tableDesc, mode, Some(df.logicalPlan)))
   }
 
+  /** Converts the provided partitioning and bucketing information to DataSourceV2 Transforms. */
+  private def getV2Transforms: Seq[Transform] = {
+    val partitioning = partitioningColumns.map { colNames =>
+      colNames.map(name => IdentityTransform(FieldReference(name)))
+    }.getOrElse(Seq.empty[Transform])
+    val bucketing = bucketColumnNames.map { cols =>
+      Seq(BucketTransform(LiteralValue(numBuckets.get, IntegerType), cols.map(FieldReference(_))))
+    }.getOrElse(Seq.empty[Transform])
+    partitioning ++ bucketing
+  }
+
+  /**
+   * For V2 DataSources, performs if the provided partitioning matches that of the table.
+   * Partitioning information is not required when appending data to V2 tables.
+   */
+  private def verifyV2Partitioning(existingTable: Table): Unit = {
+    val v2Partitions = getV2Transforms
+    if (v2Partitions.isEmpty) return
+    require(v2Partitions.sameElements(existingTable.partitioning()),
+      "The provided partitioning does not match of the table.\n" +
+      s" - provided: ${v2Partitions.mkString(", ")}\n" +
+      s" - table: ${existingTable.partitioning().mkString(", ")}")
+  }
+
   /**
    * Saves the content of the `DataFrame` to an external database table via JDBC. In the case the
    * table already exists in the external database, behavior of this function depends on the