[SPARK-30314] Add identifier and catalog information to DataSourceV2Relation #26957

Closed · wants to merge 26 commits · showing changes from 14 commits
@@ -624,7 +624,7 @@ class KafkaRelationSuiteV2 extends KafkaRelationSuiteBase {
val topic = newTopic()
val df = createDF(topic)
assert(df.logicalPlan.collect {
case DataSourceV2Relation(_, _, _) => true
case _: DataSourceV2Relation => true
}.nonEmpty)
}
}
@@ -809,22 +809,27 @@ class Analyzer(
.getOrElse(i)

case desc @ DescribeTable(u: UnresolvedV2Relation, _) =>
CatalogV2Util.loadRelation(u.catalog, u.tableName)
.map(rel => desc.copy(table = rel))
.getOrElse(desc)
CatalogV2Util
.loadRelation(u.catalog, catalogManager.catalogIdentifier(u.catalog), u.tableName)
Contributor: Would it be easier to have a separate util method for UnresolvedV2Relation? The interesting thing about UnresolvedV2Relation is that you have access to the originalNameParts. If that doesn't contain the catalog identifier, then we shouldn't need to add it to the DataSourceV2Relation.

Contributor Author: Sure, I will do the refactoring.

However, in terms of using originalNameParts: I think the major reason we are adding the two additional fields to the DSv2 relation is that neither the CatalogPlugin nor the Table implementation includes the resolved unique identifier for the table. Even if originalNameParts doesn't contain a catalog name, it might point to some default catalog, in which case we would still want that catalog in the resolved unique identifier. Otherwise, we might end up with two V2Relations pointing to the same table but carrying different identifiers, which would be inconsistent and pretty confusing.

Contributor: I think it's fine to reproduce the full identifier that was actually loaded by adding in catalog and namespace if they are missing. But I'm not sure it is a good idea to rely on this behavior, since there is no guarantee.
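To make the concern above concrete, here is a minimal sketch, my own illustration rather than code from this PR; it assumes an active SparkSession named spark, a catalog registered as "testcat" that is also the current catalog, and a table ns.t inside it:

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation

// Pull the (catalog name, identifiers) recorded on the resolved relation, if any.
def resolvedIds(df: DataFrame) =
  df.queryExecution.analyzed.collectFirst {
    case r: DataSourceV2Relation => (r.catalogIdentifier, r.identifiers)
  }

val withoutCatalog = resolvedIds(spark.table("ns.t"))          // catalog name omitted
val withCatalog    = resolvedIds(spark.table("testcat.ns.t"))  // catalog name spelled out
// The invariant argued for above: both references resolve to the same table, so both
// relations should carry the same catalog identifier and table identifier.
assert(withoutCatalog == withCatalog)
```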

.map(rel => desc.copy(table = rel))
.getOrElse(desc)

case alter @ AlterTable(_, _, u: UnresolvedV2Relation, _) =>
CatalogV2Util.loadRelation(u.catalog, u.tableName)
.map(rel => alter.copy(table = rel))
.getOrElse(alter)
CatalogV2Util
.loadRelation(u.catalog, catalogManager.catalogIdentifier(u.catalog), u.tableName)
.map(rel => alter.copy(table = rel))
.getOrElse(alter)

case show @ ShowTableProperties(u: UnresolvedV2Relation, _) =>
CatalogV2Util.loadRelation(u.catalog, u.tableName)
CatalogV2Util
.loadRelation(u.catalog, catalogManager.catalogIdentifier(u.catalog), u.tableName)
.map(rel => show.copy(table = rel))
.getOrElse(show)

case u: UnresolvedV2Relation =>
CatalogV2Util.loadRelation(u.catalog, u.tableName).getOrElse(u)
CatalogV2Util
.loadRelation(u.catalog, catalogManager.catalogIdentifier(u.catalog), u.tableName)
.getOrElse(u)
}

/**
@@ -834,7 +839,11 @@
expandRelationName(identifier) match {
case NonSessionCatalogAndIdentifier(catalog, ident) =>
CatalogV2Util.loadTable(catalog, ident) match {
case Some(table) => Some(DataSourceV2Relation.create(table))
case Some(table) =>
Some(DataSourceV2Relation.create(
table,
catalogManager.catalogIdentifier(catalog),
Seq(ident)))
case None => None
}
case _ => None
@@ -915,7 +924,11 @@
AnalysisContext.get.relationCache.getOrElseUpdate(
key, v1SessionCatalog.getRelation(v1Table.v1Table))
case table =>
DataSourceV2Relation.create(table)
DataSourceV2Relation.create(
table,
catalogManager.catalogIdentifier(catalog),
Contributor: Why are we using "identifier" for the catalog name? Everywhere else we call it the catalog name, so I don't see a reason to make this more complicated.

Contributor Author: Sure. I was thinking about making the naming convention consistent, since it's called identifier for tables. I'm totally okay with CatalogName.

Seq(ident)
)
}
case _ => None
}
@@ -44,15 +44,35 @@ class CatalogManager(
import CatalogManager.SESSION_CATALOG_NAME

private val catalogs = mutable.HashMap.empty[String, CatalogPlugin]
// Map from catalog back to its original name for easy name lookup. We don't use the
// CatalogPlugin's name, as it might be different from the catalog name depending on the
// implementation.
private val catalogIdentifiers = mutable.HashMap.empty[CatalogPlugin, String]
Contributor Author: Do we need to handle v1SessionCatalog passed by the constructor here too?

Contributor: No, that catalog doesn't have a name.

Contributor: I'd add a comment here stating that we will have a new instance of a catalog for each catalog name, therefore this reverse map works as intended. We should also add a test for it.

Contributor: Why is the reverse map needed? Can't we just call CatalogPlugin.name?

Contributor Author: According to my understanding, CatalogPlugin.name depends on the underlying implementation, so it might not be the actual identifier name for the catalog.

Contributor: The plugin name is intended to prevent needing to do this. While we do rely on the catalog not to report the wrong name, I think it is reasonable to use it. I'm not strongly against this, though. If you think this is cleaner, we can do that.
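As a side note on the trade-off discussed here, a small sketch of the case the reverse map guards against; the class below is hypothetical and only illustrates a plugin whose name() does not echo the name it was registered under:

```scala
import org.apache.spark.sql.connector.catalog.CatalogPlugin
import org.apache.spark.sql.util.CaseInsensitiveStringMap

// Hypothetical plugin that ignores the name passed to initialize().
class HardCodedNameCatalog extends CatalogPlugin {
  override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = ()
  override def name(): String = "prod" // implementation-defined, not necessarily the registered name
}

// If this class is registered as spark.sql.catalog.dev, then:
//   catalogManager.catalog("dev").name()          // "prod"      -- unreliable for lookups
//   catalogManager.catalogIdentifier(thatPlugin)  // Some("dev") -- the registered name
```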

catalogIdentifiers(defaultSessionCatalog) = SESSION_CATALOG_NAME
Contributor: Maybe you should do this as part of loadV2SessionCatalog?

Contributor Author: No, the concern here is that if a user sets a custom session catalog implementation and Spark fails to load it, it falls back to using defaultSessionCatalog (https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala#L88), so we need to register defaultSessionCatalog in the reverse map here.


def catalog(name: String): CatalogPlugin = synchronized {
if (name.equalsIgnoreCase(SESSION_CATALOG_NAME)) {
v2SessionCatalog
} else {
catalogs.getOrElseUpdate(name, Catalogs.load(name, conf))
catalogs.getOrElseUpdate(name, {
val catalog = Catalogs.load(name, conf)
catalogIdentifiers(catalog) = name
catalog
})
}
}

/**
* Returns the identifier string for the given catalog
*
* @param catalog catalog to look up
* @return string identifier for the given catalog, or None if the catalog hasn't been
*         registered
*/
def catalogIdentifier(catalog: CatalogPlugin): Option[String] = synchronized {
catalogIdentifiers.get(catalog)
}
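A quick usage sketch of the new lookup (the catalog name here is assumed, not part of the PR):

```scala
// For a catalog registered under spark.sql.catalog.testcat:
val testCat = catalogManager.catalog("testcat")
catalogManager.catalogIdentifier(testCat)             // Some("testcat")

// For a CatalogPlugin instance this manager never loaded, the lookup simply misses:
catalogManager.catalogIdentifier(unregisteredPlugin)  // None
```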

def isCatalogRegistered(name: String): Boolean = {
try {
catalog(name)
@@ -83,7 +103,11 @@
private[sql] def v2SessionCatalog: CatalogPlugin = {
conf.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION).map { customV2SessionCatalog =>
try {
catalogs.getOrElseUpdate(SESSION_CATALOG_NAME, loadV2SessionCatalog())
catalogs.getOrElseUpdate(SESSION_CATALOG_NAME, {
val catalog = loadV2SessionCatalog()
catalogIdentifiers(catalog) = SESSION_CATALOG_NAME
catalog
})
} catch {
case NonFatal(_) =>
logError(
@@ -256,8 +256,11 @@ private[sql] object CatalogV2Util {
case _: NoSuchNamespaceException => None
}

def loadRelation(catalog: CatalogPlugin, ident: Identifier): Option[NamedRelation] = {
loadTable(catalog, ident).map(DataSourceV2Relation.create)
def loadRelation(
catalog: CatalogPlugin,
catalogIdentifier: Option[String],
ident: Identifier): Option[NamedRelation] = {
loadTable(catalog, ident).map(DataSourceV2Relation.create(_, catalogIdentifier, Seq(ident)))
}
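For reference, a hedged sketch of how a caller is expected to thread the catalog name through the new signature; the catalog, manager, and identifier values are placeholders:

```scala
import org.apache.spark.sql.connector.catalog.Identifier

// Hypothetical call site: `catalog` came from `catalogManager`, so its registered name
// can be looked up and recorded on the resulting relation.
val maybeRelation = CatalogV2Util.loadRelation(
  catalog,                                    // CatalogPlugin that loads the table
  catalogManager.catalogIdentifier(catalog),  // Option[String]: its registered name
  Identifier.of(Array("ns"), "t"))            // table identifier within that catalog
// => Some(DataSourceV2Relation) with catalogIdentifier set to the registered name and
//    identifiers = Seq(the identifier above), or None if the table doesn't exist
```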

def isSessionCatalog(catalog: CatalogPlugin): Boolean = {
@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelat
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.connector.catalog.{Table, TableCapability}
import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, Table, TableCapability}
import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, Statistics => V2Statistics, SupportsReportStatistics}
import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream}
import org.apache.spark.sql.connector.write.WriteBuilder
@@ -32,12 +32,18 @@ import org.apache.spark.util.Utils
* A logical plan representing a data source v2 table.
*
* @param table The table that this relation represents.
* @param output the output attributes of this relation
* @param catalogIdentifier the string identifier for the catalog
* @param identifiers the identifiers for the v2 relation. For a multipath dataframe, there
*                    could be more than one identifier
Contributor: or none if a V2 relation is instantiated using options.

* @param options The options for this table operation. It's used to create fresh [[ScanBuilder]]
* and [[WriteBuilder]].
*/
case class DataSourceV2Relation(
table: Table,
output: Seq[AttributeReference],
catalogIdentifier: Option[String],
identifiers: Seq[Identifier],
Contributor: I don't think it is a good idea to have multiple identifiers here. DSv2 doesn't yet cover how file-based tables should work, and I think we need a design document for them. Adding multiple identifiers here in support of something that has undefined behavior seems premature.

Design and behavior of path-based identifiers aside, a table should use one and only one identifier. When path-based tables are supported, I expect them to use a single Identifier with possibly more than one path embedded in it, like we do with the paths key.

Contributor Author: I see. If the specification is that there should be one and only one identifier, shall I just define it as identifier: Identifier instead of identifier: Option[Identifier]? The tricky part is still load(paths: String*); I might need to use some placeholder or null if we choose not to use Option. What do you guys think? cc @brkyvz

Contributor: Two things:

  1. There should be only one identifier for a table, because it is what identifies the table.
  2. Let's use null for load(paths: String*), because path-based tables are not designed or supported in v2.
options: CaseInsensitiveStringMap)
extends LeafNode with MultiInstanceRelation with NamedRelation {

@@ -137,12 +143,20 @@ case class StreamingDataSourceV2Relation(
}

object DataSourceV2Relation {
def create(table: Table, options: CaseInsensitiveStringMap): DataSourceV2Relation = {
def create(
table: Table,
catalogIdentifier: Option[String],
identifiers: Seq[Identifier],
options: CaseInsensitiveStringMap): DataSourceV2Relation = {
val output = table.schema().toAttributes
DataSourceV2Relation(table, output, options)
DataSourceV2Relation(table, output, catalogIdentifier, identifiers, options)
}

def create(table: Table): DataSourceV2Relation = create(table, CaseInsensitiveStringMap.empty)
def create(
table: Table,
catalogIdentifier: Option[String],
identifiers: Seq[Identifier]): DataSourceV2Relation =
create(table, catalogIdentifier, identifiers, CaseInsensitiveStringMap.empty)
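Tying the identifier discussion above to these factory methods, a minimal sketch of the two creation paths as they stand in this revision; `table` is assumed to be some Table instance and "testcat" is a made-up catalog name:

```scala
import org.apache.spark.sql.connector.catalog.Identifier
import org.apache.spark.sql.util.CaseInsensitiveStringMap

// Catalog-resolved table: both the catalog name and the identifier are known.
val fromCatalog = DataSourceV2Relation.create(
  table, Some("testcat"), Seq(Identifier.of(Array("ns"), "t")))

// spark.read.load(paths: _*): no catalog and, as of this revision, no identifier either.
val fromPaths = DataSourceV2Relation.create(
  table, None, Nil, CaseInsensitiveStringMap.empty)
```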

/**
* This is used to transform data source v2 statistics to logical.Statistics.
@@ -110,6 +110,26 @@ class CatalogManagerSuite extends SparkFunSuite {
catalogManager.setCurrentNamespace(Array("test2"))
assert(v1SessionCatalog.getCurrentDatabase == "default")
}

test("Catalog manager should be able to return the string identifier for registered catalog") {
val conf = new SQLConf
conf.setConfString("spark.sql.catalog.dummy", classOf[DummyCatalog].getName)
conf.setConfString(s"spark.sql.catalog.${CatalogManager.SESSION_CATALOG_NAME}",
classOf[DummyCatalog].getName)
val catalogManager = new CatalogManager(conf, FakeV2SessionCatalog, createSessionCatalog(conf))

assert(catalogManager.catalogIdentifier(catalogManager.catalog("dummy")) == Some("dummy"))
assert(catalogManager.catalogIdentifier(catalogManager.v2SessionCatalog) ==
Some(CatalogManager.SESSION_CATALOG_NAME))
}

test("Catalog manager should be able to return the string identifier for default catalog " +
"if no custom session catalog is provided") {
val conf = new SQLConf
val catalogManager = new CatalogManager(conf, FakeV2SessionCatalog, createSessionCatalog(conf))
assert(catalogManager.catalogIdentifier(catalogManager.v2SessionCatalog) ==
Some(CatalogManager.SESSION_CATALOG_NAME))
}
}

class DummyCatalog extends SupportsNamespaces {
@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.connector.catalog

import org.mockito.Mockito.{mock, when}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.types.StructType

class CatalogV2UtilSuite extends SparkFunSuite {
test("Load relation should encode the identifiers for V2Relations") {
val testCatalog = mock(classOf[TableCatalog])
val ident = mock(classOf[Identifier])
val table = mock(classOf[Table])
when(table.schema()).thenReturn(mock(classOf[StructType]))
when(testCatalog.loadTable(ident)).thenReturn(table)
val r = CatalogV2Util.loadRelation(testCatalog, Some("dummy"), ident)
assert(r.isDefined)
assert(r.get.isInstanceOf[DataSourceV2Relation])
val v2Relation = r.get.asInstanceOf[DataSourceV2Relation]
assert(v2Relation.catalogIdentifier == Some("dummy"))
assert(v2Relation.identifiers == Seq(ident))
}
}
@@ -195,6 +195,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}

DataSource.lookupDataSourceV2(source, sparkSession.sessionState.conf).map { provider =>
val catalogManager = sparkSession.sessionState.catalogManager
val sessionOptions = DataSourceV2Utils.extractSessionConfigs(
source = provider, conf = sparkSession.sessionState.conf)
val pathsOption = if (paths.isEmpty) {
@@ -206,27 +207,29 @@

val finalOptions = sessionOptions ++ extraOptions.toMap ++ pathsOption
val dsOptions = new CaseInsensitiveStringMap(finalOptions.asJava)
val table = provider match {
val (table, catalogOpt, ident) = provider match {
case _: SupportsCatalogOptions if userSpecifiedSchema.nonEmpty =>
throw new IllegalArgumentException(
s"$source does not support user specified schema. Please don't specify the schema.")
case hasCatalog: SupportsCatalogOptions =>
val ident = hasCatalog.extractIdentifier(dsOptions)
val catalog = CatalogV2Util.getTableProviderCatalog(
hasCatalog,
sparkSession.sessionState.catalogManager,
catalogManager,
dsOptions)
catalog.loadTable(ident)
(catalog.loadTable(ident), catalogManager.catalogIdentifier(catalog), Seq(ident))
case _ =>
userSpecifiedSchema match {
case Some(schema) => provider.getTable(dsOptions, schema)
case _ => provider.getTable(dsOptions)
case Some(schema) => (provider.getTable(dsOptions, schema), None, Nil)
case _ => (provider.getTable(dsOptions), None, Nil)
Contributor Author: There is no catalog in this case; do we need to care what the table identifier is? I.e. here, it looks like the paths are the identifiers? @brkyvz

Contributor: They will be in the future, not yet though. This is fine for now.

}
}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
table match {
case _: SupportsRead if table.supports(BATCH_READ) =>
Dataset.ofRows(sparkSession, DataSourceV2Relation.create(table, dsOptions))
Dataset.ofRows(
sparkSession,
DataSourceV2Relation.create(table, catalogOpt, ident, dsOptions))

case _ => loadV1Source(paths: _*)
}
sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala (18 additions & 11 deletions)
@@ -258,20 +258,20 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
val dsOptions = new CaseInsensitiveStringMap(options.asJava)

import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
val catalogManager = df.sparkSession.sessionState.catalogManager
mode match {
case SaveMode.Append | SaveMode.Overwrite =>
val table = provider match {
val (table, catalogOpt, ident) = provider match {
case supportsExtract: SupportsCatalogOptions =>
val ident = supportsExtract.extractIdentifier(dsOptions)
val sessionState = df.sparkSession.sessionState
val catalog = CatalogV2Util.getTableProviderCatalog(
supportsExtract, sessionState.catalogManager, dsOptions)
supportsExtract, catalogManager, dsOptions)

catalog.loadTable(ident)
(catalog.loadTable(ident), catalogManager.catalogIdentifier(catalog), Seq(ident))
case tableProvider: TableProvider =>
val t = tableProvider.getTable(dsOptions)
if (t.supports(BATCH_WRITE)) {
t
(t, None, Nil)
} else {
// Streaming also uses the data source V2 API. So it may be that the data source
// implements v2, but has no v2 implementation for batch writes. In that case, we
@@ -280,7 +280,7 @@
}
}

val relation = DataSourceV2Relation.create(table, dsOptions)
val relation = DataSourceV2Relation.create(table, catalogOpt, ident, dsOptions)
checkPartitioningMatchesV2Table(table)
if (mode == SaveMode.Append) {
runCommand(df.sparkSession, "save") {
@@ -299,9 +299,8 @@
provider match {
case supportsExtract: SupportsCatalogOptions =>
val ident = supportsExtract.extractIdentifier(dsOptions)
val sessionState = df.sparkSession.sessionState
val catalog = CatalogV2Util.getTableProviderCatalog(
supportsExtract, sessionState.catalogManager, dsOptions)
supportsExtract, catalogManager, dsOptions)

val location = Option(dsOptions.get("path")).map(TableCatalog.PROP_LOCATION -> _)

@@ -413,13 +412,14 @@
}

private def insertInto(catalog: CatalogPlugin, ident: Identifier): Unit = {
import df.sparkSession.sessionState.analyzer.catalogManager
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._

val table = catalog.asTableCatalog.loadTable(ident) match {
case _: V1Table =>
return insertInto(TableIdentifier(ident.name(), ident.namespace().headOption))
case t =>
DataSourceV2Relation.create(t)
DataSourceV2Relation.create(t, catalogManager.catalogIdentifier(catalog), Seq(ident))
}

val command = mode match {
@@ -544,6 +544,8 @@


private def saveAsTable(catalog: TableCatalog, ident: Identifier): Unit = {
import df.sparkSession.sessionState.analyzer.catalogManager

val tableOpt = try Option(catalog.loadTable(ident)) catch {
case _: NoSuchTableException => None
}
@@ -554,12 +556,17 @@
}

val command = (mode, tableOpt) match {
case (_, Some(table: V1Table)) =>
case (_, Some(_: V1Table)) =>
return saveAsTable(TableIdentifier(ident.name(), ident.namespace().headOption))

case (SaveMode.Append, Some(table)) =>
checkPartitioningMatchesV2Table(table)
AppendData.byName(DataSourceV2Relation.create(table), df.logicalPlan, extraOptions.toMap)
AppendData.byName(
DataSourceV2Relation.create(
table,
catalogManager.catalogIdentifier(catalog),
Seq(ident)),
df.logicalPlan, extraOptions.toMap)

case (SaveMode.Overwrite, _) =>
Contributor Author: A little curious: why doesn't Overwrite need to create a DataSourceV2Relation?

Contributor: It drops and recreates a table. It's a DDL operation instead of DML.

Contributor Author: Still probably a dumb question: why does DDL vs. DML affect how we generate the query plan? I'm asking because in the save() function of DataFrameWriter we do generate a DataSourceV2Relation for Overwrite mode, so I'm curious why there is such a difference here.

Contributor: In save, we don't update table information in a catalog. saveAsTable updates the catalog and is therefore doing different work, so we need a separate operation.
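To restate that distinction in code form, a simplified sketch based on the diff below, with the ReplaceTableAsSelect arguments abbreviated rather than spelled out:

```scala
// DML: append writes into an existing table, so the plan carries the resolved relation,
// now including the catalog name and identifier added by this PR.
AppendData.byName(
  DataSourceV2Relation.create(table, catalogManager.catalogIdentifier(catalog), Seq(ident)),
  df.logicalPlan, extraOptions.toMap)

// DDL: SaveMode.Overwrite in saveAsTable (re)creates the table through the catalog, so the
// plan references the catalog and identifier directly instead of a relation node:
// ReplaceTableAsSelect(catalog, ident, /* partitioning, query, properties, ... */ orCreate = true)
```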

ReplaceTableAsSelect(