apache · yhuai · Nov 12, 2015 · Nov 12, 2015 · Nov 12, 2015 · Nov 13, 2015
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -75,10 +75,11 @@ private[sql] object PartitioningUtils {
   private[sql] def parsePartitions(
       paths: Seq[Path],
       defaultPartitionName: String,
-      typeInference: Boolean): PartitionSpec = {
+      typeInference: Boolean,
+      rootPaths: Set[Path]): PartitionSpec = {
     // First, we need to parse every partition's path and see if we can find partition values.
     val (partitionValues, optBasePaths) = paths.map { path =>
-      parsePartition(path, defaultPartitionName, typeInference)
+      parsePartition(path, defaultPartitionName, typeInference, rootPaths)
     }.unzip
 
     // We create pairs of (path -> path's partition value) here
@@ -152,11 +153,14 @@ private[sql] object PartitioningUtils {
   private[sql] def parsePartition(
       path: Path,
       defaultPartitionName: String,
-      typeInference: Boolean): (Option[PartitionValues], Option[Path]) = {
+      typeInference: Boolean,
+      rootPaths: Set[Path]): (Option[PartitionValues], Option[Path]) = {
     val columns = ArrayBuffer.empty[(String, Literal)]
     // Old Hadoop versions don't have `Path.isRoot`
     var finished = path.getParent == null
+    // chopped path is the current path that we will use to parse partition column value.
     var chopped = path
+    // base path will be the child of chopped in the loop below.
     var basePath = path
 
     while (!finished) {
@@ -166,11 +170,37 @@ private[sql] object PartitioningUtils {
         return (None, None)
       }
 
+      // Let's say chopped is a path of /table/a=1/, chopped.getName will give us a=1.
+      // Once we get the string, we try to parse it and find the partition column and value.
       val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName, typeInference)
-      maybeColumn.foreach(columns += _)
+
+      // Now, basePath will be /table/a=1/
       basePath = chopped
+      // chopped will be /table/
       chopped = chopped.getParent
-      finished = (maybeColumn.isEmpty && !columns.isEmpty) || chopped.getParent == null
+
+      // Now, we determine if we should continue.
+      // When we hit any of the following three cases, we will not continue:
+      //  - In this iteration, we could not parse a partition column and its value,
+      //    i.e. maybeColumn is None, and columns is not empty. We check that columns is
+      //    non-empty to handle cases like /table/a=1/_temporary/something (we need to find
+      //    a=1 in this case).
+      //  - After we get the new chopped, this new chopped is the root of the file system,
+      //    i.e. chopped.getParent == null.
+      //  - The chopped we used to parse the partition column and value (right now, it is
+      //    basePath) is already the root path of a table. For the example of /table/a=1/,
+      //    /table/ is the root path.
+      finished =
+        (maybeColumn.isEmpty && !columns.isEmpty) ||
+          chopped.getParent == null ||
+          rootPaths.contains(basePath)
+
+      if (maybeColumn.isDefined && !rootPaths.contains(basePath)) {
+        // If we can parse the partition column and its value, and the path
+        // we used for parsing is not the root path, we should append the parsed
+        // result to columns.
+        maybeColumn.foreach(columns += _)
+      }
     }
 
     if (columns.isEmpty) {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -519,7 +519,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   }
 
   /**
-   * Base paths of this relation.  For partitioned relations, it should be either root directories
+   * Base paths of this relation.  For partitioned relations, these should be the root directories
    * of all partition directories.
    *
    * @since 1.4.0
@@ -554,12 +554,19 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   }
 
   private def discoverPartitions(): PartitionSpec = {
+    val rootDirs = paths.map { path =>
+      new Path(path)
+    }.toSet
+
     // We use leaf dirs containing data files to discover the schema.
     val leafDirs = fileStatusCache.leafDirToChildrenFiles.keys.toSeq
     userDefinedPartitionColumns match {
       case Some(userProvidedSchema) if userProvidedSchema.nonEmpty =>
         val spec = PartitioningUtils.parsePartitions(
-          leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME, typeInference = false)
+          leafDirs,
+          PartitioningUtils.DEFAULT_PARTITION_NAME,
+          typeInference = false,
+          rootPaths = rootDirs)
 
         // Without auto inference, all of value in the `row` should be null or in StringType,
         // we need to cast into the data type that user specified.
@@ -577,8 +584,11 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
 
       case _ =>
         // user did not provide a partitioning schema
-        PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME,
-          typeInference = sqlContext.conf.partitionColumnTypeInferenceEnabled())
+        PartitioningUtils.parsePartitions(
+          leafDirs,
+          PartitioningUtils.DEFAULT_PARTITION_NAME,
+          typeInference = sqlContext.conf.partitionColumnTypeInferenceEnabled(),
+          rootPaths = rootDirs)
     }
   }
 

diff --git a/...a/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/...a/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
@@ -66,7 +66,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       "hdfs://host:9000/path/a=10.5/b=hello")
 
     var exception = intercept[AssertionError] {
-      parsePartitions(paths.map(new Path(_)), defaultPartitionName, true)
+      parsePartitions(paths.map(new Path(_)), defaultPartitionName, true, Set.empty[Path])
     }
     assert(exception.getMessage().contains("Conflicting directory structures detected"))
 
@@ -76,7 +76,37 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       "hdfs://host:9000/path/a=10/b=20",
       "hdfs://host:9000/path/_temporary/path")
 
-    parsePartitions(paths.map(new Path(_)), defaultPartitionName, true)
+    parsePartitions(
+      paths.map(new Path(_)),
+      defaultPartitionName,
+      true,
+      Set(new Path("hdfs://host:9000/path/")))
+
+    // Valid
+    paths = Seq(
+      "hdfs://host:9000/path/something=true/table/",
+      "hdfs://host:9000/path/something=true/table/_temporary",
+      "hdfs://host:9000/path/something=true/table/a=10/b=20",
+      "hdfs://host:9000/path/something=true/table/_temporary/path")
+
+    parsePartitions(
+      paths.map(new Path(_)),
+      defaultPartitionName,
+      true,
+      Set(new Path("hdfs://host:9000/path/something=true/table")))
+
+    // Valid
+    paths = Seq(
+      "hdfs://host:9000/path/table=true/",
+      "hdfs://host:9000/path/table=true/_temporary",
+      "hdfs://host:9000/path/table=true/a=10/b=20",
+      "hdfs://host:9000/path/table=true/_temporary/path")
+
+    parsePartitions(
+      paths.map(new Path(_)),
+      defaultPartitionName,
+      true,
+      Set(new Path("hdfs://host:9000/path/table=true")))
 
     // Invalid
     paths = Seq(
@@ -85,7 +115,11 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       "hdfs://host:9000/path/path1")
 
     exception = intercept[AssertionError] {
-      parsePartitions(paths.map(new Path(_)), defaultPartitionName, true)
+      parsePartitions(
+        paths.map(new Path(_)),
+        defaultPartitionName,
+        true,
+        Set(new Path("hdfs://host:9000/path/")))
     }
     assert(exception.getMessage().contains("Conflicting directory structures detected"))
 
@@ -101,19 +135,24 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
       "hdfs://host:9000/tmp/tables/nonPartitionedTable2")
 
     exception = intercept[AssertionError] {
-      parsePartitions(paths.map(new Path(_)), defaultPartitionName, true)
+      parsePartitions(
+        paths.map(new Path(_)),
+        defaultPartitionName,
+        true,
+        Set(new Path("hdfs://host:9000/tmp/tables/")))
     }
     assert(exception.getMessage().contains("Conflicting directory structures detected"))
   }
 
   test("parse partition") {
     def check(path: String, expected: Option[PartitionValues]): Unit = {
-      assert(expected === parsePartition(new Path(path), defaultPartitionName, true)._1)
+      val actual = parsePartition(new Path(path), defaultPartitionName, true, Set.empty[Path])._1
+      assert(expected === actual)
     }
 
     def checkThrows[T <: Throwable: Manifest](path: String, expected: String): Unit = {
       val message = intercept[T] {
-        parsePartition(new Path(path), defaultPartitionName, true)
+        parsePartition(new Path(path), defaultPartitionName, true, Set.empty[Path])
       }.getMessage
 
       assert(message.contains(expected))
@@ -152,8 +191,17 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
   }
 
   test("parse partitions") {
-    def check(paths: Seq[String], spec: PartitionSpec): Unit = {
-      assert(parsePartitions(paths.map(new Path(_)), defaultPartitionName, true) === spec)
+    def check(
+        paths: Seq[String],
+        spec: PartitionSpec,
+        rootPaths: Set[Path] = Set.empty[Path]): Unit = {
+      val actualSpec =
+        parsePartitions(
+          paths.map(new Path(_)),
+          defaultPartitionName,
+          true,
+          rootPaths)
+      assert(actualSpec === spec)
     }
 
     check(Seq(
@@ -232,7 +280,9 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
 
   test("parse partitions with type inference disabled") {
     def check(paths: Seq[String], spec: PartitionSpec): Unit = {
-      assert(parsePartitions(paths.map(new Path(_)), defaultPartitionName, false) === spec)
+      val actualSpec =
+        parsePartitions(paths.map(new Path(_)), defaultPartitionName, false, Set.empty[Path])
+      assert(actualSpec === spec)
     }
 
     check(Seq(
@@ -590,6 +640,40 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
     }
   }
 
+  test("SPARK-11678: Partition discovery stops at the root path of the dataset") {
+    // Case 1: the table's root directory is itself named like a partition directory
+    // ("key=value"). The table is partitioned by b, c and d only, and the data loaded
+    // from the table root must be equal to df.
+    withTempPath { dir =>
+      val tablePath = new File(dir, "key=value")
+      val df = (1 to 3).map(i => (i, i, i, i)).toDF("a", "b", "c", "d")
+
+      df.write
+        .format("parquet")
+        .partitionBy("b", "c", "d")
+        .save(tablePath.getCanonicalPath)
+
+      // Add a _SUCCESS marker at the table root and a hidden directory inside one of the
+      // partition directories. Both must live under tablePath (the directory that is
+      // loaded below); otherwise they are never visited by partition discovery.
+      // NOTE(review): presumably Guava's Files; createParentDirs only creates the parent
+      // chain (".foo"), not "bar" itself -- confirm.
+      Files.touch(new File(s"${tablePath.getCanonicalPath}/", "_SUCCESS"))
+      Files.createParentDirs(new File(s"${tablePath.getCanonicalPath}/b=1/c=1/.foo/bar"))
+
+      checkAnswer(sqlContext.read.format("parquet").load(tablePath.getCanonicalPath), df)
+    }
+
+    // Case 2: an ancestor of the table's root directory is named like a partition
+    // directory ("key=value/table"). Again the data loaded from the table root must be
+    // equal to df.
+    withTempPath { dir =>
+      val path = new File(dir, "key=value")
+      val tablePath = new File(path, "table")
+
+      val df = (1 to 3).map(i => (i, i, i, i)).toDF("a", "b", "c", "d")
+
+      df.write
+        .format("parquet")
+        .partitionBy("b", "c", "d")
+        .save(tablePath.getCanonicalPath)
+
+      // Same extra entries as in case 1, created inside the table that is loaded.
+      Files.touch(new File(s"${tablePath.getCanonicalPath}/", "_SUCCESS"))
+      Files.createParentDirs(new File(s"${tablePath.getCanonicalPath}/b=1/c=1/.foo/bar"))
+
+      checkAnswer(sqlContext.read.format("parquet").load(tablePath.getCanonicalPath), df)
+    }
+  }
+
   test("listConflictingPartitionColumns") {
     def makeExpectedMessage(colNameLists: Seq[String], paths: Seq[String]): String = {
       val conflictingColNameLists = colNameLists.zipWithIndex.map { case (list, index) =>