Skip to content

Commit

Permalink
[SPARK-12682][SQL] Add support for (optionally) not storing tables in…
Browse files Browse the repository at this point in the history
… hive metadata format

This PR adds a new table option (`skip_hive_metadata`) that'd allow the user to skip storing the table metadata in hive metadata format. While this could be useful in general, the specific use-case for this change is that Hive doesn't handle wide schemas well (see https://issues.apache.org/jira/browse/SPARK-12682 and https://issues.apache.org/jira/browse/SPARK-6024) which in turn prevents such tables from being queried in SparkSQL.

Author: Sameer Agarwal <sameer@databricks.com>

Closes #10826 from sameeragarwal/skip-hive-metadata.
  • Loading branch information
sameeragarwal authored and yhuai committed Jan 26, 2016
1 parent ae0309a commit 08c781c
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,14 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive

// TODO: Support persisting partitioned data source relations in Hive compatible format
val qualifiedTableName = tableIdent.quotedString
val skipHiveMetadata = options.getOrElse("skipHiveMetadata", "false").toBoolean
val (hiveCompatibleTable, logMessage) = (maybeSerDe, dataSource.relation) match {
case _ if skipHiveMetadata =>
val message =
s"Persisting partitioned data source relation $qualifiedTableName into " +
"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive."
(None, message)

case (Some(serde), relation: HadoopFsRelation)
if relation.paths.length == 1 && relation.partitionColumns.isEmpty =>
val hiveTable = newHiveCompatibleMetastoreTable(relation, serde)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -900,4 +900,36 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
sqlContext.sql("""use default""")
sqlContext.sql("""drop database if exists testdb8156 CASCADE""")
}

test("skip hive metadata on table creation") {
val schema = StructType((1 to 5).map(i => StructField(s"c_$i", StringType)))

catalog.createDataSourceTable(
tableIdent = TableIdentifier("not_skip_hive_metadata"),
userSpecifiedSchema = Some(schema),
partitionColumns = Array.empty[String],
bucketSpec = None,
provider = "parquet",
options = Map("path" -> "just a dummy path", "skipHiveMetadata" -> "false"),
isExternal = false)

// As a proxy for verifying that the table was stored in Hive compatible format, we verify that
// each column of the table is of native type StringType.
assert(catalog.client.getTable("default", "not_skip_hive_metadata").schema
.forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == StringType))

catalog.createDataSourceTable(
tableIdent = TableIdentifier("skip_hive_metadata"),
userSpecifiedSchema = Some(schema),
partitionColumns = Array.empty[String],
bucketSpec = None,
provider = "parquet",
options = Map("path" -> "just a dummy path", "skipHiveMetadata" -> "true"),
isExternal = false)

// As a proxy for verifying that the table was stored in SparkSQL format, we verify that
// the table has a column type as array of StringType.
assert(catalog.client.getTable("default", "skip_hive_metadata").schema
.forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == ArrayType(StringType)))
}
}

0 comments on commit 08c781c

Please sign in to comment.