[SPARK-23553][TESTS] Tests should not assume the default value of spark.sql.sources.default
#20705
Changes from all commits: eb62c2f, 5192e6a, 3ec9309, 144460d, d9d2564, 159489c, 2975aff
@@ -2150,7 +2150,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
   test("data source table created in InMemoryCatalog should be able to read/write") {
     withTable("tbl") {
-      sql("CREATE TABLE tbl(i INT, j STRING) USING parquet")
+      val provider = spark.sessionState.conf.defaultDataSourceName
+      sql(s"CREATE TABLE tbl(i INT, j STRING) USING $provider")
       checkAnswer(sql("SELECT i, j FROM tbl"), Nil)

       Seq(1 -> "a", 2 -> "b").toDF("i", "j").write.mode("overwrite").insertInto("tbl")

Review comments on this change:
- Hm .. how about just explicitly setting …
- This is …
- So far, the purpose of this PR is setting once in … BTW, …
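One reading of the alternative floated in the first comment above, sketched here as a hypothesis rather than code from this PR: pin the default source for the scope of the test via `withSQLConf` instead of echoing whatever the session default happens to be.

```scala
// Hypothetical alternative: fix the default data source for just this test,
// so the hard-coded USING clause and the session default cannot drift apart.
withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "parquet") {
  withTable("tbl") {
    sql("CREATE TABLE tbl(i INT, j STRING) USING parquet")
    checkAnswer(sql("SELECT i, j FROM tbl"), Nil)
  }
}
```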
@@ -2474,9 +2475,9 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
   test("SPARK-16975: Column-partition path starting '_' should be handled correctly") {
     withTempDir { dir =>
-      val parquetDir = new File(dir, "parquet").getCanonicalPath
-      spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(parquetDir)
-      spark.read.parquet(parquetDir)
+      val dataDir = new File(dir, "data").getCanonicalPath
+      spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(dataDir)
+      spark.read.load(dataDir)
     }
   }
@@ -57,6 +57,16 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha
   val timeZone = TimeZone.getDefault()
   val timeZoneId = timeZone.getID

+  protected override def beforeAll(): Unit = {
+    super.beforeAll()
+    spark.conf.set(SQLConf.DEFAULT_DATA_SOURCE_NAME.key, "parquet")
+  }
+
+  protected override def afterAll(): Unit = {
+    spark.conf.unset(SQLConf.DEFAULT_DATA_SOURCE_NAME.key)
+    super.afterAll()
+  }
+
   test("column type inference") {
     def check(raw: String, literal: Literal, timeZone: TimeZone = timeZone): Unit = {
       assert(inferPartitionColumnValue(raw, true, timeZone) === literal)

Review comment (on the `spark.conf.set` line): Since this is …
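For context, `SQLConf.DEFAULT_DATA_SOURCE_NAME` is the typed handle for the `spark.sql.sources.default` setting named in the PR title; its declaration in `SQLConf` looks roughly like the sketch below (paraphrased from memory, not copied from this diff), which is why unsetting it in `afterAll` falls back to the built-in `parquet` default.

```scala
// Roughly how the entry is declared in org.apache.spark.sql.internal.SQLConf
// (paraphrased; the doc string in the actual source may differ).
val DEFAULT_DATA_SOURCE_NAME = buildConf("spark.sql.sources.default")
  .doc("The default data source to use in input/output.")
  .stringConf
  .createWithDefault("parquet")
```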
@@ -591,7 +591,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
   }

   test("Pre insert nullability check (ArrayType)") {
-    withTable("arrayInParquet") {
+    withTable("array") {
       {
         val df = (Tuple1(Seq(Int.box(1), null: Integer)) :: Nil).toDF("a")
         val expectedSchema =

Review comment: It would be good, maybe in a future cleanup, to replace all these repeating string literals (e.g., "array" 7 times, "map" 7 times) with a variable name.
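A sketch of the cleanup suggested in that comment (hypothetical, not part of this PR): hoist the repeated table-name literal into a single value so renaming the table later touches one line. Here `df` and `expectedRows` stand in for the DataFrame and expected answer already built in the surrounding test.

```scala
// Hypothetical refactor: name the table once and reuse it throughout the test.
val tableName = "array"
withTable(tableName) {
  df.write.mode(SaveMode.Overwrite).saveAsTable(tableName)
  sparkSession.catalog.refreshTable(tableName)
  checkAnswer(sql(s"SELECT a FROM $tableName"), expectedRows) // expectedRows is illustrative
}
```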
@@ -604,9 +604,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         assert(df.schema === expectedSchema)

         df.write
-          .format("parquet")
           .mode(SaveMode.Overwrite)
-          .saveAsTable("arrayInParquet")
+          .saveAsTable("array")
       }

       {
@@ -621,25 +620,24 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         assert(df.schema === expectedSchema)

         df.write
-          .format("parquet")
           .mode(SaveMode.Append)
-          .insertInto("arrayInParquet")
+          .insertInto("array")
       }

       (Tuple1(Seq(4, 5)) :: Nil).toDF("a")
         .write
         .mode(SaveMode.Append)
-        .saveAsTable("arrayInParquet") // This one internally calls df2.insertInto.
+        .saveAsTable("array") // This one internally calls df2.insertInto.

       (Tuple1(Seq(Int.box(6), null: Integer)) :: Nil).toDF("a")
         .write
         .mode(SaveMode.Append)
-        .saveAsTable("arrayInParquet")
+        .saveAsTable("array")

-      sparkSession.catalog.refreshTable("arrayInParquet")
+      sparkSession.catalog.refreshTable("array")

       checkAnswer(
-        sql("SELECT a FROM arrayInParquet"),
+        sql("SELECT a FROM array"),
         Row(ArrayBuffer(1, null)) ::
         Row(ArrayBuffer(2, 3)) ::
         Row(ArrayBuffer(4, 5)) ::
@@ -648,7 +646,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
   }

   test("Pre insert nullability check (MapType)") {
-    withTable("mapInParquet") {
+    withTable("map") {
       {
         val df = (Tuple1(Map(1 -> (null: Integer))) :: Nil).toDF("a")
         val expectedSchema =
@@ -661,9 +659,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         assert(df.schema === expectedSchema)

         df.write
-          .format("parquet")
           .mode(SaveMode.Overwrite)
-          .saveAsTable("mapInParquet")
+          .saveAsTable("map")
       }

       {
@@ -678,27 +675,24 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         assert(df.schema === expectedSchema)

         df.write
-          .format("parquet")
           .mode(SaveMode.Append)
-          .insertInto("mapInParquet")
+          .insertInto("map")
       }

       (Tuple1(Map(4 -> 5)) :: Nil).toDF("a")
         .write
-        .format("parquet")
         .mode(SaveMode.Append)
-        .saveAsTable("mapInParquet") // This one internally calls df2.insertInto.
+        .saveAsTable("map") // This one internally calls df2.insertInto.

       (Tuple1(Map(6 -> null.asInstanceOf[Integer])) :: Nil).toDF("a")
         .write
-        .format("parquet")
         .mode(SaveMode.Append)
-        .saveAsTable("mapInParquet")
+        .saveAsTable("map")

-      sparkSession.catalog.refreshTable("mapInParquet")
+      sparkSession.catalog.refreshTable("map")

       checkAnswer(
-        sql("SELECT a FROM mapInParquet"),
+        sql("SELECT a FROM map"),
         Row(Map(1 -> null)) ::
         Row(Map(2 -> 3)) ::
         Row(Map(4 -> 5)) ::
@@ -852,52 +846,52 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       (from to to).map(i => i -> s"str$i").toDF("c1", "c2")
     }

-    withTable("insertParquet") {
-      createDF(0, 9).write.format("parquet").saveAsTable("insertParquet")
+    withTable("t") {
+      createDF(0, 9).write.saveAsTable("t")
       checkAnswer(
-        sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
+        sql("SELECT p.c1, p.c2 FROM t p WHERE p.c1 > 5"),
         (6 to 9).map(i => Row(i, s"str$i")))

       intercept[AnalysisException] {
-        createDF(10, 19).write.format("parquet").saveAsTable("insertParquet")
+        createDF(10, 19).write.saveAsTable("t")
       }

-      createDF(10, 19).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
+      createDF(10, 19).write.mode(SaveMode.Append).saveAsTable("t")
       checkAnswer(
-        sql("SELECT p.c1, p.c2 FROM insertParquet p WHERE p.c1 > 5"),
+        sql("SELECT p.c1, p.c2 FROM t p WHERE p.c1 > 5"),
         (6 to 19).map(i => Row(i, s"str$i")))

-      createDF(20, 29).write.mode(SaveMode.Append).format("parquet").saveAsTable("insertParquet")
+      createDF(20, 29).write.mode(SaveMode.Append).saveAsTable("t")
       checkAnswer(
-        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 25"),
+        sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 5 AND p.c1 < 25"),
         (6 to 24).map(i => Row(i, s"str$i")))

       intercept[AnalysisException] {
-        createDF(30, 39).write.saveAsTable("insertParquet")
+        createDF(30, 39).write.saveAsTable("t")
       }

-      createDF(30, 39).write.mode(SaveMode.Append).saveAsTable("insertParquet")
+      createDF(30, 39).write.mode(SaveMode.Append).saveAsTable("t")
       checkAnswer(
-        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 35"),
+        sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 5 AND p.c1 < 35"),
         (6 to 34).map(i => Row(i, s"str$i")))

-      createDF(40, 49).write.mode(SaveMode.Append).insertInto("insertParquet")
+      createDF(40, 49).write.mode(SaveMode.Append).insertInto("t")
       checkAnswer(
-        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 5 AND p.c1 < 45"),
+        sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 5 AND p.c1 < 45"),
         (6 to 44).map(i => Row(i, s"str$i")))

-      createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("insertParquet")
+      createDF(50, 59).write.mode(SaveMode.Overwrite).saveAsTable("t")
       checkAnswer(
-        sql("SELECT p.c1, c2 FROM insertParquet p WHERE p.c1 > 51 AND p.c1 < 55"),
+        sql("SELECT p.c1, c2 FROM t p WHERE p.c1 > 51 AND p.c1 < 55"),
         (52 to 54).map(i => Row(i, s"str$i")))
-      createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("insertParquet")
+      createDF(60, 69).write.mode(SaveMode.Ignore).saveAsTable("t")
       checkAnswer(
-        sql("SELECT p.c1, c2 FROM insertParquet p"),
+        sql("SELECT p.c1, c2 FROM t p"),
         (50 to 59).map(i => Row(i, s"str$i")))

-      createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("insertParquet")
+      createDF(70, 79).write.mode(SaveMode.Overwrite).insertInto("t")
       checkAnswer(
-        sql("SELECT p.c1, c2 FROM insertParquet p"),
+        sql("SELECT p.c1, c2 FROM t p"),
         (70 to 79).map(i => Row(i, s"str$i")))
     }
   }

Review comment: Curious about why the test named "SPARK-8156: create table to specific database by 'use dbname'" still has a hard-coded format of parquet. Is it testing functionality that is orthogonal to the format, maybe? I changed the hard-coded format to json, orc, and csv, and each time that test passed. Similarly with …

Reply: That is because this PR minimally changed only the test cases that were causing failures. We cannot generalize all test cases across all modules in one huge PR; that would make it difficult to backport the other commits. The main goal of this PR is improving test-ability for new data sources. For example, although …
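The test-ability point can be made concrete with a small sketch (illustrative only, not code from this PR): once the hard-coded `parquet` calls are gone, the same round trip can be exercised against another source simply by overriding the default. `createDF` is the helper defined in the surrounding test.

```scala
// Illustrative only: run the same round trip against ORC by overriding the default source.
withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "orc") {
  withTable("t") {
    createDF(0, 9).write.saveAsTable("t") // now backed by ORC rather than parquet
    checkAnswer(
      sql("SELECT p.c1, p.c2 FROM t p WHERE p.c1 > 5"),
      (6 to 9).map(i => Row(i, s"str$i")))
  }
}
```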
@@ -516,24 +516,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
   test("CTAS with default fileformat") {
     val table = "ctas1"
     val ctas = s"CREATE TABLE IF NOT EXISTS $table SELECT key k, value FROM src"
-    withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") {
-      withSQLConf("hive.default.fileformat" -> "textfile") {
+    Seq("orc", "parquet").foreach { dataSourceFormat =>
+      withSQLConf(
+        SQLConf.CONVERT_CTAS.key -> "true",
+        SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> dataSourceFormat,
+        "hive.default.fileformat" -> "textfile") {
         withTable(table) {
           sql(ctas)
-          // We should use parquet here as that is the default datasource fileformat. The default
-          // datasource file format is controlled by `spark.sql.sources.default` configuration.
+          // The default datasource file format is controlled by `spark.sql.sources.default`.
           // This testcase verifies that setting `hive.default.fileformat` has no impact on
           // the target table's fileformat in case of CTAS.
-          assert(sessionState.conf.defaultDataSourceName === "parquet")
-          checkRelation(tableName = table, isDataSourceTable = true, format = "parquet")
+          checkRelation(tableName = table, isDataSourceTable = true, format = dataSourceFormat)
         }
       }
-      withSQLConf("spark.sql.sources.default" -> "orc") {
-        withTable(table) {
-          sql(ctas)
-          checkRelation(tableName = table, isDataSourceTable = true, format = "orc")
-        }
-      }
     }
   }

Review comment: Previously, …
Review comment: Unlike the other things, there is some difference from the original semantics. As an alternative approach, we can add the following if we need to keep the original `spark.read.load`.
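The semantics difference flagged here presumably refers to the `spark.read.parquet` to `spark.read.load` switch in the SPARK-16975 test above. One possible shape of such an alternative, sketched as a hypothesis rather than the commenter's actual code: keep an explicitly format-qualified read while still deriving the format from the session default instead of hard-coding `parquet`.

```scala
// Hypothetical variant of the SPARK-16975 test: explicit format on the read path,
// but the format name comes from spark.sql.sources.default rather than "parquet".
withTempDir { dir =>
  val provider = spark.sessionState.conf.defaultDataSourceName
  val dataDir = new File(dir, "data").getCanonicalPath
  spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(dataDir)
  spark.read.format(provider).load(dataDir)
}
```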