From d3629fd1993a391a4aac00d34edabfe58b28061c Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Thu, 23 Nov 2023 02:09:44 +0100
Subject: [PATCH] Enable build for Databricks 13.3 [databricks] (#9677)

* pom changes
* pom changes
* pom changes
* add databricks13.3 to premerge
* Added ToPrettyString support
* xfail approximate percentile test
* xfail failing udf tests
* xfail failing tests due to WriteIntoDeltaCommand
* xfail test_delta_atomic_create_table_as_select and test_delta_atomic_replace_table_as_select
* Added 341db to shim-deps and removed from datagen/pom.xml
* updated udf-compiler pom.xml
* updated sql-plugin pom.xml
* fixed multiple pom.xml
* updated udf-compiler pom.xml
* removed TODO
* Signoff

Signed-off-by: Raza Jafri

* updated scala 2.13 poms
* Revert "xfail failing tests due to WriteIntoDeltaCommand"

This reverts commit 00b498ed3ea963605cc36560e8896fe27bd412d2.

* Revert "xfail test_delta_atomic_create_table_as_select and test_delta_atomic_replace_table_as_select"

This reverts commit ea2fd40b8215cdfa845074127a641af62052e947.

* remove tests/pom.xml changes
* reverted 2.13 generation of tests/pom.xml
* removed 341db profile from tests as we don't run unit tests on databricks
* fixed the xfail reason to point to the correct issue
* removed diff.patch
* Revert "xfail approximate percentile test"

This reverts commit 0a7fa52dc06681a9ef8f1da6b36ed35ac2be79dc.

* build fixes

Signed-off-by: Jason Lowe

* Fix spark321db build
* Skip UDF tests until UDF handling is updated
* Remove xfail/skips eclipsed by module-level skip
* xfail fastparquet tests due to nulls being introduced by pandas
* Fix incorrect shimplify directives for 341db
* Fix fallback test

---------

Signed-off-by: Raza Jafri
Signed-off-by: Jason Lowe
Co-authored-by: Jason Lowe
---
 aggregator/pom.xml                            | 17 ++++++++
 .../src/main/python/delta_lake_merge_test.py  |  2 +-
 .../python/fastparquet_compatibility_test.py  | 25 +++++++----
 .../src/main/python/udf_cudf_test.py          |  7 +++-
 integration_tests/src/main/python/udf_test.py |  8 +++-
 .../Jenkinsfile-blossom.premerge-databricks   |  2 +-
 pom.xml                                       | 29 ++++++++++++-
 scala2.13/aggregator/pom.xml                  | 17 ++++++++
 scala2.13/pom.xml                             | 29 ++++++++++++-
 scala2.13/shim-deps/pom.xml                   | 41 +++++++++++++++++++
 shim-deps/pom.xml                             | 41 +++++++++++++++++++
 .../shims/ParquetLegacyNanoAsLongShims.scala  |  1 -
 .../shims/ParquetTimestampNTZShims.scala      |  1 -
 .../hive/rapids/shims/FileSinkDescShim.scala  |  1 -
 .../rapids/shims/HiveInspectorsShim.scala     |  1 -
 .../shims/TagScanForRuntimeFiltering.scala    |  1 -
 ...puDatabricksShuffleExchangeExecBase.scala} | 16 ++------
 .../rapids/shims/GpuShuffleExchangeExec.scala | 16 +++++++-
 .../spark/rapids/shims/CastCheckShims.scala   |  1 -
 .../ParquetTimestampAnnotationShims.scala     |  1 -
 .../spark/rapids/shims/CastCheckShims.scala   |  1 +
 .../shims/ParquetLegacyNanoAsLongShims.scala  |  1 +
 .../ParquetTimestampAnnotationShims.scala     |  1 +
 .../shims/ParquetTimestampNTZShims.scala      |  1 +
 .../shims/TagScanForRuntimeFiltering.scala    |  1 +
 .../rapids/shims/Spark341PlusDBShims.scala    | 16 +++++++-
 .../rapids/shims/GpuShuffleExchangeExec.scala | 10 +++--
 .../hive/rapids/shims/FileSinkDescShim.scala  |  1 +
 .../rapids/shims/HiveInspectorsShim.scala     |  1 +
 29 files changed, 251 insertions(+), 39 deletions(-)
 rename sql-plugin/src/main/spark321db/scala/org/apache/spark/rapids/shims/{GpuShuffleExchangeExecBase.scala => GpuDatabricksShuffleExchangeExecBase.scala} (76%)
 rename sql-plugin/src/main/{spark350 => spark341db}/scala/org/apache/spark/sql/hive/rapids/shims/FileSinkDescShim.scala (98%)
 rename sql-plugin/src/main/{spark350 => spark341db}/scala/org/apache/spark/sql/hive/rapids/shims/HiveInspectorsShim.scala (98%)
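A note on the "Remove xfail/skips eclipsed by module-level skip" bullet above: once a module assigns a skip mark to pytest's module-level pytestmark variable, every test in that file is reported as skipped before its body or its own marks are ever evaluated, so per-test xfail marks in the same file become dead weight. A minimal sketch of that interaction (illustrative only; the test names and reasons are not from this patch):

import pytest

# Module-level mark: pytest applies this skip to every test in the file,
# so the tests below are skipped before their bodies or xfail marks matter.
pytestmark = pytest.mark.skip(reason="illustrative module-level skip")


@pytest.mark.xfail(reason="never evaluated; the module-level skip wins")
def test_eclipsed_xfail():
    assert 1 + 1 == 3


def test_also_skipped():
    assert True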
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index 27c13af1e4d..4fa4827ac52 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -619,6 +619,23 @@
         </dependencies>
       </profile>
+      <profile>
+        <id>release341db</id>
+        <activation>
+          <property>
+            <name>buildver</name>
+            <value>341db</value>
+          </property>
+        </activation>
+        <dependencies>
+          <dependency>
+            <groupId>com.nvidia</groupId>
+            <artifactId>rapids-4-spark-delta-spark341db_${scala.binary.version}</artifactId>
+            <version>${project.version}</version>
+            <classifier>${spark.version.classifier}</classifier>
+          </dependency>
+        </dependencies>
+      </profile>
       <profile>
         <id>release333</id>
diff --git a/integration_tests/src/main/python/delta_lake_merge_test.py b/integration_tests/src/main/python/delta_lake_merge_test.py
index 1d43259434b..0ba63380aba 100644
--- a/integration_tests/src/main/python/delta_lake_merge_test.py
+++ b/integration_tests/src/main/python/delta_lake_merge_test.py
@@ -97,7 +97,7 @@ def checker(data_path, do_merge):
                              merge_sql=merge_sql,
                              check_func=checker)
 
-@allow_non_gpu("ExecutedCommandExec,BroadcastHashJoinExec,ColumnarToRowExec,BroadcastExchangeExec,DataWritingCommandExec", *delta_meta_allow)
+@allow_non_gpu("ExecutedCommandExec,BroadcastHashJoinExec,ColumnarToRowExec,BroadcastExchangeExec,DataWritingCommandExec", delta_write_fallback_allow, *delta_meta_allow)
 @delta_lake
 @ignore_order
 @pytest.mark.skipif(is_databricks_runtime() and spark_version() < "3.3.2", reason="NOT MATCHED BY SOURCE added in DBR 12.2")
diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py
index 6ec5ec88fd3..b51fa5a55ef 100644
--- a/integration_tests/src/main/python/fastparquet_compatibility_test.py
+++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py
@@ -17,7 +17,7 @@
 from asserts import assert_gpu_and_cpu_are_equal_collect
 from data_gen import *
 from fastparquet_utils import get_fastparquet_result_canonicalizer
-from spark_session import spark_version, with_cpu_session, with_gpu_session
+from spark_session import is_databricks_runtime, spark_version, with_cpu_session, with_gpu_session
 
 
 def fastparquet_unavailable():
@@ -107,8 +107,12 @@ def read_with_fastparquet_or_plugin(spark):
     pytest.param(IntegerGen(nullable=True),
                  marks=pytest.mark.xfail(reason="Nullables cause merge errors, when converting to Spark dataframe")),
     LongGen(nullable=False),
-    FloatGen(nullable=False),
-    DoubleGen(nullable=False),
+    pytest.param(FloatGen(nullable=False),
+                 marks=pytest.mark.xfail(is_databricks_runtime(),
+                                         reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
+    pytest.param(DoubleGen(nullable=False),
+                 marks=pytest.mark.xfail(is_databricks_runtime(),
+                                         reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
     StringGen(nullable=False),
     pytest.param(DecimalGen(nullable=False),
                  marks=pytest.mark.xfail(reason="fastparquet reads Decimal columns as Float, as per "
@@ -131,8 +135,11 @@
                  marks=pytest.mark.xfail(reason="Conversion from Pandas dataframe (read with fastparquet) to Spark dataframe "
                                                 "fails: \"Unable to infer the type of the field a\".")),
-    StructGen(children=[("first", IntegerGen(nullable=False)),
-                        ("second", FloatGen(nullable=False))], nullable=False)
+    pytest.param(
+        StructGen(children=[("first", IntegerGen(nullable=False)),
+                            ("second", FloatGen(nullable=False))], nullable=False),
+        marks=pytest.mark.xfail(is_databricks_runtime(),
+                                reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
 ], ids=idfn)
 def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
     """
@@ -176,8 +183,12 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
     LongGen(nullable=False),
     pytest.param(LongGen(nullable=True),
                  marks=pytest.mark.xfail(reason="Nullables cause merge errors, when converting to Spark dataframe")),
-    FloatGen(nullable=False),
-    DoubleGen(nullable=False),
+    pytest.param(FloatGen(nullable=False),
+                 marks=pytest.mark.xfail(is_databricks_runtime(),
+                                         reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
+    pytest.param(DoubleGen(nullable=False),
+                 marks=pytest.mark.xfail(is_databricks_runtime(),
+                                         reason="https://github.com/NVIDIA/spark-rapids/issues/9778")),
     StringGen(nullable=False),
     pytest.param(DecimalGen(nullable=False),
                  marks=pytest.mark.xfail(reason="fastparquet reads Decimal columns as Float, as per "
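The fastparquet changes above use pytest's conditional xfail: when the first argument to pytest.mark.xfail is a boolean, the mark only applies where that condition is true, so the float, double, and struct cases still run and must pass outside Databricks while being tolerated as expected failures on Databricks (issue 9778). A self-contained sketch of the pattern, assuming a hypothetical on_databricks() helper in place of the repo's is_databricks_runtime():

import os
import pytest


def on_databricks():
    # Hypothetical stand-in for spark_session.is_databricks_runtime(); assumes the
    # DATABRICKS_RUNTIME_VERSION environment variable marks a Databricks cluster.
    return "DATABRICKS_RUNTIME_VERSION" in os.environ


@pytest.mark.parametrize("value", [
    1,  # always expected to pass
    pytest.param(2.5, marks=pytest.mark.xfail(
        on_databricks(), reason="example: this case is known to fail only on Databricks")),
])
def test_round_trip(value):
    assert value == value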
diff --git a/integration_tests/src/main/python/udf_cudf_test.py b/integration_tests/src/main/python/udf_cudf_test.py
index 04416315702..6d94a5da206 100644
--- a/integration_tests/src/main/python/udf_cudf_test.py
+++ b/integration_tests/src/main/python/udf_cudf_test.py
@@ -37,10 +37,15 @@
 from typing import Iterator
 from pyspark.sql import Window
 from pyspark.sql.functions import pandas_udf, PandasUDFType
-from spark_session import with_cpu_session, with_gpu_session
+from spark_session import is_databricks_runtime, is_spark_340_or_later, with_cpu_session, with_gpu_session
 from marks import cudf_udf
 
 
+if is_databricks_runtime() and is_spark_340_or_later():
+    # Databricks 13.3 does not use separate reader/writer threads for Python UDFs
+    # which can lead to hangs. Skipping these tests until the Python UDF handling is updated.
+    pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493")
+
 _conf = {
         'spark.rapids.sql.exec.AggregateInPandasExec': 'true',
         'spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec': 'true',
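Both UDF test modules receive the same guard: on Databricks 13.3 the entire file is skipped because, as the comment in the hunk above notes, that runtime does not use separate reader/writer threads for Python UDFs, which can hang the current handling. For context on the kind of code these modules exercise, here is a minimal Pandas UDF using only the public PySpark API; the local session and names are illustrative and not part of the patch:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf

spark = SparkSession.builder.master("local[*]").appName("pandas-udf-sketch").getOrCreate()


@pandas_udf("long")
def add_one(s: pd.Series) -> pd.Series:
    # Vectorized UDF: receives a batch of rows as a pandas Series via Arrow.
    return s + 1


spark.range(5).select(add_one(col("id")).alias("id_plus_one")).show()
spark.stop()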
diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py
index 14fc57cf972..db8425f6387 100644
--- a/integration_tests/src/main/python/udf_test.py
+++ b/integration_tests/src/main/python/udf_test.py
@@ -15,7 +15,7 @@
 import pytest
 
 from conftest import is_at_least_precommit_run
-from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_350_or_later
+from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_340_or_later
 
 from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version
 
@@ -43,6 +43,12 @@
 import pyarrow
 from typing import Iterator, Tuple
 
+
+if is_databricks_runtime() and is_spark_340_or_later():
+    # Databricks 13.3 does not use separate reader/writer threads for Python UDFs
+    # which can lead to hangs. Skipping these tests until the Python UDF handling is updated.
+    pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493")
+
 arrow_udf_conf = {
     'spark.sql.execution.arrow.pyspark.enabled': 'true',
     'spark.rapids.sql.exec.WindowInPandasExec': 'true',
diff --git a/jenkins/Jenkinsfile-blossom.premerge-databricks b/jenkins/Jenkinsfile-blossom.premerge-databricks
index 0ea835d39a9..27c42f59aab 100644
--- a/jenkins/Jenkinsfile-blossom.premerge-databricks
+++ b/jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -88,7 +88,7 @@ pipeline {
         // 'name' and 'value' only supprt literal string in the declarative Jenkins
         // Refer to Jenkins issue https://issues.jenkins.io/browse/JENKINS-62127
         name 'DB_RUNTIME'
-        values '10.4', '11.3', '12.2'
+        values '10.4', '11.3', '12.2', '13.3'
       }
     }
     stages {
diff --git a/pom.xml b/pom.xml
index d099315ef8c..7e6ed88cf9f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -509,6 +509,31 @@
         <module>delta-lake/delta-spark332db</module>
       </modules>
     </profile>
+
+    <profile>
+      <id>release341db</id>
+      <activation>
+        <property>
+          <name>buildver</name>
+          <value>341db</value>
+        </property>
+      </activation>
+      <properties>
+        <scala.plugin.version>3.4.4</scala.plugin.version>
+        <spark.version.classifier>spark341db</spark.version.classifier>
+        <spark.version>${spark341db.version}</spark.version>
+        <spark.test.version>${spark341db.version}</spark.test.version>
+        <hadoop.client.version>3.3.1</hadoop.client.version>
+        <rat.skip>true</rat.skip>
+        <parquet.hadoop.version>1.12.0</parquet.hadoop.version>
+        <iceberg.version>${spark330.iceberg.version}</iceberg.version>
+      </properties>
+      <modules>
+        <module>shim-deps/databricks</module>
+        <module>delta-lake/delta-spark341db</module>
+      </modules>
+    </profile>
 
     <profile>
       <id>release350</id>
@@ -691,6 +716,7 @@
     <spark332cdh.version>3.3.2.3.3.7190.0-91</spark332cdh.version>
     <spark330db.version>3.3.0-databricks</spark330db.version>
     <spark332db.version>3.3.2-databricks</spark332db.version>
+    <spark341db.version>3.4.1-databricks</spark341db.version>
    <spark350.version>3.5.0</spark350.version>
    <mockito.version>3.12.4</mockito.version>
    <scala.plugin.version>4.3.0</scala.plugin.version>
@@ -745,7 +771,8 @@
         321db,
         330db,
-        332db
+        332db,
+        341db
diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml
--- a/scala2.13/pom.xml
+++ b/scala2.13/pom.xml
@@ -509,6 +509,31 @@
         <module>delta-lake/delta-spark332db</module>
       </modules>
     </profile>
+
+    <profile>
+      <id>release341db</id>
+      <activation>
+        <property>
+          <name>buildver</name>
+          <value>341db</value>
+        </property>
+      </activation>
+      <properties>
+        <scala.plugin.version>3.4.4</scala.plugin.version>
+        <spark.version.classifier>spark341db</spark.version.classifier>
+        <spark.version>${spark341db.version}</spark.version>
+        <spark.test.version>${spark341db.version}</spark.test.version>
+        <hadoop.client.version>3.3.1</hadoop.client.version>
+        <rat.skip>true</rat.skip>
+        <parquet.hadoop.version>1.12.0</parquet.hadoop.version>
+        <iceberg.version>${spark330.iceberg.version}</iceberg.version>
+      </properties>
+      <modules>
+        <module>shim-deps/databricks</module>
+        <module>delta-lake/delta-spark341db</module>
+      </modules>
+    </profile>
 
     <profile>
       <id>release350</id>
@@ -691,6 +716,7 @@
     <spark332cdh.version>3.3.2.3.3.7190.0-91</spark332cdh.version>
     <spark330db.version>3.3.0-databricks</spark330db.version>
     <spark332db.version>3.3.2-databricks</spark332db.version>
+    <spark341db.version>3.4.1-databricks</spark341db.version>
    <spark350.version>3.5.0</spark350.version>
    <mockito.version>3.12.4</mockito.version>
    <scala.plugin.version>4.3.0</scala.plugin.version>
@@ -745,7 +771,8 @@
         321db,
         330db,
-        332db
+        332db,
+        341db