From 44696ac96d7dedb22540da29cfea73a0111d1d4c Mon Sep 17 00:00:00 2001
From: yangjie01
Date: Wed, 31 May 2023 13:09:04 +0800
Subject: [PATCH] [SPARK-43868][SQL][TESTS] Remove `originalUDFs` from
 `TestHive` to ensure `ObjectHashAggregateExecBenchmark` can run successfully
 on GitHub Actions

### What changes were proposed in this pull request?

This PR removes `originalUDFs` from `TestHive` to ensure `ObjectHashAggregateExecBenchmark` can run successfully on GitHub Actions.

### Why are the changes needed?

After SPARK-43225, `org.codehaus.jackson:jackson-mapper-asl` became a test-scope dependency, so it is not on the classpath when benchmarks run on GitHub Actions, because the benchmark workflow uses https://github.com/apache/spark/blob/d61c77cac17029ee27319e6b766b48d314a4dd31/.github/workflows/benchmark.yml#L179-L183 instead of the sbt `Test/runMain`.

`ObjectHashAggregateExecBenchmark` uses `TestHive`, and before this PR `TestHive` always called `org.apache.hadoop.hive.ql.exec.FunctionRegistry#getFunctionNames` to initialize `originalUDFs`, so running `ObjectHashAggregateExecBenchmark` on GitHub Actions failed with the following exception:

```
Error: Exception in thread "main" java.lang.NoClassDefFoundError: org/codehaus/jackson/map/type/TypeFactory
    at org.apache.hadoop.hive.ql.udf.UDFJson.<clinit>(UDFJson.java:64)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:348)
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge.getUdfClassInternal(GenericUDFBridge.java:142)
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge.getUdfClass(GenericUDFBridge.java:132)
    at org.apache.hadoop.hive.ql.exec.FunctionInfo.getFunctionClass(FunctionInfo.java:151)
    at org.apache.hadoop.hive.ql.exec.Registry.addFunction(Registry.java:519)
    at org.apache.hadoop.hive.ql.exec.Registry.registerUDF(Registry.java:163)
    at org.apache.hadoop.hive.ql.exec.Registry.registerUDF(Registry.java:154)
    at org.apache.hadoop.hive.ql.exec.Registry.registerUDF(Registry.java:147)
    at org.apache.hadoop.hive.ql.exec.FunctionRegistry.<clinit>(FunctionRegistry.java:322)
    at org.apache.spark.sql.hive.test.TestHiveSparkSession.<init>(TestHive.scala:530)
    at org.apache.spark.sql.hive.test.TestHiveSparkSession.<init>(TestHive.scala:185)
    at org.apache.spark.sql.hive.test.TestHiveContext.<init>(TestHive.scala:133)
    at org.apache.spark.sql.hive.test.TestHive$.<init>(TestHive.scala:54)
    at org.apache.spark.sql.hive.test.TestHive$.<clinit>(TestHive.scala:53)
    at org.apache.spark.sql.execution.benchmark.ObjectHashAggregateExecBenchmark$.getSparkSession(ObjectHashAggregateExecBenchmark.scala:47)
    at org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark.$init$(SqlBasedBenchmark.scala:35)
    at org.apache.spark.sql.execution.benchmark.ObjectHashAggregateExecBenchmark$.<init>(ObjectHashAggregateExecBenchmark.scala:45)
    at org.apache.spark.sql.execution.benchmark.ObjectHashAggregateExecBenchmark.main(ObjectHashAggregateExecBenchmark.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.benchmark.Benchmarks$.$anonfun$main$7(Benchmarks.scala:128)
    at scala.collection.ArrayOps$.foreach$extension(ArrayOps.scala:1328)
    at org.apache.spark.benchmark.Benchmarks$.main(Benchmarks.scala:91)
    at org.apache.spark.benchmark.Benchmarks.main(Benchmarks.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1025)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:192)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:215)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1116)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1125)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.ClassNotFoundException: org.codehaus.jackson.map.type.TypeFactory
    at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
    ... 40 more
```

Then I found that `originalUDFs` is now an unused val in `TestHive` (SPARK-1251 | https://github.com/apache/spark/pull/6920 introduced it, and it became unused after SPARK-20667 | https://github.com/apache/spark/pull/17908), so this PR removes it from `TestHive` to avoid calling `FunctionRegistry#getFunctionNames`.
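To make the failure mode concrete, here is a minimal standalone sketch (not part of this patch; the object name is hypothetical) under the same assumption: `hive-exec` is on the classpath but `org.codehaus.jackson:jackson-mapper-asl` is not. The first reference to `FunctionRegistry` runs its static initializer, which registers (and therefore class-loads) every built-in UDF, including `UDFJson`, whose own initializer references `org.codehaus.jackson.map.type.TypeFactory`:

```scala
import org.apache.hadoop.hive.ql.exec.FunctionRegistry

// Hypothetical reproducer, assuming hive-exec is on the classpath while
// org.codehaus.jackson:jackson-mapper-asl is absent (the GitHub Actions
// benchmark classpath after SPARK-43225).
object FunctionRegistryInitRepro {
  def main(args: Array[String]): Unit = {
    try {
      // Same call the removed `originalUDFs` val made in TestHiveSparkSession:
      // the first use of FunctionRegistry triggers its static initializer,
      // which eagerly loads all built-in UDF classes.
      val names: java.util.Set[String] = FunctionRegistry.getFunctionNames
      println(s"FunctionRegistry initialized with ${names.size()} functions")
    } catch {
      case e: NoClassDefFoundError =>
        // With jackson-mapper-asl missing, UDFJson's static initializer fails
        // here, producing the stack trace shown above.
        println(s"FunctionRegistry init failed: $e")
    }
  }
}
```

Since the val was unused, dropping it means constructing `TestHiveSparkSession` no longer touches `FunctionRegistry` at all, so the benchmark classpath never needs the test-scope jar.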
### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- Pass GitHub Actions
- Run `ObjectHashAggregateExecBenchmark` on GitHub Actions:

  **Before** https://github.com/LuciferYang/spark/actions/runs/5128228630/jobs/9224706982

  **After** https://github.com/LuciferYang/spark/actions/runs/5128227211/jobs/9224704507

  `ObjectHashAggregateExecBenchmark` runs successfully.

Closes #41369 from LuciferYang/hive-udf.

Lead-authored-by: yangjie01
Co-authored-by: YangJie
Signed-off-by: Yuming Wang
(cherry picked from commit 3472619a26106b211685798034ad4622e7053cdf)
---
 .../scala/org/apache/spark/sql/hive/test/TestHive.scala | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 3769de07d8a37..09fdb1cc2ce04 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.hive.test
 
 import java.io.File
 import java.net.URI
-import java.util.{Set => JavaSet}
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -27,7 +26,6 @@ import scala.collection.mutable
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hadoop.hive.ql.exec.FunctionRegistry
 import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -523,12 +521,6 @@ private[hive] class TestHiveSparkSession(
     }
   }
 
-  /**
-   * Records the UDFs present when the server starts, so we can delete ones that are created by
-   * tests.
-   */
-  protected val originalUDFs: JavaSet[String] = FunctionRegistry.getFunctionNames
-
   /**
    * Resets the test instance by deleting any table, view, temp view, and UDF that have been created
    */