From bdb7704b2b19fcfdfdd79bdeee4894700cbc4878 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 11 Feb 2025 08:52:43 +0800 Subject: [PATCH] [SPARK-51146][INFRA] Publish a new Spark distribution with Spark Connect enabled (step 1) ### What changes were proposed in this pull request? This PR is the first step to update the release scripts to publish a new distribution with Spark Connect enabled. It has to be merged first because the release process invokes the `make-distribution.sh` script from the upstream branch-4.0, so the change cannot be tested locally in dry-run mode. ### Why are the changes needed? To publish a new Spark distribution with Spark Connect enabled. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual test to make sure `dev/create-release/release-build.sh` is working as expected. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #49865 from cloud-fan/release. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit 79db110ef701a929344c03b2a8a37e951aafa081) Signed-off-by: Wenchen Fan --- dev/create-release/release-build.sh | 13 +++++++++++-- dev/make-distribution.sh | 20 +++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 8654c8317ae49..a378f790572b5 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -276,6 +276,10 @@ if [[ "$1" == "package" ]]; then if [[ $BUILD_PACKAGE == *"withr"* ]]; then R_FLAG="--r" fi + SPARK_CONNECT_FLAG="" + if [[ $BUILD_PACKAGE == *"withconnect"* ]]; then + SPARK_CONNECT_FLAG="--connect" + fi echo "Building binary dist $NAME" cp -r spark spark-$SPARK_VERSION-bin-$NAME @@ -295,7 +299,7 @@ if [[ "$1" == "package" ]]; then echo "Creating distribution" ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz \ - $PIP_FLAG $R_FLAG $FLAGS 2>&1 > ../binary-release-$NAME.log 
+ $PIP_FLAG $R_FLAG $SPARK_CONNECT_FLAG $FLAGS 2>&1 > ../binary-release-$NAME.log cd .. if [[ -n $R_FLAG ]]; then @@ -353,7 +357,12 @@ if [[ "$1" == "package" ]]; then fi declare -A BINARY_PKGS_EXTRA - BINARY_PKGS_EXTRA["hadoop3"]="withpip,withr" + if [[ $SPARK_VERSION > "3.5.99" ]]; then + # Since 4.0, we publish a new distribution with Spark Connect enabled. + BINARY_PKGS_EXTRA["hadoop3"]="withpip,withr,withconnect" + else + BINARY_PKGS_EXTRA["hadoop3"]="withpip,withr" + fi # This is dead code as Scala 2.12 is no longer supported, but we keep it as a template for # adding new Scala version support in the future. This secondary Scala version only has one diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 53c317a39ea1e..46509dc530fc7 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -35,6 +35,7 @@ DISTDIR="$SPARK_HOME/dist" MAKE_TGZ=false MAKE_PIP=false MAKE_R=false +MAKE_SPARK_CONNECT=false NAME=none MVN="$SPARK_HOME/build/mvn" @@ -43,7 +44,7 @@ function exit_with_usage { echo "make-distribution.sh - tool for making binary distributions of Spark" echo "" echo "usage:" - cl_options="[--name] [--tgz] [--pip] [--r] [--mvn ]" + cl_options="[--name] [--tgz] [--pip] [--r] [--connect] [--mvn ]" echo "make-distribution.sh $cl_options " echo "See Spark's \"Building Spark\" doc for correct Maven options." 
echo "" @@ -62,6 +63,9 @@ while (( "$#" )); do --r) MAKE_R=true ;; + --connect) + MAKE_SPARK_CONNECT=true + ;; --mvn) MVN="$2" shift @@ -308,4 +312,18 @@ if [ "$MAKE_TGZ" == "true" ]; then fi $TAR -czf "spark-$VERSION-bin-$NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME" rm -rf "$TARDIR" + if [[ "$MAKE_SPARK_CONNECT" == "true" ]]; then + TARDIR_NAME=spark-$VERSION-bin-$NAME-spark-connect + TARDIR="$SPARK_HOME/$TARDIR_NAME" + rm -rf "$TARDIR" + cp -r "$DISTDIR" "$TARDIR" + sed -i -e '$s/.*/export SPARK_CONNECT_MODE=1\ + &/' "$TARDIR/bin/pyspark" + sed -i -e '$s/.*/export SPARK_CONNECT_MODE=1\ + &/' "$TARDIR/bin/spark-shell" + sed -i -e '$s/.*/export SPARK_CONNECT_MODE=1\ + &/' "$TARDIR/bin/spark-submit" + $TAR -czf "$TARDIR_NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME" + rm -rf "$TARDIR" + fi fi