From 4acc114f7b414dcb8d3dc3e9dd2604e4814b6f3a Mon Sep 17 00:00:00 2001
From: Thanh Nguyen
Date: Mon, 1 Jul 2024 12:35:26 -0500
Subject: [PATCH] add spark master and worker image build

---
 .github/workflows/image_build_and_test.yaml | 30 +++++++++++++++++++++
 tube/settings.py                            |  4 +--
 tube/utils/spark.py                         |  2 ++
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/image_build_and_test.yaml b/.github/workflows/image_build_and_test.yaml
index e3b3276f..b91118fa 100644
--- a/.github/workflows/image_build_and_test.yaml
+++ b/.github/workflows/image_build_and_test.yaml
@@ -12,6 +12,36 @@ jobs:
       ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
       QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
       QUAY_ROBOT_TOKEN: ${{ secrets.QUAY_ROBOT_TOKEN }}
+  build-master:
+    name: spark master
+    uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
+    needs: [build-spark-base]
+    with:
+      OVERRIDE_REPO_NAME: spark-master
+      OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
+      DOCKERFILE_LOCATION: "./dockers/spark/master/Dockerfile"
+      DOCKERFILE_BUILD_CONTEXT: "./spark/master"
+      USE_QUAY_ONLY: true
+    secrets:
+      ECR_AWS_ACCESS_KEY_ID: ${{ secrets.ECR_AWS_ACCESS_KEY_ID }}
+      ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
+      QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
+      QUAY_ROBOT_TOKEN: ${{ secrets.QUAY_ROBOT_TOKEN }}
+  build-worker:
+    name: spark worker
+    uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
+    needs: [build-spark-base]
+    with:
+      OVERRIDE_REPO_NAME: spark-worker
+      OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
+      DOCKERFILE_LOCATION: "./dockers/spark/worker/Dockerfile"
+      DOCKERFILE_BUILD_CONTEXT: "./spark/worker"
+      USE_QUAY_ONLY: true
+    secrets:
+      ECR_AWS_ACCESS_KEY_ID: ${{ secrets.ECR_AWS_ACCESS_KEY_ID }}
+      ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
+      QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
+      QUAY_ROBOT_TOKEN: ${{ secrets.QUAY_ROBOT_TOKEN }}
 
   # then run the tests
   test:
diff --git a/tube/settings.py b/tube/settings.py
index 8f36f877..f798ffa5 100644
--- a/tube/settings.py
+++ b/tube/settings.py
@@ -89,7 +89,5 @@
 os.environ[
     "PYSPARK_SUBMIT_ARGS"
-] = "--jars {}/dist/elasticsearch-spark-20_2.11-{}.jar pyspark-shell".format(
-    ES_HADOOP_HOME_BIN, ES_HADOOP_VERSION
-)
+] = f"--jars {ES_HADOOP_HOME_BIN}/dist/elasticsearch-spark-20_2.11-{ES_HADOOP_VERSION}.jar pyspark-shell"
 os.environ["HADOOP_CLIENT_OPTS"] = os.getenv("HADOOP_CLIENT_OPTS", "")
 
diff --git a/tube/utils/spark.py b/tube/utils/spark.py
index 61e02a5c..2197f31e 100644
--- a/tube/utils/spark.py
+++ b/tube/utils/spark.py
@@ -21,6 +21,8 @@ def make_spark_context(tube_config):
         .set("spark.executor.memory", tube_config.SPARK_EXECUTOR_MEMORY)
         .set("spark.driver.memory", tube_config.SPARK_DRIVER_MEMORY)
         .set("spark.python.profile", "false")
+        .set("spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:/spark/conf/log4j.properties")
+        .set("spark.submit.pyFiles", "/tube")
         .setAppName(config.APP_NAME)
     )
     if tube_config.RUNNING_MODE == enums.RUNNING_MODE_DEV:
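
The tube/settings.py hunk is a pure refactor: the f-string builds the same
PYSPARK_SUBMIT_ARGS value as the old .format() call. A standalone sanity
check (not part of the patch; the two values below are hypothetical, not
tube's real defaults):

    # Both forms must produce the same submit args; values are made up.
    ES_HADOOP_HOME_BIN = "/usr/local/es-hadoop"
    ES_HADOOP_VERSION = "6.8.3"

    old = "--jars {}/dist/elasticsearch-spark-20_2.11-{}.jar pyspark-shell".format(
        ES_HADOOP_HOME_BIN, ES_HADOOP_VERSION
    )
    new = f"--jars {ES_HADOOP_HOME_BIN}/dist/elasticsearch-spark-20_2.11-{ES_HADOOP_VERSION}.jar pyspark-shell"
    assert old == new  # identical output, so no behavior change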
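The two new entries in tube/utils/spark.py point the executor JVMs at the
log4j.properties expected under /spark/conf in the new master/worker images,
and list /tube as code to put on the executors' PYTHONPATH. A minimal sketch
of the same SparkConf chain outside tube, assuming a plain pyspark install;
the master URL, memory sizes, and app name are placeholders, not tube's
actual configuration:

    # Sketch only: mirrors the shape of make_spark_context() in
    # tube/utils/spark.py without tube's config objects.
    from pyspark import SparkConf, SparkContext

    conf = (
        SparkConf()
        .setMaster("local[*]")               # local mode for the sketch
        .set("spark.executor.memory", "4g")  # placeholder
        .set("spark.driver.memory", "2g")    # placeholder
        .set("spark.python.profile", "false")
        # use the log4j config shipped inside the Spark images
        .set(
            "spark.executor.extraJavaOptions",
            "-Dlog4j.configuration=file:/spark/conf/log4j.properties",
        )
        # make the application code importable on the executors
        .set("spark.submit.pyFiles", "/tube")
        .setAppName("tube-etl")              # placeholder app name
    )
    sc = SparkContext(conf=conf)
    print(sc.getConf().get("spark.executor.extraJavaOptions"))
    sc.stop()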