
add spark master and worker image build
thanh-nguyen-dang committed Jul 1, 2024
1 parent af235d9 commit c51a77f
Showing 10 changed files with 181 additions and 4 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/image_build_and_test.yaml
@@ -12,6 +12,34 @@ jobs:
ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
QUAY_ROBOT_TOKEN: ${{ secrets.QUAY_ROBOT_TOKEN }}
build-master:
name: spark master
uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
with:
OVERRIDE_REPO_NAME: spark-master
OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
DOCKERFILE_LOCATION: "./dockers/spark/master/Dockerfile"
DOCKERFILE_BUILD_CONTEXT: "./dockers/spark/master"
USE_QUAY_ONLY: true
secrets:
ECR_AWS_ACCESS_KEY_ID: ${{ secrets.ECR_AWS_ACCESS_KEY_ID }}
ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
QUAY_ROBOT_TOKEN: ${{ secrets.QUAY_ROBOT_TOKEN }}
build-worker:
name: spark worker
uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
with:
OVERRIDE_REPO_NAME: spark-worker
OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
DOCKERFILE_LOCATION: "./dockers/spark/worker/Dockerfile"
DOCKERFILE_BUILD_CONTEXT: "./dockers/spark/worker"
USE_QUAY_ONLY: true
secrets:
ECR_AWS_ACCESS_KEY_ID: ${{ secrets.ECR_AWS_ACCESS_KEY_ID }}
ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
QUAY_USERNAME: ${{ secrets.QUAY_USERNAME }}
QUAY_ROBOT_TOKEN: ${{ secrets.QUAY_ROBOT_TOKEN }}

# then run the tests
test:
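
For reference, a rough local equivalent of what the build-master job asks the shared workflow to do: a sketch only, assuming image_build_push.yaml amounts to a standard docker build and push to Quay with the inputs given above (the actual steps live in uc-cdis/.github):

# assumption: the reusable workflow roughly performs a build + push like this
docker build \
  -f ./dockers/spark/master/Dockerfile \
  -t quay.io/cdis/spark-master:3.3.0-hadoop3.3 \
  ./dockers/spark/master
docker push quay.io/cdis/spark-master:3.3.0-hadoop3.3

The build-worker job is identical up to the repo name and Dockerfile paths.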
31 changes: 31 additions & 0 deletions dockers/spark/master/Dockerfile
@@ -0,0 +1,31 @@
FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3

LABEL maintainer="Gezim Sejdiu <g.sejdiu@gmail.com>, Giannis Mouchakis <gmouchakis@gmail.com>"

ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_MASTER_LOG /spark/logs

RUN mkdir /tube
WORKDIR /tube

# copy ONLY the poetry artifacts and install the dependencies, but not the application itself
# this makes sure that the dependency layer stays cached between builds
COPY ../../../poetry.lock ../../../pyproject.toml /tube/
RUN python -m poetry config virtualenvs.create false \
&& python -m poetry install -vv --no-root --only main --no-interaction \
&& python -m poetry show -v

# copy source code ONLY after installing dependencies
COPY ../../../tube /tube/

RUN python -m poetry config virtualenvs.create false \
&& python -m poetry install -vv --only main --no-interaction \
&& python -m poetry show -v

EXPOSE 8080 7077 6066

COPY master.sh /

CMD ["/bin/bash", "/master.sh"]

30 changes: 30 additions & 0 deletions dockers/spark/master/master.sh
@@ -0,0 +1,30 @@
#!/bin/bash

export SPARK_MASTER_HOST=${SPARK_MASTER_HOST:-`hostname`}

export SPARK_HOME=/spark

. "/spark/sbin/spark-config.sh"

. "/spark/bin/load-spark-env.sh"

mkdir -p $SPARK_MASTER_LOG

ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out

cp /spark/conf/log4j2.properties.template /spark/conf/log4j2.properties
sed -i 's/= info/= DEBUG/g' /spark/conf/log4j2.properties
echo "logger.org.apache.spark=DEBUG" >> /spark/conf/log4j2.properties

function addConfig() {
local path=$1
local name=$2
local value=$3

local entry="<property><name>$name</name><value>${value}</value></property>"
local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}

cd /spark/bin && /spark/sbin/../bin/spark-class -Dlog4j.configuration=file:/spark/conf/log4j2.properties org.apache.spark.deploy.master.Master \
--ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out
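
The addConfig helper is defined but never called in this script; a hypothetical invocation would append a property to a Hadoop-style XML file, for example:

# hypothetical usage of addConfig (file path and property are illustrative)
addConfig /spark/conf/core-site.xml fs.defaultFS hdfs://namenode:8020
# inserts <property><name>fs.defaultFS</name><value>hdfs://namenode:8020</value></property>
# immediately before the closing </configuration> tag of the target file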
10 changes: 10 additions & 0 deletions dockers/spark/submit/Dockerfile
@@ -0,0 +1,10 @@
FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3

LABEL maintainer="Gezim Sejdiu <g.sejdiu@gmail.com>, Giannis Mouchakis <gmouchakis@gmail.com>"

ENV SPARK_MASTER_NAME spark-master
ENV SPARK_MASTER_PORT 7077

COPY submit.sh /

CMD ["/bin/bash", "/submit.sh"]
30 changes: 30 additions & 0 deletions dockers/spark/submit/submit.sh
@@ -0,0 +1,30 @@
#!/bin/bash

export SPARK_MASTER_URL=spark://${SPARK_MASTER_NAME}:${SPARK_MASTER_PORT}
export SPARK_HOME=/spark

/wait-for-step.sh
/execute-step.sh

if [ ! -z "${SPARK_APPLICATION_JAR_LOCATION}" ]; then
echo "Submit application ${SPARK_APPLICATION_JAR_LOCATION} with main class ${SPARK_APPLICATION_MAIN_CLASS} to Spark master ${SPARK_MASTER_URL}"
echo "Passing arguments ${SPARK_APPLICATION_ARGS}"
    ${SPARK_HOME}/bin/spark-submit \
--class ${SPARK_APPLICATION_MAIN_CLASS} \
--master ${SPARK_MASTER_URL} \
${SPARK_SUBMIT_ARGS} \
${SPARK_APPLICATION_JAR_LOCATION} ${SPARK_APPLICATION_ARGS}
else
if [ ! -z "${SPARK_APPLICATION_PYTHON_LOCATION}" ]; then
echo "Submit application ${SPARK_APPLICATION_PYTHON_LOCATION} to Spark master ${SPARK_MASTER_URL}"
echo "Passing arguments ${SPARK_APPLICATION_ARGS}"
PYSPARK_PYTHON=python3 /spark/bin/spark-submit \
--master ${SPARK_MASTER_URL} \
${SPARK_SUBMIT_ARGS} \
${SPARK_APPLICATION_PYTHON_LOCATION} ${SPARK_APPLICATION_ARGS}
else
echo "Not recognized application."
fi
fi

/finish-step.sh
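
The submit image is driven entirely by environment variables; a sketch of a run against the master follows (image tag and application paths are illustrative, since this commit's workflow only publishes master and worker images):

# illustrative invocation of the submit container for a PySpark application
docker run --rm \
  -e SPARK_MASTER_NAME=spark-master \
  -e SPARK_MASTER_PORT=7077 \
  -e SPARK_APPLICATION_PYTHON_LOCATION=/app/etl.py \
  -e SPARK_APPLICATION_ARGS="--config /app/config.json" \
  spark-submit:local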
30 changes: 30 additions & 0 deletions dockers/spark/worker/Dockerfile
@@ -0,0 +1,30 @@
FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3

LABEL maintainer="Gezim Sejdiu <g.sejdiu@gmail.com>, Giannis Mouchakis <gmouchakis@gmail.com>"

ENV SPARK_WORKER_WEBUI_PORT 8081
ENV SPARK_WORKER_LOG /spark/logs
ENV SPARK_MASTER "spark://spark-master:7077"

RUN mkdir /tube
WORKDIR /tube

# copy ONLY the poetry artifacts and install the dependencies, but not the application itself
# this makes sure that the dependency layer stays cached between builds
COPY ../../../poetry.lock ../../../pyproject.toml /tube/
RUN python -m poetry config virtualenvs.create false \
&& python -m poetry install -vv --no-root --only main --no-interaction \
&& python -m poetry show -v

# copy source code ONLY after installing dependencies
COPY ../../../tube /tube

RUN python -m poetry config virtualenvs.create false \
&& python -m poetry install -vv --only main --no-interaction \
&& python -m poetry show -v

EXPOSE 8081

COPY worker.sh /

CMD ["/bin/bash", "/worker.sh"]
18 changes: 18 additions & 0 deletions dockers/spark/worker/worker.sh
@@ -0,0 +1,18 @@
#!/bin/bash

export SPARK_HOME=/spark

. "/spark/sbin/spark-config.sh"

. "/spark/bin/load-spark-env.sh"

mkdir -p $SPARK_WORKER_LOG

ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out

cp /spark/conf/log4j2.properties.template /spark/conf/log4j2.properties
sed -i 's/= info/= DEBUG/g' /spark/conf/log4j2.properties
echo "logger.org.apache.spark=DEBUG" >> /spark/conf/log4j2.properties

/spark/sbin/../bin/spark-class -Dlog4j.configuration=file:/spark/conf/log4j2.properties org.apache.spark.deploy.worker.Worker \
--webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out
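
A worker only needs network reachability to the master plus the SPARK_MASTER URL; a sketch of pairing the two images on one Docker network (assuming the workflow above publishes them under these Quay names):

# wire a worker to the master over a user-defined bridge network
docker network create spark-net
docker run -d --network spark-net --name spark-master \
  quay.io/cdis/spark-master:3.3.0-hadoop3.3
docker run -d --network spark-net -p 8081:8081 \
  -e SPARK_MASTER=spark://spark-master:7077 \
  quay.io/cdis/spark-worker:3.3.0-hadoop3.3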
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -36,5 +36,5 @@ moto="1.1.24"
pyspark-stubs="*"

[build-system]
requires = ["setuptools=65.3.0","poetry>=1.1.0"]
requires = ["setuptools==65.3.0","poetry>=1.1.0"]
build-backend = "poetry.masonry.api"
4 changes: 1 addition & 3 deletions tube/settings.py
@@ -89,7 +89,5 @@

os.environ[
"PYSPARK_SUBMIT_ARGS"
] = "--jars {}/dist/elasticsearch-spark-20_2.11-{}.jar pyspark-shell".format(
ES_HADOOP_HOME_BIN, ES_HADOOP_VERSION
)
] = f"--jars {ES_HADOOP_HOME_BIN}/dist/elasticsearch-spark-20_2.11-{ES_HADOOP_VERSION}.jar pyspark-shell"
os.environ["HADOOP_CLIENT_OPTS"] = os.getenv("HADOOP_CLIENT_OPTS", "")
2 changes: 2 additions & 0 deletions tube/utils/spark.py
@@ -21,6 +21,8 @@ def make_spark_context(tube_config):
.set("spark.executor.memory", tube_config.SPARK_EXECUTOR_MEMORY)
.set("spark.driver.memory", tube_config.SPARK_DRIVER_MEMORY)
.set("spark.python.profile", "false")
.set("spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:/spark/conf/log4j.properties")
.set("spark.submit.pyFiles", "/tube")
.setAppName(config.APP_NAME)
)
if tube_config.RUNNING_MODE == enums.RUNNING_MODE_DEV:
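
The two added SparkConf settings have direct spark-submit counterparts; a sketch of the equivalent command line (the job script name is illustrative):

# equivalent flags to the two .set(...) calls added above
spark-submit \
  --conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=file:/spark/conf/log4j.properties" \
  --py-files /tube \
  run_etl.py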
