diff --git a/Dockerfile b/Dockerfile
index a2d1fe50..db2a8e7c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -114,8 +114,8 @@ RUN python -m poetry config virtualenvs.create false \
 
 # copy source code ONLY after installing dependencies
 COPY . /tube
-COPY log4j.properties /spark/conf/log4j.properties
-COPY log4j2.properties /spark/conf/log4j2.properties
+COPY dockers/confs/log4j.properties /spark/conf/log4j.properties
+COPY dockers/confs/log4j2.properties /spark/conf/log4j2.properties
 
 RUN python -m poetry config virtualenvs.create false \
     && python -m poetry install -vv --only main --no-interaction \
diff --git a/log4j.properties b/dockers/confs/log4j.properties
similarity index 66%
rename from log4j.properties
rename to dockers/confs/log4j.properties
index f526d88c..ebf8f562 100644
--- a/log4j.properties
+++ b/dockers/confs/log4j.properties
@@ -3,7 +3,7 @@ log4j.appender.console=org.apache.log4j.ConsoleAppender
 log4j.appender.console.target=System.err
 log4j.appender.console.layout=org.apache.log4j.PatternLayout
 log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %c{1}:%L - %m%n
-log4j.logger.org.apache.spark.executor.CoarseGrainedExecutorBackend=DEBUG
-log4j.logger.org.apache.spark.scheduler.TaskSetManager=DEBUG
-log4j.logger.org.apache.hadoop.fs=DEBUG
-log4j.logger.org.apache.spark.scheduler.DAGScheduler=DEBUG
\ No newline at end of file
+log4j.logger.org.apache.spark.executor.CoarseGrainedExecutorBackend=INFO
+log4j.logger.org.apache.spark.scheduler.TaskSetManager=INFO
+log4j.logger.org.apache.hadoop.fs=INFO
+log4j.logger.org.apache.spark.scheduler.DAGScheduler=INFO
\ No newline at end of file
diff --git a/dockers/spark/worker/log4j2.properties b/dockers/confs/log4j2.properties
similarity index 92%
rename from dockers/spark/worker/log4j2.properties
rename to dockers/confs/log4j2.properties
index ae247a0d..1cd33f64 100644
--- a/dockers/spark/worker/log4j2.properties
+++ b/dockers/confs/log4j2.properties
@@ -47,30 +47,30 @@ logger.jetty1.level = debug
 logger.jetty1.appenderRef.stdout.ref = console
 
 logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
-logger.jetty2.level = debug
+logger.jetty2.level = info
 logger.jetty2.appenderRef.stdout.ref = console
 
 logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
-logger.replexprTyper.level = debug
+logger.replexprTyper.level = info
 logger.replexprTyper.appenderRef.stdout.ref = console
 
 logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
-logger.replSparkILoopInterpreter.level = debug
+logger.replSparkILoopInterpreter.level = info
 logger.replSparkILoopInterpreter.appenderRef.stdout.ref = console
 
 logger.parquet1.name = org.apache.parquet
-logger.parquet1.level = debug
+logger.parquet1.level = info
 logger.parquet1.appenderRef.stdout.ref = console
 
 logger.parquet2.name = parquet
-logger.parquet2.level = debug
+logger.parquet2.level = info
 logger.parquet2.appenderRef.stdout.ref = console
 
 # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
 logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
 logger.RetryingHMSHandler.level = fatal
 logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
-logger.FunctionRegistry.level = debug
+logger.FunctionRegistry.level = info
 
 # For deploying Spark ThriftServer
 # SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
@@ -80,20 +80,20 @@ appender.console.filter.1.onMatch = deny
 appender.console.filter.1.onMismatch = neutral
 
 logger.org.name=org.apache
-logger.org.level=debug
+logger.org.level=info
 logger.org.appenderRef.stdout.ref = console
 
 # Logger configuration for org.apache.spark package
 logger.spark.name = org.apache.spark
-logger.spark.level = debug
+logger.spark.level = info
 logger.spark.appenderRef.stdout.ref = console
 
 # Logger configuration for org.apache.spark.sql package
 logger.sql.name = org.apache.spark.sql
-logger.sql.level = debug
+logger.sql.level = info
 logger.sql.appenderRef.stdout.ref = console
 
 # Logger configuration for py4j package
 logger.py4j.name = py4j
-logger.py4j.level = debug
+logger.py4j.level = info
 logger.py4j.appenderRef.stdout.ref = console
\ No newline at end of file
diff --git a/dockers/spark/master/Dockerfile b/dockers/spark/master/Dockerfile
index a1e418d2..3aff7d7d 100644
--- a/dockers/spark/master/Dockerfile
+++ b/dockers/spark/master/Dockerfile
@@ -31,7 +31,7 @@ RUN python -m pip install dist/tube-1.0.5-py3-none-any.whl
 EXPOSE 8080 7077 6066
 
 COPY dockers/spark/master/master.sh /
-COPY dockers/spark/master/log4j.properties /spark/conf/log4j.properties
-COPY dockers/spark/master/log4j2.properties /spark/conf/log4j2.properties
+COPY dockers/confs/log4j.properties /spark/conf/log4j.properties
+COPY dockers/confs/log4j2.properties /spark/conf/log4j2.properties
 
 CMD ["/bin/bash", "/master.sh"]
diff --git a/dockers/spark/master/log4j.properties b/dockers/spark/master/log4j.properties
deleted file mode 100644
index 15a6c838..00000000
--- a/dockers/spark/master/log4j.properties
+++ /dev/null
@@ -1,9 +0,0 @@
-log4j.rootCategory=INFO, console
-log4j.appender.console=org.apache.log4j.ConsoleAppender
-log4j.appender.console.target=System.err
-log4j.appender.console.layout=org.apache.log4j.PatternLayout
-log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %c{1}:%L - %m%n
-log4j.logger.org.apache.spark.executor.CoarseGrainedExecutorBackend=DEBUG
-log4j.logger.org.apache.spark.scheduler.TaskSetManager=DEBUG
-log4j.logger.org.apache.hadoop.fs=DEBUG
-log4j.logger.org.apache.spark.scheduler.DAGScheduler=DEBUG
\ No newline at end of file
diff --git a/dockers/spark/master/log4j2.properties b/dockers/spark/master/log4j2.properties
deleted file mode 100644
index ae247a0d..00000000
--- a/dockers/spark/master/log4j2.properties
+++ /dev/null
@@ -1,99 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Set everything to be logged to the console
-rootLogger.level = debug
-rootLogger.appenderRef.stdout.ref = console
-
-# In the pattern layout configuration below, we specify an explicit `%ex` conversion
-# pattern for logging Throwables. If this was omitted, then (by default) Log4J would
-# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional
-# class packaging information. That extra information can sometimes add a substantial
-# performance overhead, so we disable it in our default logging config.
-# For more information, see SPARK-39361.
-appender.console.type = Console
-appender.console.name = console
-appender.console.target = SYSTEM_ERR
-appender.console.layout.type = PatternLayout
-appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex
-
-# Set the default spark-shell/spark-sql log level to WARN. When running the
-# spark-shell/spark-sql, the log level for these classes is used to overwrite
-# the root logger's log level, so that the user can have different defaults
-# for the shell and regular Spark apps.
-logger.repl.name = org.apache.spark.repl.Main
-logger.repl.level = warn
-
-logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
-logger.thriftserver.level = warn
-
-# Settings to quiet third party logs that are too verbose
-logger.jetty1.name = org.sparkproject.jetty
-logger.jetty1.level = debug
-logger.jetty1.appenderRef.stdout.ref = console
-
-logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
-logger.jetty2.level = debug
-logger.jetty2.appenderRef.stdout.ref = console
-
-logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
-logger.replexprTyper.level = debug
-logger.replexprTyper.appenderRef.stdout.ref = console
-
-logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
-logger.replSparkILoopInterpreter.level = debug
-logger.replSparkILoopInterpreter.appenderRef.stdout.ref = console
-
-logger.parquet1.name = org.apache.parquet
-logger.parquet1.level = debug
-logger.parquet1.appenderRef.stdout.ref = console
-
-logger.parquet2.name = parquet
-logger.parquet2.level = debug
-logger.parquet2.appenderRef.stdout.ref = console
-
-# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
-logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
-logger.RetryingHMSHandler.level = fatal
-logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
-logger.FunctionRegistry.level = debug
-
-# For deploying Spark ThriftServer
-# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
-appender.console.filter.1.type = RegexFilter
-appender.console.filter.1.regex = .*Thrift error occurred during processing of message.*
-appender.console.filter.1.onMatch = deny
-appender.console.filter.1.onMismatch = neutral
-
-logger.org.name=org.apache
-logger.org.level=debug
-logger.org.appenderRef.stdout.ref = console
-
-# Logger configuration for org.apache.spark package
-logger.spark.name = org.apache.spark
-logger.spark.level = debug
-logger.spark.appenderRef.stdout.ref = console
-
-# Logger configuration for org.apache.spark.sql package
-logger.sql.name = org.apache.spark.sql
-logger.sql.level = debug
-logger.sql.appenderRef.stdout.ref = console
-
-# Logger configuration for py4j package
-logger.py4j.name = py4j
-logger.py4j.level = debug
-logger.py4j.appenderRef.stdout.ref = console
\ No newline at end of file
diff --git a/dockers/spark/submit/Dockerfile b/dockers/spark/submit/Dockerfile
deleted file mode 100644
index cf6560ac..00000000
--- a/dockers/spark/submit/Dockerfile
+++ /dev/null
@@ -1,10 +0,0 @@
-FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3
-
-LABEL maintainer="Gezim Sejdiu , Giannis Mouchakis "
-
-ENV SPARK_MASTER_NAME spark-master
-ENV SPARK_MASTER_PORT 7077
-
-COPY submit.sh /
-
-CMD ["/bin/bash", "/submit.sh"]
diff --git a/dockers/spark/submit/submit.sh b/dockers/spark/submit/submit.sh
deleted file mode 100644
index 93cf9b27..00000000
--- a/dockers/spark/submit/submit.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-export SPARK_MASTER_URL=spark://${SPARK_MASTER_NAME}:${SPARK_MASTER_PORT}
-export SPARK_HOME=/spark
-
-/wait-for-step.sh
-/execute-step.sh
-
-if [ ! -z "${SPARK_APPLICATION_JAR_LOCATION}" ]; then
-    echo "Submit application ${SPARK_APPLICATION_JAR_LOCATION} with main class ${SPARK_APPLICATION_MAIN_CLASS} to Spark master ${SPARK_MASTER_URL}"
-    echo "Passing arguments ${SPARK_APPLICATION_ARGS}"
-    /${SPARK_HOME}/bin/spark-submit \
-        --class ${SPARK_APPLICATION_MAIN_CLASS} \
-        --master ${SPARK_MASTER_URL} \
-        ${SPARK_SUBMIT_ARGS} \
-        ${SPARK_APPLICATION_JAR_LOCATION} ${SPARK_APPLICATION_ARGS}
-else
-    if [ ! -z "${SPARK_APPLICATION_PYTHON_LOCATION}" ]; then
-        echo "Submit application ${SPARK_APPLICATION_PYTHON_LOCATION} to Spark master ${SPARK_MASTER_URL}"
-        echo "Passing arguments ${SPARK_APPLICATION_ARGS}"
-        PYSPARK_PYTHON=python3 /spark/bin/spark-submit \
-            --master ${SPARK_MASTER_URL} \
-            ${SPARK_SUBMIT_ARGS} \
-            ${SPARK_APPLICATION_PYTHON_LOCATION} ${SPARK_APPLICATION_ARGS}
-    else
-        echo "Not recognized application."
-    fi
-fi
-
-/finish-step.sh
\ No newline at end of file
diff --git a/dockers/spark/worker/Dockerfile b/dockers/spark/worker/Dockerfile
index 6ed01b38..07c6bb02 100644
--- a/dockers/spark/worker/Dockerfile
+++ b/dockers/spark/worker/Dockerfile
@@ -30,7 +30,7 @@ RUN python -m pip install dist/tube-1.0.5-py3-none-any.whl
 EXPOSE 8081
 
 COPY dockers/spark/worker/worker.sh /
-COPY dockers/spark/worker/log4j.properties /spark/conf/log4j.properties
-COPY dockers/spark/worker/log4j2.properties /spark/conf/log4j2.properties
+COPY dockers/confs/log4j.properties /spark/conf/log4j.properties
+COPY dockers/confs/log4j2.properties /spark/conf/log4j2.properties
 
 CMD ["/bin/bash", "/worker.sh"]
\ No newline at end of file
diff --git a/dockers/spark/worker/log4j.properties b/dockers/spark/worker/log4j.properties
deleted file mode 100644
index 15a6c838..00000000
--- a/dockers/spark/worker/log4j.properties
+++ /dev/null
@@ -1,9 +0,0 @@
-log4j.rootCategory=INFO, console
-log4j.appender.console=org.apache.log4j.ConsoleAppender
-log4j.appender.console.target=System.err
-log4j.appender.console.layout=org.apache.log4j.PatternLayout
-log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %c{1}:%L - %m%n
-log4j.logger.org.apache.spark.executor.CoarseGrainedExecutorBackend=DEBUG
-log4j.logger.org.apache.spark.scheduler.TaskSetManager=DEBUG
-log4j.logger.org.apache.hadoop.fs=DEBUG
-log4j.logger.org.apache.spark.scheduler.DAGScheduler=DEBUG
\ No newline at end of file
diff --git a/log4j2.properties b/log4j2.properties
deleted file mode 100644
index ae247a0d..00000000
--- a/log4j2.properties
+++ /dev/null
@@ -1,99 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Set everything to be logged to the console
-rootLogger.level = debug
-rootLogger.appenderRef.stdout.ref = console
-
-# In the pattern layout configuration below, we specify an explicit `%ex` conversion
-# pattern for logging Throwables. If this was omitted, then (by default) Log4J would
-# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional
-# class packaging information. That extra information can sometimes add a substantial
-# performance overhead, so we disable it in our default logging config.
-# For more information, see SPARK-39361.
-appender.console.type = Console
-appender.console.name = console
-appender.console.target = SYSTEM_ERR
-appender.console.layout.type = PatternLayout
-appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex
-
-# Set the default spark-shell/spark-sql log level to WARN. When running the
-# spark-shell/spark-sql, the log level for these classes is used to overwrite
-# the root logger's log level, so that the user can have different defaults
-# for the shell and regular Spark apps.
-logger.repl.name = org.apache.spark.repl.Main
-logger.repl.level = warn
-
-logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
-logger.thriftserver.level = warn
-
-# Settings to quiet third party logs that are too verbose
-logger.jetty1.name = org.sparkproject.jetty
-logger.jetty1.level = debug
-logger.jetty1.appenderRef.stdout.ref = console
-
-logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
-logger.jetty2.level = debug
-logger.jetty2.appenderRef.stdout.ref = console
-
-logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
-logger.replexprTyper.level = debug
-logger.replexprTyper.appenderRef.stdout.ref = console
-
-logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
-logger.replSparkILoopInterpreter.level = debug
-logger.replSparkILoopInterpreter.appenderRef.stdout.ref = console
-
-logger.parquet1.name = org.apache.parquet
-logger.parquet1.level = debug
-logger.parquet1.appenderRef.stdout.ref = console
-
-logger.parquet2.name = parquet
-logger.parquet2.level = debug
-logger.parquet2.appenderRef.stdout.ref = console
-
-# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
-logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
-logger.RetryingHMSHandler.level = fatal
-logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
-logger.FunctionRegistry.level = debug
-
-# For deploying Spark ThriftServer
-# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
-appender.console.filter.1.type = RegexFilter
-appender.console.filter.1.regex = .*Thrift error occurred during processing of message.*
-appender.console.filter.1.onMatch = deny
-appender.console.filter.1.onMismatch = neutral
-
-logger.org.name=org.apache
-logger.org.level=debug
-logger.org.appenderRef.stdout.ref = console
-
-# Logger configuration for org.apache.spark package
-logger.spark.name = org.apache.spark
-logger.spark.level = debug
-logger.spark.appenderRef.stdout.ref = console
-
-# Logger configuration for org.apache.spark.sql package
-logger.sql.name = org.apache.spark.sql
-logger.sql.level = debug
-logger.sql.appenderRef.stdout.ref = console
-
-# Logger configuration for py4j package
-logger.py4j.name = py4j
-logger.py4j.level = debug
-logger.py4j.appenderRef.stdout.ref = console
\ No newline at end of file
diff --git a/tube/settings.py b/tube/settings.py
index f798ffa5..3a8a486c 100644
--- a/tube/settings.py
+++ b/tube/settings.py
@@ -85,6 +85,7 @@ SPARK_MASTER = os.getenv("SPARK_MASTER", "local[1]")  # 'spark-service'
 SPARK_EXECUTOR_MEMORY = os.getenv("SPARK_EXECUTOR_MEMORY", "2g")
 SPARK_DRIVER_MEMORY = os.getenv("SPARK_DRIVER_MEMORY", "512m")
+SPARK_DRIVER_HOST = os.getenv("SPARK_DRIVER_HOST", "tube")
 
 APP_NAME = "Gen3 ETL"
 
 os.environ[
diff --git a/tube/utils/spark.py b/tube/utils/spark.py
index 02d1cfb5..7c74d266 100644
--- a/tube/utils/spark.py
+++ b/tube/utils/spark.py
@@ -20,6 +20,8 @@ def make_spark_context(tube_config):
         SparkConf()
         .set("spark.executor.memory", tube_config.SPARK_EXECUTOR_MEMORY)
         .set("spark.driver.memory", tube_config.SPARK_DRIVER_MEMORY)
+        .set("spark.driver.host", tube_config.SPARK_DRIVER_HOST)
+        .set("spark.driver.port", "4040")
         .set("spark.python.profile", "false")
         .set("spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:/spark/conf/log4j.properties")
         # .set("spark.submit.pyFiles", "/tube")
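
Note on the final two hunks: setting spark.driver.host and pinning spark.driver.port makes the PySpark driver reachable from standalone-cluster executors when it runs inside a container, where the auto-detected hostname and random ephemeral port would not be routable. A minimal sketch of how the new setting flows from the environment into the SparkConf, assuming only what the hunks above show; the make_conf helper and the __main__ harness are illustrative, not the repo's exact code:

# Illustrative sketch: mirrors the wiring added in tube/settings.py and
# tube/utils/spark.py. Requires pyspark and a resolvable driver hostname.
import os

from pyspark import SparkConf, SparkContext

# From tube/settings.py: the driver advertises this hostname to executors;
# "tube" is the diff's default (the tube container/service name).
SPARK_DRIVER_HOST = os.getenv("SPARK_DRIVER_HOST", "tube")


def make_conf():
    # Hypothetical helper name; builds the conf the way make_spark_context does.
    return (
        SparkConf()
        .setAppName("Gen3 ETL")
        .set("spark.driver.host", SPARK_DRIVER_HOST)  # address executors connect back to
        .set("spark.driver.port", "4040")  # fixed port, so it can be exposed on the container
    )


if __name__ == "__main__":
    sc = SparkContext(conf=make_conf())
    print(sc.getConf().get("spark.driver.host"))  # "tube" unless overridden
    sc.stop()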