Skip to content

Commit

Permalink
try to use spark base image
Browse files Browse the repository at this point in the history
  • Loading branch information
thanh-nguyen-dang committed Jul 15, 2024
1 parent 6348e1f commit f71e4d0
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 71 deletions.
67 changes: 6 additions & 61 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,45 +1,15 @@
FROM quay.io/cdis/python:python3.9-buster-stable
FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3

ENV DEBIAN_FRONTEND=noninteractive \
SQOOP_VERSION="1.4.7" \
HADOOP_VERSION="3.3.2" \
ES_HADOOP_VERSION="8.3.3" \
MAVEN_ES_URL="https://search.maven.org/remotecontent?filepath=org/elasticsearch" \
ES_SPARK_30_2_12="elasticsearch-spark-30_2.12" \
ES_SPARK_20_2_11="elasticsearch-spark-20_2.11"
ENV MAVEN_ES_SPARK_VERSION="${MAVEN_ES_URL}/${ES_SPARK_30_2_12}/${ES_HADOOP_VERSION}/${ES_SPARK_30_2_12}-${ES_HADOOP_VERSION}"
ENV SQOOP_INSTALLATION_URL="http://archive.apache.org/dist/sqoop/${SQOOP_VERSION}/sqoop-${SQOOP_VERSION}.bin__hadoop-2.6.0.tar.gz" \
HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
ES_HADOOP_INSTALLATION_URL="https://artifacts.elastic.co/downloads/elasticsearch-hadoop/elasticsearch-hadoop-${ES_HADOOP_VERSION}.zip" \
SQOOP_HOME="/sqoop" \
HADOOP_HOME="/hadoop" \
ES_HADOOP_HOME="/es-hadoop" \
JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"
ENV SQOOP_HOME="/sqoop" \
SQOOP_VERSION="1.4.7"

ENV SQOOP_INSTALLATION_URL="http://archive.apache.org/dist/sqoop/${SQOOP_VERSION}/sqoop-${SQOOP_VERSION}.bin__hadoop-2.6.0.tar.gz"
ENV ES_HADOOP_HOME_VERSION="${ES_HADOOP_HOME}/elasticsearch-hadoop-${ES_HADOOP_VERSION}"
RUN mkdir -p /usr/share/man/man1
RUN mkdir -p /usr/share/man/man7
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
openjdk-11-jdk-headless \
# dependency for psycopg2, which is a dependency of the SQLAlchemy PostgreSQL engine
libpq-dev \
postgresql-client \
wget \
unzip \
git \
# dependency for cryptography
libffi-dev \
# dependency for cryptography
libssl-dev \
libssl1.1 \
libgnutls30 \
vim \
curl \
g++ \
&& rm -rf /var/lib/apt/lists/*

RUN python -m pip install --upgrade pip poetry requests

RUN wget ${SQOOP_INSTALLATION_URL} \
&& mkdir -p $SQOOP_HOME \
&& tar -xvf sqoop-${SQOOP_VERSION}.bin__hadoop-2.6.0.tar.gz -C ${SQOOP_HOME} --strip-components 1 \
Expand All @@ -52,32 +22,7 @@ RUN wget https://dlcdn.apache.org//commons/lang/binaries/commons-lang-2.6-bin.ta
&& rm commons-lang-2.6-bin.tar.gz \
&& mv commons-lang-2.6/commons-lang-2.6.jar $SQOOP_HOME/lib/

RUN wget ${HADOOP_INSTALLATION_URL} \
&& mkdir -p $HADOOP_HOME \
&& tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1 \
&& rm hadoop-${HADOOP_VERSION}.tar.gz \
&& rm -rf $HADOOP_HOME/share/doc
RUN wget ${ES_HADOOP_INSTALLATION_URL} \
&& mkdir -p $ES_HADOOP_HOME \
&& unzip elasticsearch-hadoop-${ES_HADOOP_VERSION}.zip -d ${ES_HADOOP_HOME} \
&& rm elasticsearch-hadoop-${ES_HADOOP_VERSION}.zip
RUN wget ${MAVEN_ES_SPARK_VERSION}.jar -O ${ES_HADOOP_HOME_VERSION}/dist/${ES_SPARK_20_2_11}-${ES_HADOOP_VERSION}.jar
RUN wget ${MAVEN_ES_SPARK_VERSION}-javadoc.jar -O ${ES_HADOOP_HOME_VERSION}/dist/${ES_SPARK_20_2_11}-${ES_HADOOP_VERSION}-javadoc.jar
RUN wget ${MAVEN_ES_SPARK_VERSION}-sources.jar -O ${ES_HADOOP_HOME_VERSION}/dist/${ES_SPARK_20_2_11}-${ES_HADOOP_VERSION}-sources.jar
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
HADOOP_MAPRED_HOME=$HADOOP_HOME \
HADOOP_COMMON_HOME=$HADOOP_HOME \
HADOOP_HDFS_HOME=$HADOOP_HOME \
YARN_HOME=$HADOOP_HOME \
ACCUMULO_HOME=/accumulo \
HIVE_HOME=/hive \
HBASE_HOME=/hbase \
HCAT_HOME=/hcatalog \
ZOOKEEPER_HOME=/zookeeper \
HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native \
LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
RUN mkdir -p $ACCUMULO_HOME $HIVE_HOME $HBASE_HOME $HCAT_HOME $ZOOKEEPER_HOME
ENV PATH=${SQOOP_HOME}/bin:${HADOOP_HOME}/sbin:$HADOOP_HOME/bin:${JAVA_HOME}/bin:${PATH}
ENV PATH=${SQOOP_HOME}/bin:${PATH}

WORKDIR /tube

Expand Down
10 changes: 5 additions & 5 deletions dockers/confs/log4j.properties
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
log4j.rootCategory=DEBUG, console
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %c{1}:%L - %m%n
log4j.logger.org.apache.spark.executor.CoarseGrainedExecutorBackend=DEBUG
log4j.logger.org.apache.spark.scheduler.TaskSetManager=DEBUG
log4j.logger.org.apache.hadoop.fs=DEBUG
log4j.logger.org.apache.spark.scheduler.DAGScheduler=DEBUG
log4j.logger.org.apache.spark.executor.CoarseGrainedExecutorBackend=INFO
log4j.logger.org.apache.spark.scheduler.TaskSetManager=INFO
log4j.logger.org.apache.hadoop.fs=INFO
log4j.logger.org.apache.spark.scheduler.DAGScheduler=INFO
10 changes: 5 additions & 5 deletions dockers/confs/log4j2.properties
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#

# Set everything to be logged to the console
rootLogger.level = debug
rootLogger.level = info
rootLogger.appenderRef.stdout.ref = console

# In the pattern layout configuration below, we specify an explicit `%ex` conversion
Expand All @@ -43,7 +43,7 @@ logger.thriftserver.level = warn

# Settings to quiet third party logs that are too verbose
logger.jetty1.name = org.sparkproject.jetty
logger.jetty1.level = debug
logger.jetty1.level = info
logger.jetty1.appenderRef.stdout.ref = console

logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
Expand Down Expand Up @@ -80,12 +80,12 @@ appender.console.filter.1.onMatch = deny
appender.console.filter.1.onMismatch = neutral

logger.org.name=org.apache
logger.org.level=debug
logger.org.level=info
logger.org.appenderRef.stdout.ref = console

# Logger configuration for org.apache.spark package
logger.spark.name = org.apache.spark
logger.spark.level = debug
logger.spark.level = info
logger.spark.appenderRef.stdout.ref = console

# Logger configuration for org.apache.spark.sql package
Expand All @@ -95,5 +95,5 @@ logger.sql.appenderRef.stdout.ref = console

# Logger configuration for py4j package
logger.py4j.name = py4j
logger.py4j.level = debug
logger.py4j.level = info
logger.py4j.appenderRef.stdout.ref = console

0 comments on commit f71e4d0

Please sign in to comment.