This repository has been archived by the owner on Aug 23, 2024. It is now read-only.

Cleanup shell scripts and Dockerfile (#4)
* Fix shellcheck errors

* Cleanup all shell scripts with shfmt and shellcheck

* Lint Dockerfile
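
The commit message names the shell tools but not the invocations; a cleanup like this is typically driven by commands along these lines (a hedged sketch — shellcheck and shfmt come from the commit message, but the exact flags, the file glob, and the choice of hadolint for the Dockerfile step are assumptions, not taken from the repository):

#!/bin/bash
set -euo pipefail

# Static-analyze every tracked shell script
git ls-files '*.sh' | xargs shellcheck

# Rewrite scripts in place with shfmt
git ls-files '*.sh' | xargs shfmt -w

# Lint the Dockerfile with a Dockerfile linter such as hadolint
hadolint Dockerfile
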
dfsnow authored Aug 21, 2023
1 parent 8e8c5fb commit 980d904
Showing 8 changed files with 192 additions and 188 deletions.
76 changes: 39 additions & 37 deletions Dockerfile
@@ -4,6 +4,7 @@

# Fresh base image
FROM centos:7 AS hadoop
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

# Development tools
RUN yum install -y curl which tar bunzip2 openssh-server openssh-clients rsync && \
@@ -12,9 +13,10 @@ RUN yum install -y curl which tar bunzip2 openssh-server openssh-clients rsync &

# JDK8
ARG JAVA_VER=1.8.0
ENV JAVA_HOME /usr/lib/jvm/java-${JAVA_VER}-openjdk
ENV PATH ${PATH}:${JAVA_HOME}/bin
RUN yum install -y java-${JAVA_VER}-openjdk-devel
ENV JAVA_HOME /usr/lib/jvm/java-"$JAVA_VER"-openjdk
ENV PATH "$PATH":"$JAVA_HOME"/bin
RUN yum install -y java-"$JAVA_VER"-openjdk-devel && \
yum clean all

# Hadoop
ARG HADOOP_VER=3.1.0
@@ -24,32 +26,32 @@ ENV HDFS_DATANODE_USER root
ENV HDFS_SECONDARYNAMENODE_USER root
ENV YARN_RESOURCEMANAGER_USER root
ENV YARN_NODEMANAGER_USER root
ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
ENV HADOOP_YARN_HOME ${HADOOP_HOME}
ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop
ENV PATH ${PATH}:${HADOOP_HOME}/bin
RUN curl -sk https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VER}/hadoop-${HADOOP_VER}.tar.gz \
ENV HADOOP_COMMON_HOME "$HADOOP_HOME"
ENV HADOOP_HDFS_HOME "$HADOOP_HOME"
ENV HADOOP_MAPRED_HOME "$HADOOP_HOME"
ENV HADOOP_YARN_HOME "$HADOOP_HOME"
ENV HADOOP_CONF_DIR "$HADOOP_HOME"/etc/hadoop
ENV PATH "$PATH":"$HADOOP_HOME"/bin
RUN curl -sk https://archive.apache.org/dist/hadoop/common/hadoop-"$HADOOP_VER"/hadoop-"$HADOOP_VER".tar.gz \
| tar -xz -C /usr/local/ && \
cd /usr/local && \
ln -s ./hadoop-${HADOOP_VER} hadoop && \
echo "JAVA_HOME=${JAVA_HOME}" >> ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh && \
echo "HADOOP_HOME=${HADOOP_HOME}" >> ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
ln -s ./hadoop-"$HADOOP_VER" hadoop && \
echo "JAVA_HOME=${JAVA_HOME}" >> "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh && \
echo "HADOOP_HOME=${HADOOP_HOME}" >> "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh

# Copy config files. Initially run in pseudo-distributed mode
# for namenode setup/formatting
RUN mkdir ${HADOOP_HOME}/input && \
cp ${HADOOP_HOME}/etc/hadoop/*.xml ${HADOOP_HOME}/input
COPY docker-config/site/ ${HADOOP_HOME}/etc/hadoop/
RUN mkdir "$HADOOP_HOME"/input && \
cp "$HADOOP_HOME"/etc/hadoop/*.xml "$HADOOP_HOME"/input
COPY docker-config/site/ "$HADOOP_HOME"/etc/hadoop/
RUN sed s/HOSTNAME/localhost/ \
${HADOOP_HOME}/etc/hadoop/core-site.xml.template \
> ${HADOOP_HOME}/etc/hadoop/core-site.xml && \
"$HADOOP_HOME"/etc/hadoop/core-site.xml.template \
> "$HADOOP_HOME"/etc/hadoop/core-site.xml && \
sed s/HOSTNAME/localhost/ \
${HADOOP_HOME}/etc/hadoop/yarn-site.xml.template \
"$HADOOP_HOME"/etc/hadoop/yarn-site.xml.template \
> /usr/local/hadoop/etc/hadoop/yarn-site.xml && \
sed s/HOSTNAME/localhost/ \
${HADOOP_HOME}/etc/hadoop/mapred-site.xml.template \
"$HADOOP_HOME"/etc/hadoop/mapred-site.xml.template \
> /usr/local/hadoop/etc/hadoop/mapred-site.xml

# ssh setup for node comms
@@ -84,38 +86,38 @@ FROM hadoop AS sqoop
ARG SQOOP_VER=1.4.7
ARG SQOOP_HADOOP_VER=2.6.0
ENV SQOOP_HOME /usr/local/sqoop
ENV SQOOP_CONF_DIR ${SQOOP_HOME}/conf
ENV PATH ${PATH}:${HADOOP_HOME}/bin:${SQOOP_HOME}/bin
RUN curl -s http://archive.apache.org/dist/sqoop/${SQOOP_VER}/sqoop-${SQOOP_VER}.bin__hadoop-${SQOOP_HADOOP_VER}.tar.gz \
ENV SQOOP_CONF_DIR "$SQOOP_HOME"/conf
ENV PATH "$PATH":"$HADOOP_HOME"/bin:"$SQOOP_HOME"/bin
RUN curl -s http://archive.apache.org/dist/sqoop/"$SQOOP_VER"/sqoop-"$SQOOP_VER".bin__hadoop-"$SQOOP_HADOOP_VER".tar.gz \
| tar -xz -C /usr/local && \
ln -s /usr/local/sqoop-${SQOOP_VER}.bin__hadoop-${SQOOP_HADOOP_VER} ${SQOOP_HOME}
ln -s /usr/local/sqoop-"$SQOOP_VER".bin__hadoop-"$SQOOP_HADOOP_VER" "$SQOOP_HOME"

# Make dirs for class files and jars
RUN mkdir -p /tmp/bindir /tmp/target && \
chmod +x ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh && \
chown root:root ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
chmod +x "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh && \
chown root:root "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh

# Download and install Apache Hive
ARG HIVE_VER=3.1.2
ENV HIVE_HOME /usr/local/hive/apache-hive-${HIVE_VER}-bin
ENV HIVE_CONF_DIR ${HIVE_HOME}/conf
ENV HCAT_HOME ${HIVE_HOME}/hcatalog
ENV PATH ${PATH}:${HIVE_HOME}/bin
ENV PATH ${PATH}:${HCAT_HOME}/bin
ENV HIVE_HOME /usr/local/hive/apache-hive-"$HIVE_VER"-bin
ENV HIVE_CONF_DIR "$HIVE_HOME"/conf
ENV HCAT_HOME "$HIVE_HOME"/hcatalog
ENV PATH "$PATH":"$HIVE_HOME"/bin
ENV PATH "$PATH":"$HCAT_HOME"/bin
RUN mkdir -p /usr/local/hive && \
curl -s https://dlcdn.apache.org/hive/hive-${HIVE_VER}/apache-hive-${HIVE_VER}-bin.tar.gz \
curl -s https://dlcdn.apache.org/hive/hive-"$HIVE_VER"/apache-hive-"$HIVE_VER"-bin.tar.gz \
| tar -xz -C /usr/local/hive && \
rm ${HIVE_HOME}/lib/log4j-slf4j-impl-2.10.0.jar
COPY docker-config/hive/hive-site.xml ${HIVE_CONF_DIR}
rm "$HIVE_HOME"/lib/log4j-slf4j-impl-2.10.0.jar
COPY docker-config/hive/hive-site.xml "$HIVE_CONF_DIR"

# Install the postgresql connector driver
ARG PSQL_JDBC_VER=42.2.24
RUN curl -o /usr/share/java/postgresql-jdbc.jar https://jdbc.postgresql.org/download/postgresql-${PSQL_JDBC_VER}.jar && \
RUN curl -o /usr/share/java/postgresql-jdbc.jar https://jdbc.postgresql.org/download/postgresql-"$PSQL_JDBC_VER".jar && \
chmod 644 /usr/share/java/postgresql-jdbc.jar && \
ln -s /usr/share/java/postgresql-jdbc.jar ${HIVE_HOME}/lib/postgresql-jdbc.jar
ln -s /usr/share/java/postgresql-jdbc.jar "$HIVE_HOME"/lib/postgresql-jdbc.jar

# Entrypoint/startup for sqoop and hive
ADD https://repo1.maven.org/maven2/org/json/json/20090211/json-20090211.jar ${SQOOP_HOME}/lib
ADD https://repo1.maven.org/maven2/org/json/json/20090211/json-20090211.jar "$SQOOP_HOME"/lib
COPY docker-config/bootstrap.sh docker-config/entrypoint.sh /etc/docker-config/
RUN chown -R root:root /etc/docker-config/ && \
chmod -R 755 /etc/docker-config/ && \
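
One linting-driven change in the Dockerfile above is the new SHELL ["/bin/bash", "-euo", "pipefail", "-c"] line. Without pipefail, a RUN step built around a pipeline such as curl ... | tar -xz only reports the exit status of the last command, so a failed download can slip through silently. A minimal standalone bash illustration of the behavior (not taken from the repo):

#!/bin/bash
# By default a pipeline's exit status is that of its last command,
# so the failure of `false` is hidden here.
false | true
echo "without pipefail: $?"   # prints 0

# With pipefail, any failing command fails the whole pipeline.
set -o pipefail
false | true
echo "with pipefail: $?"      # prints 1
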
24 changes: 12 additions & 12 deletions docker-config/bootstrap.sh
@@ -1,40 +1,40 @@
#!/bin/bash
: ${HADOOP_HOME:=/usr/local/hadoop}
$HADOOP_HOME/etc/hadoop/hadoop-env.sh
: "$HADOOP_HOME:=/usr/local/hadoop"
"$HADOOP_HOME"/etc/hadoop/hadoop-env.sh

# Remove annoying warnings about dependencies
export ACCUMULO_HOME=/tmp
export ZOOKEEPER_HOME=/tmp
export HBASE_HOME=/tmp

# Replace namenode hostname with docker-compose hostname
sed s/HOSTNAME/$MASTER_HOSTNAME/ $HADOOP_HOME/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
sed s/HOSTNAME/$MASTER_HOSTNAME/ $HADOOP_HOME/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml
sed s/HOSTNAME/$MASTER_HOSTNAME/ $HADOOP_HOME/etc/hadoop/mapred-site.xml.template > /usr/local/hadoop/etc/hadoop/mapred-site.xml
sed s/HOSTNAME/"$MASTER_HOSTNAME"/ "$HADOOP_HOME"/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
sed s/HOSTNAME/"$MASTER_HOSTNAME"/ "$HADOOP_HOME"/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml
sed s/HOSTNAME/"$MASTER_HOSTNAME"/ "$HADOOP_HOME"/etc/hadoop/mapred-site.xml.template > /usr/local/hadoop/etc/hadoop/mapred-site.xml

# Don't start hadoop for metastore server
if [[ -z "$SKIP_BOOTSTRAP" ]]; then
# Clear HDFS datanode storage location
rm -rf /tmp/hadoop/dataNode/*

# Remove ssh keys from data nodes
if [[ $HOSTNAME != $MASTER_HOSTNAME ]]; then
if [[ "$HOSTNAME" != "$MASTER_HOSTNAME" ]]; then
rm /root/.ssh/id_ed25519
fi

# Add node hostnames to workers file
echo $MASTER_HOSTNAME > $HADOOP_HOME/etc/hadoop/workers
echo $NODE_HOSTNAMES | tr -s ' ' '\n' >> $HADOOP_HOME/etc/hadoop/workers
echo "$MASTER_HOSTNAME" > "$HADOOP_HOME"/etc/hadoop/workers
echo "$NODE_HOSTNAMES" | tr -s ' ' '\n' >> "$HADOOP_HOME"/etc/hadoop/workers

# Start hadoop
/usr/sbin/sshd
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/bin/mapred --daemon start historyserver
"$HADOOP_HOME"/sbin/start-dfs.sh
"$HADOOP_HOME"/sbin/start-yarn.sh
"$HADOOP_HOME"/bin/mapred --daemon start historyserver
hdfs dfs -mkdir -p /user/root/

# Grab JDBC drivers from mounted volume
cp /jdbc/* $SQOOP_HOME/lib
cp /jdbc/* "$SQOOP_HOME"/lib
else
echo "Skipping hadoop bootstrap..."
fi
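
The first line of bootstrap.sh relies on bash's default-assignment expansion, which shellcheck (SC2223) asks to be quoted while keeping the braces: : "${HADOOP_HOME:=/usr/local/hadoop}". A small standalone illustration of the idiom (not from the repo):

#!/bin/bash
# ${VAR:=default} assigns the default only when VAR is unset or empty;
# the leading ':' is a no-op command whose sole purpose is to trigger the expansion.
unset HADOOP_HOME
: "${HADOOP_HOME:=/usr/local/hadoop}"
echo "$HADOOP_HOME"   # /usr/local/hadoop

HADOOP_HOME=/opt/hadoop
: "${HADOOP_HOME:=/usr/local/hadoop}"
echo "$HADOOP_HOME"   # /opt/hadoop, the existing value is kept
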
2 changes: 1 addition & 1 deletion docker-config/hive/init-hive-metastore.sh
@@ -1,7 +1,7 @@
#!/bin/bash
set -e

psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<- EOSQL
CREATE USER hive WITH PASSWORD 'hive';
CREATE DATABASE iasworld;
ALTER DATABASE iasworld OWNER TO hive;
6 changes: 3 additions & 3 deletions docker-config/init.sh
@@ -1,5 +1,5 @@
#!/bin/bash
/usr/sbin/sshd
$HADOOP_HOME/bin/hdfs namenode -format
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hive/warehouse/iasworld.db
"$HADOOP_HOME"/bin/hdfs namenode -format
"$HADOOP_HOME"/sbin/start-dfs.sh
"$HADOOP_HOME"/bin/hdfs dfs -mkdir -p /user/hive/warehouse/iasworld.db
79 changes: 39 additions & 40 deletions run.sh
@@ -1,5 +1,5 @@
#!/bin/bash
START_TIME=`date +%s`
START_TIME=$(date +%s)

# Env variables controlling where sqooped data is exported to
TEMP_LOG_FILE="logs/temp-sqoop-log"
@@ -13,7 +13,7 @@ LOG_GROUP_NAME="/ccao/jobs/sqoop"
--remove-orphans \
--no-color \
| ts '%.s' \
| tee ${TEMP_LOG_FILE}
| tee "$TEMP_LOG_FILE"

# Cleanup after docker run
/usr/local/bin/docker-compose rm -f -s -v
@@ -23,104 +23,103 @@ LOG_GROUP_NAME="/ccao/jobs/sqoop"
find target/ -maxdepth 1 -type d -empty -delete
TABLES_EXTRACTED=$(ls target/)
for TABLE in ${TABLES_EXTRACTED}; do
SUB_DIRS=$(find target/${TABLE}/* -maxdepth 1 -type d -exec basename {} \;)
if [[ ! -z $SUB_DIRS && $(echo ${SUB_DIRS} | wc -l ) -gt 0 ]]; then
SUB_DIRS=$(find target/"$TABLE"/* -maxdepth 1 -type d -exec basename {} \;)
if [[ -n "$SUB_DIRS" && $(echo "$SUB_DIRS" | wc -l) -gt 0 ]]; then
for dir in ${SUB_DIRS}; do
/usr/bin/aws s3 rm \
${BUCKET_URI}/iasworld/${TABLE}/${dir} \
"$BUCKET_URI"/iasworld/"$TABLE"/"$dir" \
--exclude "*" \
--include "*.parquet" \
--recursive \
| ts '%.s' \
| tee -a ${TEMP_LOG_FILE}
| tee -a "$TEMP_LOG_FILE"
done
else
/usr/bin/aws s3 rm \
${BUCKET_URI}/iasworld/${TABLE} \
"$BUCKET_URI"/iasworld/"$TABLE" \
--exclude "*" \
--include "*.parquet" \
--recursive \
| ts '%.s' \
| tee -a ${TEMP_LOG_FILE}
| tee -a "$TEMP_LOG_FILE"
fi
done

# Upload pulled files from local target/ dir to S3
/usr/bin/aws s3 mv \
target/ \
${BUCKET_URI}/iasworld \
"$BUCKET_URI"/iasworld \
--exclude "*" \
--include "*.parquet" \
--recursive \
--no-progress \
| ts '%.s' \
| tee -a ${TEMP_LOG_FILE}
| tee -a "$TEMP_LOG_FILE"

# Delete any remaining empty dirs
find target/ -type d -empty -delete

# Print overall runtime stats and tables extracted
END_TIME=`date +%s`
RUNTIME=$((END_TIME-START_TIME))
END_TIME=$(date +%s)
RUNTIME=$((END_TIME - START_TIME))
HH=$((RUNTIME / 3600))
MM=$(( (RUNTIME % 3600) / 60 ))
SS=$(( (RUNTIME % 3600) % 60 ))
MM=$(((RUNTIME % 3600) / 60))
SS=$(((RUNTIME % 3600) % 60))
echo "Tables extracted: ${TABLES_EXTRACTED}" \
| ts '%.s' \
| tee -a ${TEMP_LOG_FILE}
| tee -a "$TEMP_LOG_FILE"
echo "Total extraction time: ${HH}:${MM}:${SS} (hh:mm:ss)" \
| ts '%.s' \
| tee -a ${TEMP_LOG_FILE}
| tee -a "$TEMP_LOG_FILE"

# Bash function to convert text log to JSON consumable by CloudWatch
parse_logs () {
cat $1 \
| sed 's/ /|/' \
| sed 's/\([0-9]\.[0-9]\{3\}\)[0-9]\{1,\}/\1/' \
| sed 's/\.//' \
| awk '{print $1, $NF}' OFS='|' FS='|' \
| jq -Rn '[inputs
parse_logs() {
sed 's/ /|/' "$1" \
| sed 's/\([0-9]\.[0-9]\{3\}\)[0-9]\{1,\}/\1/' \
| sed 's/\.//' \
| awk '{print $1, $NF}' OFS='|' FS='|' \
| jq -Rn '[inputs
| . / "\n"
| (.[] | select(length > 0) | . / "|") as $input
| {"timestamp": $input[0]|tonumber, "message": $input[1]}]' \
> $2
> "$2"
}

# Create log stream in CloudWatch with today's date
LOG_STREAM_NAME="sqoop-job-log-$(date -u +'%Y-%m-%d_%H-%M-%S')"
/usr/bin/aws logs create-log-stream \
--log-group-name ${LOG_GROUP_NAME} \
--log-stream-name ${LOG_STREAM_NAME}
--log-group-name "$LOG_GROUP_NAME" \
--log-stream-name "$LOG_STREAM_NAME"

# Convert text output from Docker and AWS CLI to clean JSON
# for upload to AWS CloudWatch. Logs need to be split into chunks for upload
# First loop skips using sequence token necessary for subsequent uploads
COUNTER=1
split -l 4000 ${TEMP_LOG_FILE} logs/temp-sqoop-log-
split -l 4000 "$TEMP_LOG_FILE" logs/temp-sqoop-log-
for f in logs/temp-sqoop-log-*; do
parse_logs ${f} ${f}.json
if [[ ${COUNTER} -eq 1 ]]; then
parse_logs "$f" "$f".json
if [[ "$COUNTER" -eq 1 ]]; then
SEQ_TOKEN=$(
/usr/bin/aws logs put-log-events \
--log-group-name ${LOG_GROUP_NAME} \
--log-stream-name ${LOG_STREAM_NAME} \
--log-events file://${f}.json \
| jq -r .nextSequenceToken
--log-group-name "$LOG_GROUP_NAME" \
--log-stream-name "$LOG_STREAM_NAME" \
--log-events file://"$f".json \
| jq -r .nextSequenceToken
)
else
SEQ_TOKEN=$(
/usr/bin/aws logs put-log-events \
--log-group-name ${LOG_GROUP_NAME} \
--log-stream-name ${LOG_STREAM_NAME} \
--log-events file://${f}.json \
--sequence-token ${SEQ_TOKEN} \
| jq -r .nextSequenceToken
--log-group-name "$LOG_GROUP_NAME" \
--log-stream-name "$LOG_STREAM_NAME" \
--log-events file://"$f".json \
--sequence-token "$SEQ_TOKEN" \
| jq -r .nextSequenceToken
)
fi
COUNTER=$((COUNTER + 1))
done
echo "Logs successfully uploaded to CloudWatch"

# Remove uploaded log files
mv ${TEMP_LOG_FILE} ${BACKUP_LOG_FILE}
rm ${TEMP_LOG_FILE}*
mv ./"$TEMP_LOG_FILE" ./"$BACKUP_LOG_FILE"
if [[ -n "$TEMP_LOG_FILE" ]]; then rm ./"$TEMP_LOG_FILE"*; fi
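
For context on the upload loop in run.sh: aws logs put-log-events takes --log-events as a JSON array of {timestamp, message} objects with millisecond timestamps, and (in the CLI version this script targets) every upload after the first must pass the nextSequenceToken returned by the previous call — which is why the log is split into 4000-line chunks and the first iteration is handled separately. A made-up example of one chunk file as parse_logs would emit it (the filename follows the split/parse_logs naming in the script; the messages and table names are invented for illustration):

# One chunk file, as parse_logs "$f" "$f".json would write it
# (timestamps are milliseconds derived from ts '%.s'; messages invented):
cat > logs/temp-sqoop-log-aa.json <<'EOF'
[
  {"timestamp": 1692625532123, "message": "Tables extracted: addn aprval"},
  {"timestamp": 1692625533456, "message": "Total extraction time: 1:12:33 (hh:mm:ss)"}
]
EOF
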
