From 980d904535c6f3962c54f67cbc804895559461e2 Mon Sep 17 00:00:00 2001
From: Dan Snow <31494343+dfsnow@users.noreply.github.com>
Date: Mon, 21 Aug 2023 16:35:53 -0500
Subject: [PATCH] Cleanup shell scripts and Dockerfile (#4)

* Fix shellcheck errors

* Cleanup all shell scripts with shfmt and shellcheck

* Lint Dockerfile
---
 Dockerfile                                | 76 +++++++++----------
 docker-config/bootstrap.sh                | 24 +++---
 docker-config/hive/init-hive-metastore.sh |  2 +-
 docker-config/init.sh                     |  6 +-
 run.sh                                    | 79 ++++++++++----------
 scripts/get-tables.sh                     | 25 ++++---
 scripts/run-sqoop.sh                      | 90 ++++++++++++-----------
 tables/update-tables.sh                   | 78 ++++++++++----------
 8 files changed, 192 insertions(+), 188 deletions(-)
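The cleanup can be reproduced locally with the two tools named in the commit message. A minimal sketch; the exact flag set is an assumption, not something recorded in this patch:

    # Lint every shell script touched by this patch (flags assumed, not taken from the repo's CI)
    shellcheck run.sh docker-config/*.sh docker-config/hive/*.sh scripts/*.sh tables/*.sh
    # Check formatting with shfmt; -d prints a diff, -w would rewrite files in place
    shfmt -i 4 -bn -d .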
diff --git a/Dockerfile b/Dockerfile
index b40f456..db46729 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,7 @@
 
 # Fresh base image
 FROM centos:7 AS hadoop
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
 # Development tools
 RUN yum install -y curl which tar bunzip2 openssh-server openssh-clients rsync && \
@@ -12,9 +13,10 @@ RUN yum install -y curl which tar bunzip2 openssh-server openssh-clients rsync &
 
 # JDK8
 ARG JAVA_VER=1.8.0
-ENV JAVA_HOME /usr/lib/jvm/java-${JAVA_VER}-openjdk
-ENV PATH ${PATH}:${JAVA_HOME}/bin
-RUN yum install -y java-${JAVA_VER}-openjdk-devel
+ENV JAVA_HOME /usr/lib/jvm/java-"$JAVA_VER"-openjdk
+ENV PATH "$PATH":"$JAVA_HOME"/bin
+RUN yum install -y java-"$JAVA_VER"-openjdk-devel && \
+    yum clean all
 
 # Hadoop
 ARG HADOOP_VER=3.1.0
@@ -24,32 +26,32 @@ ENV HDFS_DATANODE_USER root
 ENV HDFS_SECONDARYNAMENODE_USER root
 ENV YARN_RESOURCEMANAGER_USER root
 ENV YARN_NODEMANAGER_USER root
-ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
-ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
-ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
-ENV HADOOP_YARN_HOME ${HADOOP_HOME}
-ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop
-ENV PATH ${PATH}:${HADOOP_HOME}/bin
-RUN curl -sk https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VER}/hadoop-${HADOOP_VER}.tar.gz \
+ENV HADOOP_COMMON_HOME "$HADOOP_HOME"
+ENV HADOOP_HDFS_HOME "$HADOOP_HOME"
+ENV HADOOP_MAPRED_HOME "$HADOOP_HOME"
+ENV HADOOP_YARN_HOME "$HADOOP_HOME"
+ENV HADOOP_CONF_DIR "$HADOOP_HOME"/etc/hadoop
+ENV PATH "$PATH":"$HADOOP_HOME"/bin
+RUN curl -sk https://archive.apache.org/dist/hadoop/common/hadoop-"$HADOOP_VER"/hadoop-"$HADOOP_VER".tar.gz \
     | tar -xz -C /usr/local/ && \
     cd /usr/local && \
-    ln -s ./hadoop-${HADOOP_VER} hadoop && \
-    echo "JAVA_HOME=${JAVA_HOME}" >> ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh && \
-    echo "HADOOP_HOME=${HADOOP_HOME}" >> ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
+    ln -s ./hadoop-"$HADOOP_VER" hadoop && \
+    echo "JAVA_HOME=${JAVA_HOME}" >> "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh && \
+    echo "HADOOP_HOME=${HADOOP_HOME}" >> "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh
 
 # Copy config files. Initially run in pseudo-distributed mode
 # for namenode setup/formatting
-RUN mkdir ${HADOOP_HOME}/input && \
-    cp ${HADOOP_HOME}/etc/hadoop/*.xml ${HADOOP_HOME}/input
-COPY docker-config/site/ ${HADOOP_HOME}/etc/hadoop/
+RUN mkdir "$HADOOP_HOME"/input && \
+    cp "$HADOOP_HOME"/etc/hadoop/*.xml "$HADOOP_HOME"/input
+COPY docker-config/site/ "$HADOOP_HOME"/etc/hadoop/
 RUN sed s/HOSTNAME/localhost/ \
-    ${HADOOP_HOME}/etc/hadoop/core-site.xml.template \
-    > ${HADOOP_HOME}/etc/hadoop/core-site.xml && \
+    "$HADOOP_HOME"/etc/hadoop/core-site.xml.template \
+    > "$HADOOP_HOME"/etc/hadoop/core-site.xml && \
     sed s/HOSTNAME/localhost/ \
-    ${HADOOP_HOME}/etc/hadoop/yarn-site.xml.template \
+    "$HADOOP_HOME"/etc/hadoop/yarn-site.xml.template \
     > /usr/local/hadoop/etc/hadoop/yarn-site.xml && \
     sed s/HOSTNAME/localhost/ \
-    ${HADOOP_HOME}/etc/hadoop/mapred-site.xml.template \
+    "$HADOOP_HOME"/etc/hadoop/mapred-site.xml.template \
     > /usr/local/hadoop/etc/hadoop/mapred-site.xml
 
 # ssh setup for node comms
@@ -84,38 +86,38 @@ FROM hadoop AS sqoop
 ARG SQOOP_VER=1.4.7
 ARG SQOOP_HADOOP_VER=2.6.0
 ENV SQOOP_HOME /usr/local/sqoop
-ENV SQOOP_CONF_DIR ${SQOOP_HOME}/conf
-ENV PATH ${PATH}:${HADOOP_HOME}/bin:${SQOOP_HOME}/bin
-RUN curl -s http://archive.apache.org/dist/sqoop/${SQOOP_VER}/sqoop-${SQOOP_VER}.bin__hadoop-${SQOOP_HADOOP_VER}.tar.gz \
+ENV SQOOP_CONF_DIR "$SQOOP_HOME"/conf
+ENV PATH "$PATH":"$HADOOP_HOME"/bin:"$SQOOP_HOME"/bin
+RUN curl -s http://archive.apache.org/dist/sqoop/"$SQOOP_VER"/sqoop-"$SQOOP_VER".bin__hadoop-"$SQOOP_HADOOP_VER".tar.gz \
     | tar -xz -C /usr/local && \
-    ln -s /usr/local/sqoop-${SQOOP_VER}.bin__hadoop-${SQOOP_HADOOP_VER} ${SQOOP_HOME}
+    ln -s /usr/local/sqoop-"$SQOOP_VER".bin__hadoop-"$SQOOP_HADOOP_VER" "$SQOOP_HOME"
 
 # Make dirs for class files and jars
 RUN mkdir -p /tmp/bindir /tmp/target && \
-    chmod +x ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh && \
-    chown root:root ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
+    chmod +x "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh && \
+    chown root:root "$HADOOP_HOME"/etc/hadoop/hadoop-env.sh
 
 # Download and install Apache Hive
 ARG HIVE_VER=3.1.2
-ENV HIVE_HOME /usr/local/hive/apache-hive-${HIVE_VER}-bin
-ENV HIVE_CONF_DIR ${HIVE_HOME}/conf
-ENV HCAT_HOME ${HIVE_HOME}/hcatalog
-ENV PATH ${PATH}:${HIVE_HOME}/bin
-ENV PATH ${PATH}:${HCAT_HOME}/bin
+ENV HIVE_HOME /usr/local/hive/apache-hive-"$HIVE_VER"-bin
+ENV HIVE_CONF_DIR "$HIVE_HOME"/conf
+ENV HCAT_HOME "$HIVE_HOME"/hcatalog
+ENV PATH "$PATH":"$HIVE_HOME"/bin
+ENV PATH "$PATH":"$HCAT_HOME"/bin
 RUN mkdir -p /usr/local/hive && \
-    curl -s https://dlcdn.apache.org/hive/hive-${HIVE_VER}/apache-hive-${HIVE_VER}-bin.tar.gz \
+    curl -s https://dlcdn.apache.org/hive/hive-"$HIVE_VER"/apache-hive-"$HIVE_VER"-bin.tar.gz \
     | tar -xz -C /usr/local/hive && \
-    rm ${HIVE_HOME}/lib/log4j-slf4j-impl-2.10.0.jar
-COPY docker-config/hive/hive-site.xml ${HIVE_CONF_DIR}
+    rm "$HIVE_HOME"/lib/log4j-slf4j-impl-2.10.0.jar
+COPY docker-config/hive/hive-site.xml "$HIVE_CONF_DIR"
 
 # Install the postgresql connector driver
 ARG PSQL_JDBC_VER=42.2.24
-RUN curl -o /usr/share/java/postgresql-jdbc.jar https://jdbc.postgresql.org/download/postgresql-${PSQL_JDBC_VER}.jar && \
+RUN curl -o /usr/share/java/postgresql-jdbc.jar https://jdbc.postgresql.org/download/postgresql-"$PSQL_JDBC_VER".jar && \
     chmod 644 /usr/share/java/postgresql-jdbc.jar && \
-    ln -s /usr/share/java/postgresql-jdbc.jar ${HIVE_HOME}/lib/postgresql-jdbc.jar
+    ln -s /usr/share/java/postgresql-jdbc.jar "$HIVE_HOME"/lib/postgresql-jdbc.jar
 
 # Entrypoint/startup for sqoop and hive
-ADD https://repo1.maven.org/maven2/org/json/json/20090211/json-20090211.jar ${SQOOP_HOME}/lib
+ADD https://repo1.maven.org/maven2/org/json/json/20090211/json-20090211.jar "$SQOOP_HOME"/lib
 COPY docker-config/bootstrap.sh docker-config/entrypoint.sh /etc/docker-config/
 RUN chown -R root:root /etc/docker-config/ && \
     chmod -R 755 /etc/docker-config/ && \
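A note on the added SHELL instruction: every RUN step now executes under bash with -e, -u, and -o pipefail, so a failure early in a curl | tar pipeline fails the build instead of being masked by the last command's exit status. A minimal illustration of the pipefail behavior (plain bash, not part of the image):

    set -euo pipefail        # same options the SHELL instruction passes to every RUN step
    false | echo "unpacked"  # without pipefail this pipeline exits 0; with it, the script aborts after this line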
diff --git a/docker-config/bootstrap.sh b/docker-config/bootstrap.sh
index 1b1dec1..6dafc72 100644
--- a/docker-config/bootstrap.sh
+++ b/docker-config/bootstrap.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-: ${HADOOP_HOME:=/usr/local/hadoop}
-$HADOOP_HOME/etc/hadoop/hadoop-env.sh
+: "${HADOOP_HOME:=/usr/local/hadoop}"
+"$HADOOP_HOME"/etc/hadoop/hadoop-env.sh
 
 # Remove annoying warnings about dependencies
 export ACCUMULO_HOME=/tmp
@@ -8,9 +8,9 @@ export ZOOKEEPER_HOME=/tmp
 export HBASE_HOME=/tmp
 
 # Replace namenode hostname with docker-compose hostname
-sed s/HOSTNAME/$MASTER_HOSTNAME/ $HADOOP_HOME/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
-sed s/HOSTNAME/$MASTER_HOSTNAME/ $HADOOP_HOME/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml
-sed s/HOSTNAME/$MASTER_HOSTNAME/ $HADOOP_HOME/etc/hadoop/mapred-site.xml.template > /usr/local/hadoop/etc/hadoop/mapred-site.xml
+sed s/HOSTNAME/"$MASTER_HOSTNAME"/ "$HADOOP_HOME"/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
+sed s/HOSTNAME/"$MASTER_HOSTNAME"/ "$HADOOP_HOME"/etc/hadoop/yarn-site.xml.template > /usr/local/hadoop/etc/hadoop/yarn-site.xml
+sed s/HOSTNAME/"$MASTER_HOSTNAME"/ "$HADOOP_HOME"/etc/hadoop/mapred-site.xml.template > /usr/local/hadoop/etc/hadoop/mapred-site.xml
 
 # Don't start hadoop for metastore server
 if [[ -z "$SKIP_BOOTSTRAP" ]]; then
@@ -18,23 +18,23 @@ if [[ -z "$SKIP_BOOTSTRAP" ]]; then
     rm -rf /tmp/hadoop/dataNode/*
 
     # Remove ssh keys from data nodes
-    if [[ $HOSTNAME != $MASTER_HOSTNAME ]]; then
+    if [[ "$HOSTNAME" != "$MASTER_HOSTNAME" ]]; then
         rm /root/.ssh/id_ed25519
     fi
 
     # Add node hostnames to workers file
-    echo $MASTER_HOSTNAME > $HADOOP_HOME/etc/hadoop/workers
-    echo $NODE_HOSTNAMES | tr -s ' ' '\n' >> $HADOOP_HOME/etc/hadoop/workers
+    echo "$MASTER_HOSTNAME" > "$HADOOP_HOME"/etc/hadoop/workers
+    echo "$NODE_HOSTNAMES" | tr -s ' ' '\n' >> "$HADOOP_HOME"/etc/hadoop/workers
 
     # Start hadoop
     /usr/sbin/sshd
-    $HADOOP_HOME/sbin/start-dfs.sh
-    $HADOOP_HOME/sbin/start-yarn.sh
-    $HADOOP_HOME/bin/mapred --daemon start historyserver
+    "$HADOOP_HOME"/sbin/start-dfs.sh
+    "$HADOOP_HOME"/sbin/start-yarn.sh
+    "$HADOOP_HOME"/bin/mapred --daemon start historyserver
     hdfs dfs -mkdir -p /user/root/
 
     # Grab JDBC drivers from mounted volume
-    cp /jdbc/* $SQOOP_HOME/lib
+    cp /jdbc/* "$SQOOP_HOME"/lib
 else
     echo "Skipping hadoop bootstrap..."
 fi
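The first changed line in bootstrap.sh relies on the shell's default-assignment expansion, and the braces are what make the assignment happen; a bare quoted $HADOOP_HOME followed by :=... would be expanded as plain text instead. A minimal sketch of the idiom:

    # ':' is the no-op builtin, so this line exists only for its side effect:
    # assign a default when HADOOP_HOME is unset or empty, otherwise keep the current value
    : "${HADOOP_HOME:=/usr/local/hadoop}"
    echo "$HADOOP_HOME"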
diff --git a/docker-config/hive/init-hive-metastore.sh b/docker-config/hive/init-hive-metastore.sh
index 0a3f350..6524247 100644
--- a/docker-config/hive/init-hive-metastore.sh
+++ b/docker-config/hive/init-hive-metastore.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e
 
-psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL
+psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<- EOSQL
     CREATE USER hive WITH PASSWORD 'hive';
     CREATE DATABASE iasworld;
     ALTER DATABASE iasworld OWNER TO hive;
diff --git a/docker-config/init.sh b/docker-config/init.sh
index f63c500..9fe0c20 100644
--- a/docker-config/init.sh
+++ b/docker-config/init.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 /usr/sbin/sshd
-$HADOOP_HOME/bin/hdfs namenode -format
-$HADOOP_HOME/sbin/start-dfs.sh
-$HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hive/warehouse/iasworld.db
+"$HADOOP_HOME"/bin/hdfs namenode -format
+"$HADOOP_HOME"/sbin/start-dfs.sh
+"$HADOOP_HOME"/bin/hdfs dfs -mkdir -p /user/hive/warehouse/iasworld.db
diff --git a/run.sh b/run.sh
index 0f5dfb9..f196ef0 100755
--- a/run.sh
+++ b/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-START_TIME=`date +%s`
+START_TIME=$(date +%s)
 
 # Env variables controlling where sqooped data is exported to
 TEMP_LOG_FILE="logs/temp-sqoop-log"
@@ -13,7 +13,7 @@ LOG_GROUP_NAME="/ccao/jobs/sqoop"
     --remove-orphans \
     --no-color \
     | ts '%.s' \
-    | tee ${TEMP_LOG_FILE}
+    | tee "$TEMP_LOG_FILE"
 
 # Cleanup after docker run
 /usr/local/bin/docker-compose rm -f -s -v
@@ -23,98 +23,97 @@ LOG_GROUP_NAME="/ccao/jobs/sqoop"
 find target/ -maxdepth 1 -type d -empty -delete
 TABLES_EXTRACTED=$(ls target/)
 for TABLE in ${TABLES_EXTRACTED}; do
-    SUB_DIRS=$(find target/${TABLE}/* -maxdepth 1 -type d -exec basename {} \;)
-    if [[ ! -z $SUB_DIRS && $(echo ${SUB_DIRS} | wc -l ) -gt 0 ]]; then
+    SUB_DIRS=$(find target/"$TABLE"/* -maxdepth 1 -type d -exec basename {} \;)
+    if [[ -n "$SUB_DIRS" && $(echo "$SUB_DIRS" | wc -l) -gt 0 ]]; then
         for dir in ${SUB_DIRS}; do
             /usr/bin/aws s3 rm \
-                ${BUCKET_URI}/iasworld/${TABLE}/${dir} \
+                "$BUCKET_URI"/iasworld/"$TABLE"/"$dir" \
                 --exclude "*" \
                 --include "*.parquet" \
                 --recursive \
                 | ts '%.s' \
-                | tee -a ${TEMP_LOG_FILE}
+                | tee -a "$TEMP_LOG_FILE"
         done
     else
         /usr/bin/aws s3 rm \
-            ${BUCKET_URI}/iasworld/${TABLE} \
+            "$BUCKET_URI"/iasworld/"$TABLE" \
            --exclude "*" \
            --include "*.parquet" \
            --recursive \
            | ts '%.s' \
-            | tee -a ${TEMP_LOG_FILE}
+            | tee -a "$TEMP_LOG_FILE"
     fi
 done
 
 # Uploaded pulled files from local target/ dir to S3
 /usr/bin/aws s3 mv \
     target/ \
-    ${BUCKET_URI}/iasworld \
+    "$BUCKET_URI"/iasworld \
     --exclude "*" \
     --include "*.parquet" \
     --recursive \
     --no-progress \
     | ts '%.s' \
-    | tee -a ${TEMP_LOG_FILE}
+    | tee -a "$TEMP_LOG_FILE"
 
 # Delete any remaining empty dirs
 find target/ -type d -empty -delete
 
 # Print overall runtime stats and tables extracted
-END_TIME=`date +%s`
-RUNTIME=$((END_TIME-START_TIME))
+END_TIME=$(date +%s)
+RUNTIME=$((END_TIME - START_TIME))
 HH=$((RUNTIME / 3600))
-MM=$(( (RUNTIME % 3600) / 60 ))
-SS=$(( (RUNTIME % 3600) % 60 ))
+MM=$(((RUNTIME % 3600) / 60))
+SS=$(((RUNTIME % 3600) % 60))
 echo "Tables extracted: ${TABLES_EXTRACTED}" \
     | ts '%.s' \
-    | tee -a ${TEMP_LOG_FILE}
+    | tee -a "$TEMP_LOG_FILE"
 echo "Total extraction time: ${HH}:${MM}:${SS} (hh:mm:ss)" \
     | ts '%.s' \
-    | tee -a ${TEMP_LOG_FILE}
+    | tee -a "$TEMP_LOG_FILE"
 
 # Bash function to convert text log to JSON consumable by CloudWatch
-parse_logs () {
-    cat $1 \
-    | sed 's/ /|/' \
-    | sed 's/\([0-9]\.[0-9]\{3\}\)[0-9]\{1,\}/\1/' \
-    | sed 's/\.//' \
-    | awk '{print $1, $NF}' OFS='|' FS='|' \
-    | jq -Rn '[inputs
+parse_logs() {
+    sed 's/ /|/' "$1" \
+        | sed 's/\([0-9]\.[0-9]\{3\}\)[0-9]\{1,\}/\1/' \
+        | sed 's/\.//' \
+        | awk '{print $1, $NF}' OFS='|' FS='|' \
+        | jq -Rn '[inputs
     | . / "\n"
     | (.[] | select(length > 0) | . / "|") as $input
     | {"timestamp": $input[0]|tonumber, "message": $input[1]}]' \
-    > $2
+        > "$2"
 }
 
 # Create log stream in CloudWatch with today's date
 LOG_STREAM_NAME="sqoop-job-log-$(date -u +'%Y-%m-%d_%H-%M-%S')"
 /usr/bin/aws logs create-log-stream \
-    --log-group-name ${LOG_GROUP_NAME} \
-    --log-stream-name ${LOG_STREAM_NAME}
+    --log-group-name "$LOG_GROUP_NAME" \
+    --log-stream-name "$LOG_STREAM_NAME"
 
 # Convert text output from Docker and AWS CLI to clean JSON
 # for upload to AWS CloudWatch. Logs need to be split into chunks for upload
 # First loop skips using sequence token necessary for subsequent uploads
 COUNTER=1
-split -l 4000 ${TEMP_LOG_FILE} logs/temp-sqoop-log-
+split -l 4000 "$TEMP_LOG_FILE" logs/temp-sqoop-log-
 for f in logs/temp-sqoop-log-*; do
-    parse_logs ${f} ${f}.json
-    if [[ ${COUNTER} -eq 1 ]]; then
+    parse_logs "$f" "$f".json
+    if [[ "$COUNTER" -eq 1 ]]; then
         SEQ_TOKEN=$(
             /usr/bin/aws logs put-log-events \
-                --log-group-name ${LOG_GROUP_NAME} \
-                --log-stream-name ${LOG_STREAM_NAME} \
-                --log-events file://${f}.json \
-                | jq -r .nextSequenceToken
+                --log-group-name "$LOG_GROUP_NAME" \
+                --log-stream-name "$LOG_STREAM_NAME" \
+                --log-events file://"$f".json \
+                | jq -r .nextSequenceToken
         )
     else
         SEQ_TOKEN=$(
            /usr/bin/aws logs put-log-events \
-                --log-group-name ${LOG_GROUP_NAME} \
-                --log-stream-name ${LOG_STREAM_NAME} \
-                --log-events file://${f}.json \
-                --sequence-token ${SEQ_TOKEN} \
-                | jq -r .nextSequenceToken
+                --log-group-name "$LOG_GROUP_NAME" \
+                --log-stream-name "$LOG_STREAM_NAME" \
+                --log-events file://"$f".json \
+                --sequence-token "$SEQ_TOKEN" \
+                | jq -r .nextSequenceToken
         )
     fi
     COUNTER=$((COUNTER + 1))
@@ -122,5 +121,5 @@ done
 echo "Logs successfully uploaded to CloudWatch"
 
 # Remove uploaded log files
-mv ${TEMP_LOG_FILE} ${BACKUP_LOG_FILE}
-rm ${TEMP_LOG_FILE}*
+mv ./"$TEMP_LOG_FILE" ./"$BACKUP_LOG_FILE"
+if [[ -n "$TEMP_LOG_FILE" ]]; then rm ./"$TEMP_LOG_FILE"*; fi
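Most of the run.sh changes are the quoting fix shellcheck reports as SC2086: an unquoted expansion is word-split and glob-expanded before the command runs. A small illustration with a hypothetical value, not taken from this repo:

    log="logs/temp sqoop-log"   # hypothetical path containing a space
    tee $log                    # unquoted: tee receives two arguments, "logs/temp" and "sqoop-log"
    tee "$log"                  # quoted: tee receives the single intended path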
diff --git a/scripts/get-tables.sh b/scripts/get-tables.sh
index 0815d91..2f43e0a 100755
--- a/scripts/get-tables.sh
+++ b/scripts/get-tables.sh
@@ -1,35 +1,36 @@
 #!/bin/bash
 
 # Check if table env var exists from docker, if not, use all tables
-if [[ -z ${IPTS_TABLE} ]]; then
+if [[ -z "$IPTS_TABLE" ]]; then
     JOB_TABLES=$(awk -F"," 'NR>1 {print $1}' /tmp/tables/tables-list.csv)
 else
-    JOB_TABLES=${IPTS_TABLE}
+    JOB_TABLES="$IPTS_TABLE"
 fi
 
 # Create database in HDFS to store tables
 DB_NAME=iasworld
 hive -e "DROP DATABASE IF EXISTS ${DB_NAME}; CREATE DATABASE ${DB_NAME};"
 
-echo "Creating jobs for table(s): $(echo ${JOB_TABLES} | paste -sd,)"
+JOB_TABLES_C=$(echo "$JOB_TABLES" | paste -sd,)
+echo "Creating jobs for table(s): ${JOB_TABLES_C}"
 
 for TABLE in ${JOB_TABLES}; do
     # Get lowercase table name
-    TABLE_LC=$(echo ${TABLE} | awk '{print tolower($0)}')
-    mkdir -p /tmp/bindir/${TABLE}
+    TABLE_LC=$(echo "$TABLE" | awk '{print tolower($0)}')
+    mkdir -p /tmp/bindir/"$TABLE"
 
     # Options passed to sqoop. Fetch 1 row of each table in order
     # to grab metadata
     SQOOP_OPTIONS_MAIN=(
         job -D oracle.sessionTimeZone=America/Chicago \
-        --create ${TABLE} -- import \
+        --create "$TABLE" -- import \
         --bindir /tmp/bindir/ \
-        --connect jdbc:oracle:thin:@//${IPTS_HOSTNAME}:${IPTS_PORT}/${IPTS_SERVICE_NAME} \
-        --username ${IPTS_USERNAME} \
+        --connect jdbc:oracle:thin:@//"$IPTS_HOSTNAME":"$IPTS_PORT"/"$IPTS_SERVICE_NAME" \
+        --username "$IPTS_USERNAME" \
         --password-file file:///run/secrets/IPTS_PASSWORD \
         --query "SELECT * FROM IASWORLD.${TABLE} WHERE \$CONDITIONS FETCH FIRST 1 ROWS ONLY" \
-        --hcatalog-database ${DB_NAME} \
-        --hcatalog-table ${TABLE} \
+        --hcatalog-database "$DB_NAME" \
+        --hcatalog-table "$TABLE" \
         --drop-and-create-hcatalog-table \
         --num-mappers 1
     )
@@ -39,10 +40,10 @@ for TABLE in ${JOB_TABLES}; do
     sqoop "${SQOOP_OPTIONS_MAIN[@]}"
 
     # Execute saved sqoop job
-    sqoop job --exec ${TABLE}
+    sqoop job --exec "$TABLE"
 
     # Export table definition to mounted dir
     rm -rf /tmp/hadoop-root/nm-local-dir/usercache/*
-    hive -e "SHOW CREATE TABLE iasworld.${TABLE_LC}" > /tmp/tables/${TABLE}.sql
+    hive -e "SHOW CREATE TABLE iasworld.${TABLE_LC}" > /tmp/tables/"$TABLE".sql
     echo "Completed job for table: ${TABLE}"
 done
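The sqoop arguments above are collected in a bash array and expanded as "${SQOOP_OPTIONS_MAIN[@]}"; quoting the [@] expansion keeps each element intact as a single word, which is what lets the --query string carry spaces safely. A minimal sketch with a throwaway array and an example table name (hypothetical here):

    TABLE=CVLEG
    OPTS=(--query "SELECT * FROM IASWORLD.${TABLE} WHERE \$CONDITIONS" --num-mappers 1)
    printf '%s\n' "${OPTS[@]}"   # prints four separate arguments; the SQL string is not re-split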
diff --git a/scripts/run-sqoop.sh b/scripts/run-sqoop.sh
index b8e36c6..bd1369b 100755
--- a/scripts/run-sqoop.sh
+++ b/scripts/run-sqoop.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 # Check if table env var exists from docker, if not, use all tables
-if [[ -z ${IPTS_TABLE} ]]; then
+if [[ -z "$IPTS_TABLE" ]]; then
     JOB_TABLES=$(awk -F"," 'NR>1 {print $1}' /tmp/tables/tables-list.csv)
 else
-    JOB_TABLES=$(echo ${IPTS_TABLE} | tr -cd '[:alpha:][:space:]_')
+    JOB_TABLES=$(echo "$IPTS_TABLE" | tr -cd '[:alpha:][:space:]_')
 fi
 
 # Create database in HDFS to store tables
@@ -15,39 +15,41 @@ echo "Creating jobs for table(s): ${JOB_TABLES}"
 
 for TABLE in ${JOB_TABLES}; do
     # Get lowercase table name
-    TABLE_LC=$(echo ${TABLE} | awk '{print tolower($0)}')
-    mkdir -p /tmp/bindir/${TABLE}
+    TABLE_LC=$(echo "$TABLE" | awk '{print tolower($0)}')
+    mkdir -p /tmp/bindir/"$TABLE"
 
     # Boolean value for whether $TABLE contains the TAXYR column
-    CONTAINS_TAXYR=$(awk -F"," \
-        -v table=${TABLE} \
-        '$1 == table {print $2}' \
-        /tmp/tables/tables-list.csv |
-        tr -d "\n" | tr -d "\r"
+    CONTAINS_TAXYR=$(
+        awk -F"," \
+            -v table="$TABLE" \
+            '$1 == table {print $2}' \
+            /tmp/tables/tables-list.csv \
+            | tr -d "\n" | tr -d "\r"
     )
 
     # Get the number of buckets to use for Hive partitioning
-    NUM_BUCKETS=$(awk -F"," \
-        -v table=${TABLE} \
-        '$1 == table {print $3}' \
-        /tmp/tables/tables-list.csv |
-        tr -d "\n" | tr -d "\r"
+    NUM_BUCKETS=$(
+        awk -F"," \
+            -v table="$TABLE" \
+            '$1 == table {print $3}' \
+            /tmp/tables/tables-list.csv \
+            | tr -d "\n" | tr -d "\r"
     )
 
     # Construct a query based on specified year and condition
-    QUERY_YEAR=$(echo ${IPTS_TABLE} | grep -Po "(?<=${TABLE}.)[0-9]{4}")
-    QUERY_COND=$(echo ${IPTS_TABLE} | grep -Po "(?<=${TABLE})[<>=]")
-    if [[ -z ${QUERY_YEAR} && -z ${QUERY_COND} ]]; then
+    QUERY_YEAR=$(echo "$IPTS_TABLE" | grep -Po "(?<=${TABLE}.)[0-9]{4}")
+    QUERY_COND=$(echo "$IPTS_TABLE" | grep -Po "(?<=${TABLE})[<>=]")
+    if [[ -z "$QUERY_YEAR" && -z "$QUERY_COND" ]]; then
         NUM_MAPPERS=$(($(date +%Y) - 1998))
         BOUNDARY_QUERY="SELECT MIN(TAXYR), MAX(TAXYR) FROM IASWORLD.${TABLE}"
         QUERY="SELECT * FROM IASWORLD.${TABLE} WHERE \$CONDITIONS"
     else
         # Make number of mappers roughly equal to number of tax years
-        if [[ ${QUERY_COND} == \> ]]; then
-            NUM_MAPPERS=$(($(date +%Y) - ${QUERY_YEAR}))
+        if [[ "$QUERY_COND" == \> ]]; then
+            NUM_MAPPERS=$(($(date +%Y) - "$QUERY_YEAR"))
         elif [[ ${QUERY_COND} == \< ]]; then
-            NUM_MAPPERS=$((${QUERY_YEAR} - 1998))
+            NUM_MAPPERS=$(("$QUERY_YEAR" - 1998))
         else
             NUM_MAPPERS=1
         fi
@@ -60,28 +62,28 @@ for TABLE in ${JOB_TABLES}; do
 
     # Options passed to sqoop
     SQOOP_OPTIONS_MAIN=(
-        job -D oracle.sessionTimeZone=America/Chicago \
-        --create ${TABLE} -- import \
-        --bindir /tmp/bindir/${TABLE} \
-        --connect jdbc:oracle:thin:@//${IPTS_HOSTNAME}:${IPTS_PORT}/${IPTS_SERVICE_NAME} \
-        --username ${IPTS_USERNAME} \
-        --password-file file:///run/secrets/IPTS_PASSWORD \
-        --query "${QUERY}" \
-        --hcatalog-database ${DB_NAME} \
-        --hcatalog-table ${TABLE}
+        job -D oracle.sessionTimeZone=America/Chicago
+        --create "$TABLE" -- import
+        --bindir /tmp/bindir/"$TABLE"
+        --connect jdbc:oracle:thin:@//"$IPTS_HOSTNAME":"$IPTS_PORT"/"$IPTS_SERVICE_NAME"
+        --username "$IPTS_USERNAME"
+        --password-file file:///run/secrets/IPTS_PASSWORD
+        --query "${QUERY}"
+        --hcatalog-database "$DB_NAME"
+        --hcatalog-table "$TABLE"
     )
 
     # Create the Hive tables necessary to run sqoop jobs
     # Table definitions are stored in the repo
-    hive -f /tmp/tables/${TABLE}.sql
+    hive -f /tmp/tables/"$TABLE".sql
 
     # Create a sqoop job for the selected table(s). Tables with TAXYR
     # get partitioning and splitby during sqoop import
-    if [[ ${CONTAINS_TAXYR} == TRUE ]]; then
+    if [[ "$CONTAINS_TAXYR" == TRUE ]]; then
         sqoop "${SQOOP_OPTIONS_MAIN[@]}" \
             --boundary-query "${BOUNDARY_QUERY}" \
             --split-by TAXYR \
-            --num-mappers ${NUM_MAPPERS}
+            --num-mappers "$NUM_MAPPERS"
     else
         sqoop "${SQOOP_OPTIONS_MAIN[@]}" -m 1
     fi
@@ -91,13 +93,13 @@ for TABLE in ${JOB_TABLES}; do
     sqoop job \
         -D java.security.egd=file:///dev/./urandom \
         -D mapred.child.java.opts="-Djava.security.egd=file:///dev/./urandom" \
-        --exec ${TABLE}
+        --exec "$TABLE"
 
     # If buckets are specified, rewrite output from sqoop to bucketed table
     # Then copy from distributed file system (HDFS) to local mounted dir
-    mkdir -p /tmp/target/${TABLE}
-    if [[ ${NUM_BUCKETS} -gt 1 ]]; then
-        if [[ ${CONTAINS_TAXYR} == TRUE ]]; then
+    mkdir -p /tmp/target/"$TABLE"
+    if [[ "$NUM_BUCKETS" -gt 1 ]]; then
+        if [[ "$CONTAINS_TAXYR" == TRUE ]]; then
             hive -e \
                 "INSERT OVERWRITE TABLE ${DB_NAME}.${TABLE_LC}_bucketed PARTITION(taxyr)
                 SELECT * FROM ${DB_NAME}.${TABLE_LC};"
@@ -107,21 +109,21 @@ for TABLE in ${JOB_TABLES}; do
                 SELECT * FROM ${DB_NAME}.${TABLE_LC};"
         fi
         hdfs dfs -copyToLocal \
-            /user/hive/warehouse/${DB_NAME}.db/${TABLE_LC}_bucketed/* \
-            /tmp/target/${TABLE}
+            /user/hive/warehouse/"$DB_NAME".db/"$TABLE_LC"_bucketed/* \
+            /tmp/target/"$TABLE"
     else
         hdfs dfs -copyToLocal \
-            /user/hive/warehouse/${DB_NAME}.db/${TABLE_LC}/* \
-            /tmp/target/${TABLE}
+            /user/hive/warehouse/"$DB_NAME".db/"$TABLE_LC"/* \
+            /tmp/target/"$TABLE"
     fi
 
     # Remove tables in HDFS once moved to local
-    hdfs dfs -rm -r -f /user/hive/warehouse/${DB_NAME}.db/${TABLE_LC}_bucketed
-    hdfs dfs -rm -r -f /user/hive/warehouse/${DB_NAME}.db/${TABLE_LC}
+    hdfs dfs -rm -r -f /user/hive/warehouse/"$DB_NAME".db/"$TABLE_LC"_bucketed
+    hdfs dfs -rm -r -f /user/hive/warehouse/"$DB_NAME".db/"$TABLE_LC"
 
     # Remove _SUCCESS files and rename parts to parquet
-    find /tmp/target/${TABLE} -type f -name '_SUCCESS' -delete
-    find /tmp/target/${TABLE} -type f -exec mv {} {}.snappy.parquet \;
+    find /tmp/target/"$TABLE" -type f -name '_SUCCESS' -delete
+    find /tmp/target/"$TABLE" -type f -exec mv {} {}.snappy.parquet \;
 
     # Clear the sqoop workers cache of copied JARs
     rm -rf /tmp/hadoop-root/nm-local-dir/usercache/*
diff --git a/tables/update-tables.sh b/tables/update-tables.sh
index 5e5ba5d..baf9044 100755
--- a/tables/update-tables.sh
+++ b/tables/update-tables.sh
@@ -2,99 +2,99 @@
 # Script to convert table definitions created by Sqoop into
 # bucketed, partitioned tables saved as parquet
 TABLES=$(awk -F"," 'NR>1 {print $1}' tables-list.csv)
-DB_NAME=iasworld
 
 # Loop through each file and convert
 for TABLE in ${TABLES}; do
-    TABLE_LC=$(echo ${TABLE} | awk '{print tolower($0)}')
+    TABLE_LC=$(echo "$TABLE" | awk '{print tolower($0)}')
 
     # Boolean value for whether $TABLE contains the TAXYR column
-    CONTAINS_TAXYR=$(awk -F"," \
-        -v table=${TABLE} \
-        '$1 == table {print $2}' \
-        tables-list.csv |
-        tr -d "\n" | tr -d "\r"
+    CONTAINS_TAXYR=$(
+        awk -F"," \
+            -v table="$TABLE" \
+            '$1 == table {print $2}' \
+            tables-list.csv \
+            | tr -d "\n" | tr -d "\r"
     )
 
     # Get the number of buckets to use for Hive partitioning
-    NUM_BUCKETS=$(awk -F"," \
-        -v table=${TABLE} \
-        '$1 == table {print $3}' \
-        tables-list.csv |
-        tr -d "\n" | tr -d "\r"
+    NUM_BUCKETS=$(
+        awk -F"," \
+            -v table="$TABLE" \
+            '$1 == table {print $3}' \
+            tables-list.csv \
+            | tr -d "\n" | tr -d "\r"
     )
 
     # Remove properties and options added by Sqoop and remove taxyr col (it
     # will be readded manually)
-    cat ${TABLE}.sql \
-    | sed '/ROW FORMAT SERDE/Q' \
-    | sed '/`taxyr` decimal(4,0)/d' \
-    > ${TABLE}.sql.tmp1
+    sed '/ROW FORMAT SERDE/Q' "$TABLE".sql \
+        | sed "/\`taxyr\` decimal(4,0)/d" \
+        > "$TABLE".sql.tmp1
 
     # For TAXYR and BUCKETS, create an unpartitioned, unbucketed table
     # as a temporary place for sqoop to extract to, and a partitioned,
     # bucketed table as a final location
-    if [[ ${CONTAINS_TAXYR} == TRUE && ${NUM_BUCKETS} -gt 1 ]]; then
+    if [[ "$CONTAINS_TAXYR" == TRUE && "$NUM_BUCKETS" -gt 1 ]]; then
 
         # Create partitioned bucketed table def
-        cp ${TABLE}.sql.tmp1 ${TABLE}.sql.tmp2
-        sed -i "/^CREATE TABLE/s/${TABLE_LC}/${TABLE_LC}\_bucketed/" ${TABLE}.sql.tmp2
+        cp "$TABLE".sql.tmp1 "$TABLE".sql.tmp2
+        sed -i "/^CREATE TABLE/s/${TABLE_LC}/${TABLE_LC}\_bucketed/" "$TABLE".sql.tmp2
         echo "PARTITIONED BY (\`taxyr\` string)
         CLUSTERED BY (\`parid\`)
         SORTED BY (\`seq\`)
         INTO ${NUM_BUCKETS} BUCKETS
         STORED AS PARQUET
         TBLPROPERTIES ('parquet.compression'='SNAPPY');" \
            | tr -s ' ' \
-            >> ${TABLE}.sql.tmp2
+            >> "$TABLE".sql.tmp2
 
         # Create table def to extract to
-        sed -i '$ s/.$/\,/' ${TABLE}.sql.tmp1
+        sed -i '$ s/.$/\,/' "$TABLE".sql.tmp1
         echo "
         \`taxyr\` string)
         STORED AS RCFILE;
-        " >> ${TABLE}.sql.tmp1
+        " >> "$TABLE".sql.tmp1
 
-        cat ${TABLE}.sql.tmp1 ${TABLE}.sql.tmp2 > ${TABLE}.sql
+        cat "$TABLE".sql.tmp1 "$TABLE".sql.tmp2 > "$TABLE".sql
 
     # For tables with TAXYR but not enough data to bucket, do the above,
    # but remove buckets and temp table (sqoop can extract directly as
    # long as there are no buckets)
-    elif [[ ${CONTAINS_TAXYR} == TRUE && ! ${NUM_BUCKETS} -gt 1 ]]; then
+    elif [[ "$CONTAINS_TAXYR" == TRUE && ! "$NUM_BUCKETS" -gt 1 ]]; then
 
-        cat ${TABLE}.sql.tmp1 | sed '/taxyr/d' > ${TABLE}.sql.tmp2
+        sed '/taxyr/d' "$TABLE".sql.tmp1 > "$TABLE".sql.tmp2
         echo "PARTITIONED BY (\`taxyr\` string)
         STORED AS PARQUET
         TBLPROPERTIES ('parquet.compression'='SNAPPY');" \
            | tr -s ' ' \
-            >> ${TABLE}.sql.tmp2
+            >> "$TABLE".sql.tmp2
 
-        cat ${TABLE}.sql.tmp2 > ${TABLE}.sql
+        cat "$TABLE".sql.tmp2 > "$TABLE".sql
 
     # With buckets and no TAXYR, don't create partitions
-    elif [[ ${CONTAINS_TAXYR} == FALSE && ${NUM_BUCKETS} -gt 1 ]]; then
+    elif [[ "$CONTAINS_TAXYR" == FALSE && "$NUM_BUCKETS" -gt 1 ]]; then
 
-        cp ${TABLE}.sql.tmp1 ${TABLE}.sql.tmp2
-        sed -i "s/${TABLE_LC}/${TABLE_LC}\_bucketed/g" ${TABLE}.sql.tmp2
+        cp "$TABLE".sql.tmp1 "$TABLE".sql.tmp2
+        sed -i "s/${TABLE_LC}/${TABLE_LC}\_bucketed/g" "$TABLE".sql.tmp2
         echo "CLUSTERED BY (\`parid\`)
         SORTED BY (\`seq\`)
         INTO ${NUM_BUCKETS} BUCKETS
         STORED AS PARQUET
        TBLPROPERTIES ('parquet.compression'='SNAPPY');" \
            | tr -s ' ' \
-            >> ${TABLE}.sql.tmp2
+            >> "$TABLE".sql.tmp2
 
         echo "STORED AS RCFILE;
-        " >> ${TABLE}.sql.tmp1
-        cat ${TABLE}.sql.tmp1 ${TABLE}.sql.tmp2 > ${TABLE}.sql
+        " >> "$TABLE".sql.tmp1
+        cat "$TABLE".sql.tmp1 "$TABLE".sql.tmp2 > "$TABLE".sql
 
     else
         echo "STORED AS PARQUET
-TBLPROPERTIES ('parquet.compression'='SNAPPY');" >> ${TABLE}.sql.tmp1
-        mv ${TABLE}.sql.tmp1 ${TABLE}.sql
+TBLPROPERTIES ('parquet.compression'='SNAPPY');" >> "$TABLE".sql.tmp1
+        mv "$TABLE".sql.tmp1 "$TABLE".sql
     fi
 
     # Delete bucketing sort of CV table (no seq number)
-    if [[ ${TABLE} == CVLEG ]]; then
-        sed -i 's/SORTED BY (`seq`) //' ${TABLE}.sql
-    elif [[ ${TABLE} == CVOWN || ${TABLE} == CVTRAN ]]; then
-        sed -i '/^CLUSTERED BY/d' ${TABLE}.sql
+    if [[ "$TABLE" == CVLEG ]]; then
+        sed -i "s/SORTED BY (\`seq\`) //" "$TABLE".sql
+    elif [[ "$TABLE" == CVOWN || "$TABLE" == CVTRAN ]]; then
+        sed -i "/^CLUSTERED BY/d" "$TABLE".sql
     fi
 
     rm -f ./*.tmp*