From 44ad9cb8ecc11ebab5f70bc0089af43e47ff8f58 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 6 Aug 2024 15:19:06 +1000 Subject: [PATCH 1/4] [QOLDEV-863] adjust Solr sync approach for more robustness - Export to EFS as an archive, not an exploded directory - Import by stopping Solr and wholesale replacing the index, not via replication restore endpoint --- files/default/solr-sync.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/files/default/solr-sync.sh b/files/default/solr-sync.sh index 2e55c59..b128c37 100644 --- a/files/default/solr-sync.sh +++ b/files/default/solr-sync.sh @@ -7,7 +7,7 @@ set -x BACKUP_NAME="$CORE_NAME-$(date +'%Y-%m-%dT%H:%M')" SNAPSHOT_NAME="snapshot.$BACKUP_NAME" LOCAL_SNAPSHOT="$LOCAL_DIR/$SNAPSHOT_NAME" -SYNC_SNAPSHOT="$SYNC_DIR/$SNAPSHOT_NAME" +SYNC_SNAPSHOT="$SYNC_DIR/${SNAPSHOT_NAME}.tgz" MINUTE=$(date +%M) function set_dns_primary () { @@ -52,17 +52,17 @@ function export_snapshot () { if [ "$REPLICATION_STATUS" != "0" ]; then return $REPLICATION_STATUS fi - sudo -u solr sh -c "$LUCENE_CHECK $LOCAL_SNAPSHOT && rsync -a --delete $LOCAL_SNAPSHOT/ $SYNC_SNAPSHOT/" || return 1 + sh -c "$LUCENE_CHECK $LOCAL_SNAPSHOT && sudo -u solr tar --force-local --exclude=write.lock -czf $SYNC_SNAPSHOT -C $LOCAL_SNAPSHOT ." || return 1 } function import_snapshot () { # Give the master time to update the sync copy for i in $(eval echo "{1..40}"); do - if [ -f "$SYNC_SNAPSHOT/write.lock" ]; then - sudo -u solr rm -r $LOCAL_DIR/snapshot.$CORE_NAME-* - sudo -u solr rsync -a --delete "$SYNC_SNAPSHOT/" "$LOCAL_SNAPSHOT/" || exit 1 - rm $LOCAL_SNAPSHOT/write.lock - curl "$HOST/$CORE_NAME/replication?command=restore&location=$LOCAL_DIR&name=$BACKUP_NAME" + if [ -f "$SYNC_SNAPSHOT" ]; then + sudo service solr stop + sudo -u solr mkdir $LOCAL_DIR/index + rm $LOCAL_DIR/index/* && sudo -u solr tar -xzf "$SYNC_SNAPSHOT" -C $LOCAL_DIR/index || exit 1 + sudo service solr start return 1 else sleep 5 @@ -100,9 +100,7 @@ if (/usr/local/bin/pick-solr-master.sh); then # Hourly backup to S3 if [ "$MINUTE" = "00" ]; then - cd "$LOCAL_DIR" - tar --force-local -czf "$SNAPSHOT_NAME.tgz" "$SNAPSHOT_NAME" - aws s3 mv "$SNAPSHOT_NAME.tgz" "s3://$BUCKET/solr_backup/$CORE_NAME/" --expires $(date -d '30 days' --iso-8601=seconds) + aws s3 cp "$SYNC_SNAPSHOT" "s3://$BUCKET/solr_backup/$CORE_NAME/" --expires $(date -d '30 days' --iso-8601=seconds) fi else # make traffic come to this instance only as a backup option From 1530584f49352698339b436933225ca7f3749136 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 6 Aug 2024 16:08:17 +1000 Subject: [PATCH 2/4] [QOLDEV-863] clean up long-obsolete health check files --- recipes/ckanbatch-configure.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/recipes/ckanbatch-configure.rb b/recipes/ckanbatch-configure.rb index 63a770b..ec19ac7 100644 --- a/recipes/ckanbatch-configure.rb +++ b/recipes/ckanbatch-configure.rb @@ -60,6 +60,13 @@ group "root" end +file "/etc/cron.daily/prune-health-checks" do + content "/usr/local/bin/pick-job-server.sh && find /data -maxdepth 1 -name '*-healthcheck_*' -mmin '+60' -execdir rm '{}' ';' >/dev/null 2>&1\n" + mode "0755" + owner "root" + group "root" +end + file "/etc/cron.d/ckan-worker" do content "*/5 * * * * root /usr/local/bin/pick-job-server.sh && /usr/local/bin/ckan-monitor-job-queue.sh >/dev/null 2>&1\n" mode '0644' From 3350b871e3600adfb955acdd727caffb69dc528c Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Tue, 6 Aug 2024 16:47:59 +1000 Subject: [PATCH 3/4] [QOLDEV-863] update initial Solr config to grab archive instead of exploded dir --- recipes/solr-deploy.rb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/recipes/solr-deploy.rb b/recipes/solr-deploy.rb index e5ae39b..7b71256 100755 --- a/recipes/solr-deploy.rb +++ b/recipes/solr-deploy.rb @@ -267,10 +267,15 @@ action [:stop] end bash "Copy latest index from EFS" do + user account_name code <<-EOS rsync -a --delete #{efs_data_dir}/ #{real_data_dir}/ - LATEST_INDEX=`ls -dtr #{efs_data_dir}/data/#{core_name}/data/snapshot.* |tail -1` - rsync $LATEST_INDEX/ #{real_data_dir}/data/#{core_name}/data/index/ + CORE_DATA="#{real_data_dir}/data/#{core_name}/data" + LATEST_INDEX=`ls -dtr $CORE_DATA/snapshot.* |tail -1` + if (echo "$LATEST_INDEX" |grep "[.]tgz$" >/dev/null 2>&1); then + mkdir -p "$CORE_DATA/index" + rm -f $CORE_DATA/index/*; tar -xzf "$LATEST_INDEX" -C $CORE_DATA/index + fi EOS only_if { ::File.directory? efs_data_dir } end From a7dc38377cb1aefa2e0ee774f6c988cdd7484064 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Wed, 7 Aug 2024 11:02:02 +1000 Subject: [PATCH 4/4] [QOLDEV-863] use Systemd to start Solr during sync --- files/default/solr-sync.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/files/default/solr-sync.sh b/files/default/solr-sync.sh index b128c37..56e7356 100644 --- a/files/default/solr-sync.sh +++ b/files/default/solr-sync.sh @@ -62,8 +62,8 @@ function import_snapshot () { sudo service solr stop sudo -u solr mkdir $LOCAL_DIR/index rm $LOCAL_DIR/index/* && sudo -u solr tar -xzf "$SYNC_SNAPSHOT" -C $LOCAL_DIR/index || exit 1 - sudo service solr start - return 1 + sudo systemctl start solr + return 0 else sleep 5 fi