-
Notifications
You must be signed in to change notification settings - Fork 128
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #615 from dbic/enh-anon-cmd
Add a bash anon-cmd to be used to incrementally anonymize sids
- Loading branch information
Showing
1 changed file
with
54 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/bin/bash | ||
# Generic anonymization script which would anonymize sid based on what it had | ||
# seen in the past or simply what the translation dict already has. | ||
|
||
set -eu | ||
|
||
debug() { | ||
: echo "DEBUG: $*" >&2 | ||
} | ||
|
||
# Translation file location | ||
# Store under .git by default to guarantee that it is not committed or locked by git-annex etc | ||
# But it might not fit some usecases where there is no .git | ||
anon_file_default=$(dirname "$0")/../.git/anon_sid_map.csv | ||
anon_file="${AC_ANON_FILE:-$anon_file_default}" | ||
anon_fmt="${AC_ANON_FMT:-%03d}" | ||
|
||
sid="$1" | ||
|
||
# harmonize since elderly awk on rolando seems to have no clue about IGNORECASE | ||
sid=$(echo "$sid" | tr '[:lower:]' '[:upper:]') | ||
|
||
debug "Using $anon_file to map $sid" | ||
|
||
if [ ! -e "$anon_file" ]; then | ||
touch "$anon_file" # initiate it | ||
fi | ||
|
||
# apparently heudiconv passes even those we provided in `-s` CLI option | ||
# to anonymization script. So, we will have to match those by our format | ||
# and then give back if matches. That would forbid plain remapping though if | ||
# original ids are in the same format, so some folks might want to disable that! | ||
sid_input_fmted=$(echo "$sid" | sed -e 's,^0*,,g' | xargs printf "$anon_fmt" 2>&1 || :) | ||
if [ "$sid" = "$sid_input_fmted" ]; then | ||
debug already in the anonymized format | ||
echo "$sid" | ||
exit 0 | ||
fi | ||
|
||
res=$(grep "^$sid," "$anon_file" | head -n 1) | ||
if [ -n "$res" ]; then | ||
ann="${res##*,}" | ||
debug "Found $ann in '$res'" | ||
else | ||
echo "We have all sids mapped already! Will not create a new one for $sid" >&2; exit 1 | ||
# need to take the latest one | ||
largest=$(sed -e 's/.*,//g' "$anon_file" | sort -n | tail -n1 | sed -e 's,^0*,,g') | ||
next=$((largest+1)) | ||
# shellcheck disable=SC2059 | ||
ann=$(printf "$anon_fmt" $next) | ||
debug "Found $largest and $next to get $ann, storing" | ||
echo "$sid,$ann" >> "$anon_file" | ||
fi | ||
echo "$ann" |