Skip to content

Latest commit

 

History

History
126 lines (98 loc) · 4.62 KB

03_Genotyping.md

File metadata and controls

126 lines (98 loc) · 4.62 KB

Genotyping

Small test for cosigt on UTHSC cluster

Preparation

mkdir -p /scratch/small_test && cd /scratch/small_test

# Region-of-interest
mkdir roi && echo -e "grch38#chr6\t31972057\t32055418" > roi/roi.bed

# Cram
mkdir cram && cd cram
wget https://1000genomes.s3.amazonaws.com/1000G_2504_high_coverage/additional_698_related/data/ERR3988768/HG00438.final.cram
wget https://1000genomes.s3.amazonaws.com/1000G_2504_high_coverage/additional_698_related/data/ERR3988768/HG00438.final.cram.crai
cd ..

# Reference
mkdir reference && cd reference
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
samtools faidx GRCh38_full_analysis_set_plus_decoy_hla.fa
cd ..

# Assemblies
mkdir assemblies && cd assemblies
wget -O HPRC-yr1.agc "https://zenodo.org/record/5826274/files/HPRC-yr1.agc?download=1"
rm chr6.y1.fa
while read -r line; do
    agc getctg HPRC-yr1.agc $line >> chr6.y1.fa
done < $dir_base/data/chr6.y1.txt
samtools faidx ../reference/GRCh38_full_analysis_set_plus_decoy_hla.fa chr6 | sed 's/^>chr6/>grch38#chr6/' > chr6.grhch38.fa
cat chr6.grhch38.fa >> chr6.y1.fa
samtools faidx chr6.y1.fa
cd ..

# Prepare pipeline and two OPTIONAL input files
git clone https://github.com/davidebolo1993/cosigt
cd cosigt
git checkout eb36f56f210be9de9859fbe7902a21879267a94a
cd cosigt_smk

# Disable stuff we don't care about for now
sed "/\/ava\.pdf/s/^/#/" workflow/Snakefile -i
sed "/\/pgrtk/s/^/#/" workflow/Snakefile -i
sed "/annotations/s/^/#/" workflow/Snakefile -i
sed "/\/untangle/s/^/#/" workflow/Snakefile -i

# Fix current issues
sed 's/reg[3]/reg[2]/g' workflow/scripts/annotate.r -i
sed 's/reg[4]/reg[3]/g' workflow/scripts/annotate.r -i

# OPTIONAL: create a map of alignment name to id
for s in $(ls ../../cram/*.cram); do cram=$(basename $s) && id=$(echo $cram | cut -d "." -f 1) && echo -e "$cram\t$id"; done > sample.map.tsv

# OPTIONAL: add some annotations for the region of interest
cd resources/annotations
wget -c https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.annotation.gtf.gz
cd ../..

# Create a temporary directory
mkdir -p /scratch/small_test/tmp

# Prepare the configuration files
conda activate /lizardfs/guarracino/condatools/cosigt

cd /scratch/small_test/cosigt/cosigt_smk
python workflow/scripts/organize.py \
    -a /scratch/small_test/cram \
    -r /scratch/small_test/reference/GRCh38_full_analysis_set_plus_decoy_hla.fa \
    --assemblies /scratch/small_test/assemblies/chr6.y1.fa \
    --roi /scratch/small_test/roi/roi.bed \
    --wfmash_tmpdir /scratch/small_test/tmp \
    --pggb_tmpdir /scratch/small_test/tmp \
    --output /scratch/small_test/output \
    --samplemap /scratch/small_test/cosigt/cosigt_smk/sample.map.tsv \
    --annotation /scratch/small_test/cosigt/cosigt_smk/resources/annotations/gencode.v47.annotation.gtf.gz

conda deactivate

Genotyping

On the UTHSC cluster, put the following in ~/.bashrc (or ~/.zshrc):

export PATH=$(echo $PATH | tr ':' '\n' | awk '!(/\/gnu\/store\// || /guix/)' | paste -sd ':') # Remove guix's path

export GUIX_PROFILE="/home/guarracino/.guix-profile"
 . "$GUIX_PROFILE/etc/profile"
#export PYTHONNOUSERSITE=1 # Tells Python not to use the user site-packages directory and clear the Python path
#export PATH="/home/guarracino/.guix-profile/bin:$PATH"
#export GUIX_LOCPATH="/home/guarracino/.guix-profile/lib/locale"

export PATH="/lizardfs/guarracino/tools/bedtools2/bin:$PATH"
export PATH="/lizardfs/guarracino/tools/samtools-1.21:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/gafpack/target/release:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/GFAffix/target/release:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/impg/target/release:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/gfainject/target/release:$PATH"

export PATH="/lizardfs/guarracino/tools_for_cosigt/wfmash/build/bin:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/seqwish/bin:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/smoothxg/bin:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/odgi/bin:$PATH"
export PATH="/lizardfs/guarracino/tools_for_cosigt/pggb:$PATH"

# Added for go and cosigt installation
export PATH="/lizardfs/guarracino/tools/go/bin:$PATH"
export PATH="/lizardfs/guarracino/tools/agc-1.1_x64-linux:$PATH"
export PATH="/lizardfs/guarracino/git/cosigt:$PATH"

Run the cosigt pipeline:

# Activate conda environment
conda activate /lizardfs/guarracino/condatools/cosigt

cd /scratch/small_test/cosigt/cosigt_smk
snakemake cosigt --cores 8

conda deactivate

Results will be in /scratch/small_test/output.