Skip to content

Commit

Permalink
src and Dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
dvantwisk committed Nov 3, 2024
1 parent 2dc3e89 commit 6420f81
Show file tree
Hide file tree
Showing 29 changed files with 750 additions and 52 deletions.
72 changes: 20 additions & 52 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#COPY requirements.txt .
#RUN uv pip install -r requirements.txt

#FROM quay.io/biocontainers/pbmm2:1.16.0--h9ee0642_0 AS ONE_P2
#FROM quay.io/biocontainers/pbmm2:1.16.0--h9ee0642_0 AS Pbmm2
#FROM quay.io/biocontainers/pbfusion:0.3.1--hdfd78af_0 AS PBFusion
#FROM quay.io/biocontainers/longgf:0.1.2--h4ac6f70_7 AS LongGF
#FROM davidsongroup/jaffa:2.4 AS JAFFA
Expand All @@ -15,7 +15,7 @@
#FROM quay.io/biocontainers/genion:1.2.3--hdcf5f25_1 AS Genion
#FROM quay.io/biocontainers/star-fusion:1.10.0--hdfd78af_1 AS Starfusion
#FROM staphb/samtools:1.9 AS Samtools
#FROM r-base:4.4.0 AS Rbase
#FROM rocker/r-base:4.4.0 AS Rbase

#FROM mambaorg/micromamba:2.0.2

Expand All @@ -26,7 +26,6 @@
#COPY requirements.txt .
#RUN micromamba create -f requirements.txt


FROM ubuntu:22.04 AS Fusim

WORKDIR /opt
Expand All @@ -37,62 +36,31 @@ RUN apt-get update && \
RUN wget https://github.com/aebruno/fusim/raw/master/releases/fusim-0.2.2-bin.zip && \
unzip fusim-0.2.2-bin.zip

FROM ubuntu:22.04
#COPY --from=ONE_P2 /usr/local/bin/pbmm2 /bin/
RUN wget https://www.niehs.nih.gov/sites/default/files/2024-02/artbinmountrainier2016.06.05linux64.tgz && \
tar xvzf artbinmountrainier2016.06.05linux64.tgz

FROM mambaorg/micromamba:2.0.2

COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml

#FROM ubuntu:22.04
#COPY --from=Pbmm2 /usr/local/bin/pbmm2 /bin/
#COPY --from=PBFusion /usr/local/bin/pbfusion /bin/
#COPY --from=Genion /opt/conda/envs/env/bin//genion /bin/
#COPY --from=Minimap2 /usr/local/bin/minimap2 /bin/
#COPY --from=JAFFA /JAFFA /JAFFA
#COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
#COPY --from=Rbase /usr/bin /bin/
COPY --from=Fusim /opt/ /bin/

RUN apt-get update && \
apt-get install -y openjdk-11-jre-headless wget unzip && \
apt-get clean;

ENV PATH="$PATH:/bin:/JAFFA/tools/bin:/JAFFA:/fusim-0.2.2"

#FROM ghcr.io/astral-sh/uv:0.2.12 AS builder
#COPY --from=Fusim /opt/ /bin/

#RUN wget -O fusim-0.2.2.zip https://github.com/aebruno/fusim/raw/master/releases/fusim-0.2.2-bin.zip \

#FROM ghcr.io/astral-sh/uv:0.2.12 AS

# Use a multi-stage build to first get uv
#FROM ghcr.io/astral-sh/uv:0.2.12 AS uv
# Choose your python version here
#FROM python:3.10.1-slim-buster
# Create a virtual environment with uv inside the container
#RUN --mount=from=uv,source=/uv,target=./uv \
# ./uv venv /opt/venv
# We need to set this environment variable so that uv knows where
# the virtual environment is to install packages
#ENV VIRTUAL_ENV=/opt/venv
# Make sure that the virtual environment is in the PATH so
# we can use the binaries of packages that we install such as pip
# without needing to activate the virtual environment explicitly
#ENV PATH="/opt/venv/bin:$PATH"



#RUN wget -O fusim-0.2.2.zip https://github.com/aebruno/fusim/raw/master/releases/fusim-0.2.2-bin.zip \
# unzip fusim-0.2.2.zip

#COPY

#COPY requirements.txt .
#COPY sequential_run.sh .
#COPY envionment.config .
#COPY generate_annotation_files.sh .
#COPY generate_breakpoints.sh .
#COPY generate_fusim.sh .
#COPY create_fusim_ref.R .
#RUN apt-get update && \
# apt-get install -y openjdk-11-jre-headless wget && \
# apt-get clean;

# The /app directory should act as the main application directory
#WORKDIR /usr/src
#RUN R -e "install.packages('BiocManager',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
# R -e "BiocManager::install(c('GenomicFeatures', 'Biostrings', 'biomaRt', 'rtracklayer', 'stringr', 'ggplot2', 'patchwork', 'cowplot'),dependencies=TRUE)"

#CMD ["conda", "run", "-n", "env-a", "/bin/bash"]
#ENV PATH="$PATH:/bin:/JAFFA/tools/bin:/JAFFA:/opt/fusim-0.2.2"

#FROM ubuntu:latest
#COPY ./src ./src

#ENTRYPOINT ["bash", "./src/sequential_run.sh"]
21 changes: 21 additions & 0 deletions src/arriba_helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/sh

#SBATCH --job-name arriba_helper
#SBATCH --partition allnodes
#SBATCH --time 12:00:00
#SBATCH --mem=128G

# Run arriba on one aligned short-read sample.
#
# Positional arguments read here:
#   $1, $4, $5  components of the sample name "fusions-$1-$4-$5"
#   $6          run label used in the directory names "shortreads_$6k_*"
# Environment read here:
#   ALIGNMENT_STORAGE_DIR  parent of the "shortreads_$6k_star" input directory
#   ARRIBA_STORAGE_DIR     parent of the "shortreads_$6k_arriba" output directory
#   DNA_REFERENCE          passed to arriba -a
#   GTF_REFERENCE          passed to arriba -g
#   ARRIBA_HOME            optional; arriba install directory
#                          (defaults to /home/vantwisk/arriba_v2.2.1)

ARRIBA_HOME=${ARRIBA_HOME:-/home/vantwisk/arriba_v2.2.1}
MAPPING_DIR=${ALIGNMENT_STORAGE_DIR}/shortreads_${6}k_star
ARRIBA_DIR=${ARRIBA_STORAGE_DIR}/shortreads_${6}k_arriba
SAMPLE=fusions-${1}-${4}-${5}

# -p makes this safe when several jobs for the same run label start at once
# (a separate "test then mkdir" can lose that race) and creates missing parents.
mkdir -p "${ARRIBA_DIR}"

# -k and -t are both given the known-fusions table, as in the original call.
"${ARRIBA_HOME}/arriba" \
    -x "${MAPPING_DIR}/${SAMPLE}-Aligned.out.bam" \
    -o "${ARRIBA_DIR}/${SAMPLE}.tsv" -O "${ARRIBA_DIR}/${SAMPLE}.discarded.tsv" \
    -a "${DNA_REFERENCE}" \
    -g "${GTF_REFERENCE}" \
    -b "${ARRIBA_HOME}/database/blacklist_hg38_GRCh38_v2.2.1.tsv" \
    -k "${ARRIBA_HOME}/database/known_fusions_hg38_GRCh38_v2.2.1.tsv" \
    -t "${ARRIBA_HOME}/database/known_fusions_hg38_GRCh38_v2.2.1.tsv" \
    -p "${ARRIBA_HOME}/database/protein_domains_hg38_GRCh38_v2.2.1.gff3"
22 changes: 22 additions & 0 deletions src/art_helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/sh

#SBATCH -N 1
#SBATCH --mem=32g
#SBATCH -n 1
#SBATCH -t 02:00:00

# Simulate paired-end short reads from the fusion transcriptome with art_illumina.
#
# Positional arguments read here:
#   $1  passed to art_illumina -f; also part of the output name
#   $4  passed to art_illumina -l; also part of the output name
#   $5  part of the output name
#   $6  run label used in the directory name "shortreads_$6k"
# Environment read here:
#   SIM_STORAGE_DIR       parent of the output directory
#   FUSION_TRANSCRIPTOME  FASTA passed to art_illumina -i

OUTDIR=${SIM_STORAGE_DIR}/shortreads_${6}k
# -p: safe when parallel jobs create the same directory; creates parents too.
mkdir -p "${OUTDIR}"

echo "$1"
echo "$2"
echo "$3"
echo "$4"

# $RANDOM is a bash extension. Under a strictly POSIX /bin/sh (e.g. dash) it
# expands to nothing, which would make --rndSeed consume the next option as its
# value. Fall back to two bytes from /dev/urandom in that case.
SEED=${RANDOM:-$(od -An -N2 -tu2 /dev/urandom | tr -dc '0-9')}

#for j in $(seq 1 10); do
/home/vantwisk/art_bin_MountRainier/art_illumina --rndSeed "${SEED}" -ss HS25 \
    -i "${FUSION_TRANSCRIPTOME}" \
    -o "${OUTDIR}/fusions-${1}-${4}-${5}-" \
    -l "${4}" -f "${1}" -p -m 500 -s 10
#done

#cat ${OUTDIR}/fusions-${1}-${4}-${5}-1.fq | gzip > ${OUTDIR}/fusions-${1}-${4}-${5}-1.fq.gz
#cat ${OUTDIR}/fusions-${1}-${4}-${5}-2.fq | gzip > ${OUTDIR}/fusions-${1}-${4}-${5}-2.fq.gz
29 changes: 29 additions & 0 deletions src/badread_helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/sh

#SBATCH -N 1
#SBATCH --mem=128G
#SBATCH -n 1
#SBATCH --time 24:00:00

# Simulate long reads from the fusion transcriptome with rustyread, rename the
# reads to "@transcript/<line number>", and gzip the result.
#
# Positional arguments read here:
#   $1  passed as "--quantity $1x"; also part of the output name
#   $4  part of the output name
#   $5  passed to --identity; also part of the output name
#   $6  model file name under ../models/qscore_models and ../models/error_models
#   $8  run label used in the directory name "longreads_$8k"
# Environment read here:
#   SIM_STORAGE_DIR       parent of the output directory
#   FUSION_TRANSCRIPTOME  FASTA passed to --reference
#
# NOTE(review): the model paths are relative, so this must be started from a
# directory whose parent contains models/ — confirm against the caller.
# NOTE(review): rustyread is asked for 32 threads while the job requests -n 1.

OUTDIR=${SIM_STORAGE_DIR}/longreads_${8}k

# -p: safe when parallel jobs create the same directory; creates parents too.
mkdir -p "${OUTDIR}"
echo "$1"
echo "$2"
echo "$3"
echo "$4"
echo "$5"
echo "$6"
echo "$7"
echo "$8"

# $RANDOM is a bash extension; under a strictly POSIX /bin/sh it is empty and
# --seed would be left without a value. Fall back to /dev/urandom in that case.
SEED=${RANDOM:-$(od -An -N2 -tu2 /dev/urandom | tr -dc '0-9')}
READS=${OUTDIR}/fusions-${1}-${5}-${6}-${4}.fq

# Stop here if the simulation fails, so an empty or truncated file is not
# renamed and compressed as if it were a valid result.
rustyread --threads 32 simulate --reference "${FUSION_TRANSCRIPTOME}" \
    --quantity "${1}x" \
    --qscore_model "../models/qscore_models/${6}" --glitches 0,0,0 --junk_reads 0 --random_reads 0 \
    --error_model "../models/error_models/${6}" --identity "${5}" \
    --chimera 0 --seed "${SEED}" > "${READS}" || exit 1

# Replace the first field of every FASTQ header line (lines 1, 5, 9, ...) with
# "@transcript/<line number>". awk cannot rewrite its own input in place, so
# write to a sibling file and move it over the original.
awk '{ if (NR%4==1) gsub(".*","@transcript/"NR,$1); print }' "${READS}" > "${READS%.fq}_2.fq"
mv "${READS%.fq}_2.fq" "${READS}"
# -f: on a re-run, replace the previous .fq.gz instead of refusing to overwrite.
gzip -f "${READS}"
32 changes: 32 additions & 0 deletions src/fusionseeker_helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --job-name fusionseeker
#SBATCH --partition allnodes
#SBATCH --time UNLIMITED
#SBATCH --cpus-per-task 1
#SBATCH --mem-per-cpu 10g

# Run FusionSeeker on one sorted long-read alignment.
#
# Positional arguments read here (as passed by generate_fusionseeker.sh):
#   $1  coverage          (part of the sample name)
#   $4  replicate number  (part of the sample name)
#   $5  quality setting   (part of the sample name)
#   $6  technology name   (part of the sample name; selects --datatype)
#   $9  run label used in the directory names "longreads_$9k_*"
# Environment read here:
#   ALIGNMENT_STORAGE_DIR     parent of the "longreads_$9k_minimap2_ens" input directory
#   FUSIONSEEKER_STORAGE_DIR  parent of the output directory
#   THREADS                   value for --thread (defaults to 1 when unset)

DATADIR_MINIMAP=${ALIGNMENT_STORAGE_DIR}/longreads_${9}k_minimap2_ens
FUSIONSEEKER_DIR=${FUSIONSEEKER_STORAGE_DIR}/longreads_${9}k_fusionseeker
SAMPLE=fusions-${1}-${5}-${6}-${4}

# -p: safe when parallel jobs create the same directory; creates parents too.
mkdir -p "${FUSIONSEEKER_DIR}"

# The technology name is the sixth argument. The fifth is the quality setting,
# which never contains "pacbio", so testing it left every run on "nanopore".
DATATYPE=nanopore
if [[ ${6} == *"pacbio"* ]]; then
    DATATYPE=isoseq
fi

# "|| true" keeps a FusionSeeker failure from failing the job (kept from the original).
/home/vantwisk/FusionSeeker/fusionseeker \
    --thread "${THREADS:-1}" \
    --bam "${DATADIR_MINIMAP}/${SAMPLE}-sorted.bam" \
    --datatype "${DATATYPE}" \
    --human38 \
    --outpath "${FUSIONSEEKER_DIR}/${SAMPLE}-fusionseeker" \
    -s 2 || true

#fusionseeker \
# --thread 16 \
# --bam ${DATADIR_MINIMAP}/fusions-${1}-${5}-${6}-${4}-sorted.bam \
# --gtf ${GTF_REFERENCE} \
# --ref ${DNA_REFERENCE} \
# -o ${FUSIONSEEKER_DIR}/fusions-${1}-${5}-${6}-${4}-fusionseeker \
# -s 2
36 changes: 36 additions & 0 deletions src/generate_annotation_resources.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

# Download the reference and annotation files the pipeline needs, skipping any
# that are already present, then build the limited transcript file.
#
# Environment read here:
#   DNA_REFERENCE_GEN, DNA_REFERENCE_ENS, CDNA_REFERENCE, GTF_REFERENCE,
#   GENOMIC_SUPER_DUPS       destination paths of the decompressed downloads
#   CTAT_LIB_DIR             CTAT resource library directory
#   TRANSCRIPT_LIMITED_FILE  output of generate_breakpoints.R
#   TRANSCRIPT_LIMIT         fourth argument to generate_breakpoints.R
#                            (the earlier spelling TRANSCRIPT_LIMIIT is still accepted)

# fetch_gz URL DEST
# Download URL to DEST.gz and decompress it to DEST. Does nothing when DEST
# already exists. gunzip only runs when the download succeeded.
fetch_gz() {
    if [ ! -f "$2" ]; then
        wget -O "$2.gz" "$1" && gunzip -d "$2.gz"
    fi
}

fetch_gz https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz "${DNA_REFERENCE_GEN}"

fetch_gz http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz "${DNA_REFERENCE_ENS}"

# Ensembl alternative: http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
fetch_gz https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.transcripts.fa.gz "${CDNA_REFERENCE}"

# Ensembl alternative: http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
fetch_gz https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz "${GTF_REFERENCE}"

fetch_gz ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/genomicSuperDups.txt.gz "${GENOMIC_SUPER_DUPS}"

# NOTE(review): the archive is unpacked into the current directory, so the -d
# test only succeeds on later runs if CTAT_LIB_DIR names the directory the
# tarball creates — confirm against the configuration.
if [ ! -d "${CTAT_LIB_DIR}" ]; then
    wget -O "${CTAT_LIB_DIR}.tar.gz" https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh38_gencode_v22_CTAT_lib_Mar012021.plug-n-play.tar.gz &&
        tar xvzf "${CTAT_LIB_DIR}.tar.gz"
fi

if [ ! -f "${TRANSCRIPT_LIMITED_FILE}" ]; then
    # Prefer the correctly spelled variable and fall back to the misspelled one
    # this script used to read, so existing configurations keep working. Left
    # unquoted so that an unset limit passes no fourth argument, as before.
    Rscript generate_breakpoints.R "${GTF_REFERENCE}" "${CDNA_REFERENCE}" "${TRANSCRIPT_LIMITED_FILE}" ${TRANSCRIPT_LIMIT:-${TRANSCRIPT_LIMIIT}}
fi
22 changes: 22 additions & 0 deletions src/generate_arriba.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Submit one arriba_helper.sh job per (replicate, coverage, read length,
# transcript set) combination. The parameter lines below keep their trailing
# marker comments exactly as they were.
N_TRANSCRIPTS=("canoncical") #REPLACE_N_TRANSCRIPTS
REPLICATES=10 #REPLACE_REPLICATES
COVERAGE=(3 10 30 50 100) #REPLACE_COVERAGE
QUALITY=("95,100,4") #REPLACE_QUALITY
TECH=("pacbio2021" "nanopore2023") #REPLACE_TECH
READ_LENGTHS=(150) #REPLACE_READ_LENGTHS

MIN_OVERLAP_LEN=100 # REPLACE_MIN_OVERLAP_LEN
BIN_SIZE=50 # REPLACE_BIN_SIZE
MIN_MAP_LENGTH=100 #REPLACE_MIN_MAP_LENGTH

GENION_MIN_SUPPORT=2 #REPLACE_GENION_MIN_SUPPORT

# Helper arguments, in order: coverage, 1, 1, read length, replicate, transcript set.
for replicate in $(seq 1 ${REPLICATES}); do
    for coverage in "${COVERAGE[@]}"; do
        for read_length in "${READ_LENGTHS[@]}"; do
            for transcript_set in "${N_TRANSCRIPTS[@]}"; do
                sbatch arriba_helper.sh ${coverage} 1 1 ${read_length} ${replicate} ${transcript_set}
            done
        done
    done
done
23 changes: 23 additions & 0 deletions src/generate_fusim.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

# Build the simulated fusion transcriptome with FUSIM.
#
# Steps: fetch the refFlat gene model and the hg38 per-chromosome FASTA files
# if missing, run fusim.jar, append the simulated fusions to the limited
# transcript file, and create the FUSIM reference table with create_fusim_ref.R.
#
# Environment read here:
#   REF_STORAGE_DIR          directory holding refFlat.txt and hg38_chroma.fa
#   NFUSIONS                 passed to fusim --fusions
#   FUSIM_FASTA_FILE         fusim --fasta-output
#   FUSIM_TXT_FILE           fusim --text-output
#   TRANSCRIPT_LIMITED_FILE  background transcripts (see generate_annotation_resources.sh)
#   FUSION_TRANSCRIPTOME     combined output FASTA
#   FUSIM_REF_FILE           output of create_fusim_ref.R

if [ ! -f "${REF_STORAGE_DIR}/refFlat.txt" ]; then
    # The download is gunzipped below, so request the .gz file from UCSC.
    wget -O "${REF_STORAGE_DIR}/refFlat.txt.gz" http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refFlat.txt.gz &&
        gunzip "${REF_STORAGE_DIR}/refFlat.txt.gz"
fi

if [ ! -f "${REF_STORAGE_DIR}/hg38_chroma.fa" ]; then
    # Unpack inside REF_STORAGE_DIR instead of the current directory, so the
    # concatenation reads the files that were just extracted and the result
    # lands where the -f test above and samtools faidx expect it.
    CHROM_DIR=${REF_STORAGE_DIR}/hg38_chromFa
    mkdir -p "${CHROM_DIR}"
    # find copes with the archive unpacking flat or into a sub-directory
    # (presumably chroms/ — not verified here). The combined file is moved into
    # place only after every step succeeded, so a failure is retried next run.
    wget -O "${REF_STORAGE_DIR}/hg38.chromFa.tar.gz" ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz &&
        tar -xzf "${REF_STORAGE_DIR}/hg38.chromFa.tar.gz" -C "${CHROM_DIR}" &&
        find "${CHROM_DIR}" -type f -name 'chr*.fa' | sort | xargs cat > "${REF_STORAGE_DIR}/hg38_chroma.fa.tmp" &&
        mv "${REF_STORAGE_DIR}/hg38_chroma.fa.tmp" "${REF_STORAGE_DIR}/hg38_chroma.fa" &&
        samtools faidx "${REF_STORAGE_DIR}/hg38_chroma.fa"
fi

java -jar /home/vantwisk/fusim-0.2.2/fusim.jar \
    --gene-model="${REF_STORAGE_DIR}/refFlat.txt" \
    --fusions="${NFUSIONS}" \
    --reference="${REF_STORAGE_DIR}/hg38_chroma.fa" \
    --fasta-output="${FUSIM_FASTA_FILE}" \
    --text-output="${FUSIM_TXT_FILE}"

# Uses TRANSCRIPT_LIMITED_FILE, the name generate_annotation_resources.sh uses.
# The previous misspelling expanded to nothing, so the background transcripts
# were silently left out of the combined FASTA.
cat "${TRANSCRIPT_LIMITED_FILE}" "${FUSIM_FASTA_FILE}" > "${FUSION_TRANSCRIPTOME}"

Rscript create_fusim_ref.R "${FUSIM_TXT_FILE}" "${FUSIM_REF_FILE}"
24 changes: 24 additions & 0 deletions src/generate_fusionseeker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Launch fusionseeker_helper.sh once per (replicate, coverage, quality,
# technology, transcript set) combination, through the launcher held in
# TF_BASH. The parameter lines below keep their trailing marker comments
# exactly as they were.
N_TRANSCRIPTS=("canoncical") #REPLACE_N_TRANSCRIPTS
REPLICATES=10 #REPLACE_REPLICATES
COVERAGE=(3 10 30 50 100) #REPLACE_COVERAGE
QUALITY=("95,100,4") #REPLACE_QUALITY
TECH=("pacbio2021" "nanopore2023") #REPLACE_TECH
READ_LENGTHS=(150) #REPLACE_READ_LENGTHS

MIN_OVERLAP_LEN=100 # REPLACE_MIN_OVERLAP_LEN
BIN_SIZE=50 # REPLACE_BIN_SIZE
MIN_MAP_LENGTH=100 #REPLACE_MIN_MAP_LENGTH

GENION_MIN_SUPPORT=2 #REPLACE_GENION_MIN_SUPPORT

# Helper arguments, in order:
#   coverage, 1, 1, replicate, quality, technology, ax, sam, transcript set
for replicate in $(seq 1 ${REPLICATES}); do
    for coverage in "${COVERAGE[@]}"; do
        for quality in "${QUALITY[@]}"; do
            for tech in "${TECH[@]}"; do
                for transcript_set in "${N_TRANSCRIPTS[@]}"; do
                    eval ${TF_BASH} fusionseeker_helper.sh ${coverage} 1 1 ${replicate} ${quality} ${tech} ax sam ${transcript_set}
                done
            done
        done
    done
done
24 changes: 24 additions & 0 deletions src/generate_genion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Launch genion_helper.sh once per (replicate, coverage, quality, technology,
# transcript set) combination, through the launcher held in TF_BASH. The
# parameter lines below keep their trailing marker comments exactly as they were.
N_TRANSCRIPTS=("canoncical") #REPLACE_N_TRANSCRIPTS
REPLICATES=10 #REPLACE_REPLICATES
COVERAGE=(3 10 30 50 100) #REPLACE_COVERAGE
QUALITY=("95,100,4") #REPLACE_QUALITY
TECH=("pacbio2021" "nanopore2023") #REPLACE_TECH
READ_LENGTHS=(150) #REPLACE_READ_LENGTHS

MIN_OVERLAP_LEN=100 # REPLACE_MIN_OVERLAP_LEN
BIN_SIZE=50 # REPLACE_BIN_SIZE
MIN_MAP_LENGTH=100 #REPLACE_MIN_MAP_LENGTH

GENION_MIN_SUPPORT=2 #REPLACE_GENION_MIN_SUPPORT

# Helper arguments, in order:
#   coverage, 1, 1, replicate, quality, technology, ax, sam,
#   minimum support, transcript set
for replicate in $(seq 1 ${REPLICATES}); do
    for coverage in "${COVERAGE[@]}"; do
        for quality in "${QUALITY[@]}"; do
            for tech in "${TECH[@]}"; do
                for transcript_set in "${N_TRANSCRIPTS[@]}"; do
                    eval ${TF_BASH} genion_helper.sh ${coverage} 1 1 ${replicate} ${quality} ${tech} ax sam ${GENION_MIN_SUPPORT} ${transcript_set}
                done
            done
        done
    done
done
36 changes: 36 additions & 0 deletions src/generate_graphs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Parameter grid for the graphing step. The five arrays and REPLICATES are
# passed to combined_graphs.R further down; the remaining scalars are not
# referenced anywhere else in this script.
# NOTE(review): the trailing marker comments look like placeholders for an
# external substitution step — confirm before reformatting these lines.
N_TRANSCRIPTS=("canoncical") #REPLACE_N_TRANSCRIPTS
REPLICATES=10 #REPLACE_REPLICATES
COVERAGE=(3 10 30 50 100) #REPLACE_COVERAGE
QUALITY=("95,100,4") #REPLACE_QUALITY
TECH=("pacbio2021" "nanopore2023") #REPLACE_TECH
READ_LENGTHS=(150) #REPLACE_READ_LENGTHS
MIN_OVERLAP_LEN=100 # REPLACE_MIN_OVERLAP_LEN
BIN_SIZE=50 # REPLACE_BIN_SIZE
MIN_MAP_LENGTH=100 #REPLACE_MIN_MAP_LENGTH
GENION_MIN_SUPPORT=2 #REPLACE_GENION_MIN_SUPPORT

# print_array [ELEMENT...]
# Print the arguments joined by commas, followed by one blank line (the same
# framing as before). A single element is printed without a trailing comma and
# no arguments print an empty line; previously these produced "x," and ",".
# IFS is changed only inside the function, and no global variables are set.
print_array() {
    local IFS=','
    # "$*" joins the positional parameters with the first character of IFS.
    printf '%s\n\n' "$*"
}

#print_array $COVERAGE

# Collapse each parameter array into a single colon-separated string. Spaces
# inside an element also become colons. IFS is changed only inside the command
# substitution's subshell, so it needs no restoring. Assigning to the bare
# array name overwrites element 0, which is what the unsubscripted expansions
# in the Rscript call below read.
N_TRANSCRIPTS=$(IFS=':'; echo "${N_TRANSCRIPTS[*]// /:}")
COVERAGE=$(IFS=':'; echo "${COVERAGE[*]// /:}")
QUALITY=$(IFS=':'; echo "${QUALITY[*]// /:}")
TECH=$(IFS=':'; echo "${TECH[*]// /:}")
READ_LENGTHS=$(IFS=':'; echo "${READ_LENGTHS[*]// /:}")

#echo $N_TRANSCRIPTS
#echo $COVERAGE
#echo $QUALITY
#echo $TECH
#echo $READ_LENGTHS

# Argument order: FUSIM reference file, one storage directory per fusion
# caller, the graphs output directory, then the parameter grid.
Rscript combined_graphs.R \
    ${FUSIM_REF_FILE} \
    ${GENION_STORAGE_DIR} ${JAFFAL_STORAGE_DIR} ${LONGGF_STORAGE_DIR} \
    ${FUSIONSEEKER_STORAGE_DIR} ${ARRIBA_STORAGE_DIR} ${STARFUSION_STORAGE_DIR} \
    ${PBFUSION_STORAGE_DIR} \
    ${GRAPHS_STORAGE_DIR} \
    ${N_TRANSCRIPTS} ${COVERAGE} ${QUALITY} ${TECH} ${REPLICATES} ${READ_LENGTHS}
Loading

0 comments on commit 6420f81

Please sign in to comment.