From ebccfe531fe2e2b1d6ba3e3500c1a1b6cca67724 Mon Sep 17 00:00:00 2001 From: asyakhl Date: Thu, 16 Nov 2023 14:04:38 -0500 Subject: [PATCH] adding chip motif analysis tool, MEME rule --- config/genome.json | 20 ++++++++++++++++---- workflow/Snakefile | 6 +++++- workflow/rules/peakcall.smk | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/config/genome.json b/config/genome.json index df021ff..18fae5f 100644 --- a/config/genome.json +++ b/config/genome.json @@ -13,7 +13,10 @@ "GENOMECHR": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/Chromsomes/", "GTFFILE": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/genes.gtf", "REFLEN": "/data/OpenOmics/references/cfChIP-seek/hg19_basic/indexes/hg19.fa.sizes", - "FRAC": "0.8970265870106838" + "FRAC": "0.8970265870106838", + "MEME_VERTEBRATES_DB": "/fdb/meme/motif_databases/JASPAR/JASPAR2018_CORE_vertebrates_non-redundant.meme", + "MEME_EUKARYOTE_DB": "/fdb/meme/motif_databases/EUKARYOTE/jolma2013.meme", + "MEME_GENOME_DB": "/fdb/meme/motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme" }, "hg38": { "ALIAS": "hg38", @@ -28,7 +31,10 @@ "GENOMECHR": "/data/OpenOmics/references/chrom-seek/hg38_basic/Chromsomes/", "GTFFILE": "/data/OpenOmics/references/chrom-seek/hg38_basic/genes.gtf", "REFLEN": "/data/OpenOmics/references/chrom-seek/hg38_basic/indexes/hg38.fa.sizes", - "FRAC": "0.9084767300375779" + "FRAC": "0.9084767300375779", + "MEME_VERTEBRATES_DB": "/fdb/meme/motif_databases/JASPAR/JASPAR2018_CORE_vertebrates_non-redundant.meme", + "MEME_EUKARYOTE_DB": "/fdb/meme/motif_databases/EUKARYOTE/jolma2013.meme", + "MEME_GENOME_DB": "/fdb/meme/motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme" }, "mm10": { "ALIAS": "mm10", @@ -43,7 +49,10 @@ "GENOMECHR": "/data/OpenOmics/references/chrom-seek/mm10_basic/Chromsomes/", "GTFFILE": "/data/OpenOmics/references/chrom-seek/mm10_basic/genes.gtf", "REFLEN": "/data/OpenOmics/references/chrom-seek/mm10_basic/indexes/mm10.fa.sizes", - "FRAC": "0.9053190260640642" + "FRAC": "0.9053190260640642", + "MEME_VERTEBRATES_DB": "/fdb/meme/motif_databases/JASPAR/JASPAR2018_CORE_vertebrates_non-redundant.meme", + "MEME_EUKARYOTE_DB": "/fdb/meme/motif_databases/EUKARYOTE/jolma2013.meme", + "MEME_GENOME_DB": "/fdb/meme/motif_databases/MOUSE/HOCOMOCOv11_full_MOUSE_mono_meme_format.meme" }, "rheMac10": { "ALIAS": "rheMac10", @@ -58,7 +67,10 @@ "GENOMECHR": "", "GTFFILE": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.110.chr.gtf", "REFLEN": "/data/OpenOmics/references/chrom-seek/rheMac10/Macaca_mulatta.Mmul_10.dna.chr.sizes", - "FRAC": "0.9643910116868353" + "FRAC": "0.9643910116868353", + "MEME_VERTEBRATES_DB": "/fdb/meme/motif_databases/JASPAR/JASPAR2018_CORE_vertebrates_non-redundant.meme", + "MEME_EUKARYOTE_DB": "/fdb/meme/motif_databases/EUKARYOTE/jolma2013.meme", + "MEME_GENOME_DB": "" } } } diff --git a/workflow/Snakefile b/workflow/Snakefile index 4808ada..4a8966c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -206,7 +206,9 @@ if assay == "cfchip": provided(expand(join(workpath,uropa_dir,diffbind_dir,'{name}_{PeakTool}_uropa_{type}_allhits.txt'), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],name=contrasts,type=["protTSS"]), reps == "yes"), provided(expand(join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), - PeakTool=PeakTools),reps == "yes" and contrast) + PeakTool=PeakTools),reps == "yes" and contrast), + expand(join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips), + expand(join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips) elif assay in ["atac", "chip"]: rule all: @@ -239,6 +241,8 @@ elif assay in ["atac", "chip"]: provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"), name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"), provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"), name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"), provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"),name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"), + expand(join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips), + expand(join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips) ############################# diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index ba3d340..94e1166 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -286,4 +286,40 @@ rule SICER: mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} fi fi + """ + +rule MEME: + input: + bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) + output: + meme_out = join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"), + ame_out = join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html") + params: + rname='MEME', + ref_fa=config['references'][genome]['GENOME'], + meme_vertebrates_db=config['references'][genome]['MEME_VERTEBRATES_DB'], + meme_euk_db=config['references'][genome]['MEME_EUKARYOTE_DB'], + meme_genome_db=config['references'][genome]['MEME_GENOME_DB'], + oc=join(workpath, "MEME", "{PeakTool}", "{name}"), + tmpdir=tmpdir, + outfa="{name}.fa", + ntasks=int(28) + shell: """ + module load meme + module load bedtools + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa} + meme-chip \\ + --oc {params.oc}_meme \\ + -db {params.meme_vertebrates_db} \\ + -meme-searchsize 34000000 \\ + -meme-p {params.ntasks} \\ + ${{tmp}}/{params.outfa} + + ame \\ + --oc {params.oc}_ame ${{tmp}}/{params.outfa} \\ + {params.meme_euk_db} {params.meme_vertebrates_db} {params.meme_genome_db} """ \ No newline at end of file