diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..076d145 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 gprezza + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..381d90e --- /dev/null +++ b/README.md @@ -0,0 +1,114 @@ +# CRISPRi_designer +Tools to design gRNA/array pools targeting a list of genes. + +## Dependencies +Create a conda environment from the environment.yml file: +``` +conda env create -f environment.yml +``` +## get_sequences_from_gff +The first step in the pipeline is to obtain a fasta file with the sequences of the genes that are to be targeted. These sequences must contain the promoter. The file can be created from a gff annotation with the get_sequences_from_gff.py script. + +### Usage +``` +usage: get_sequences_from_gff.py [-h] [--outname OUTNAME] + [--promoter PROMOTER] [--genetype GENETYPE] + [--attribute ATTRIBUTE] [--name NAME] + gff_file fasta_file + +Extract gene sequences from gff and genome files. + +positional arguments: + gff_file Input annotation (gff) file with sequences of the + target genes. + fasta_file Input genome file. + +optional arguments: + -h, --help show this help message and exit + --outname OUTNAME, -o OUTNAME + Base name of the output files. The .fasta extension + will be appended to this. (Default: target_genes. + --promoter PROMOTER, -p PROMOTER + Length before the gene start to be considered as the + promoter. (Default: 50) + --genetype GENETYPE, -t GENETYPE + Consider only genes of this type (Third column of the + gff file). + --attribute ATTRIBUTE, -a ATTRIBUTE + Consider only genes that have this attribute (ninth + column of the gff file).E.g. if you want to filter for + the attribute "sRNA_type=Intergenic" + writesRNA_type=Intergenic. + --name NAME, -n NAME Take gene name from this attribute of the gff + attribute column (ninth column of the gff file) +``` +### Input files +* A gff annotation (*gff_file*) in the standard format (tab-separated, no header, columns: seqname,feature,start,end,score,strand,frame,attribute). +* A (multi)fasta file (*fasta_file*) containing the genome file (chromosome(s) and plasmid(s), if present). + +### Output +* A _basename_coordinates.txt_ file containing the coordinates of each gene (promoter included). +* A _basename_sequences.fasta_ file containing the genes (promoter included). +_basename_ is defined by the --outname option. + +## design_CRISPRi_gRNAs.py +This script designs the gRNA/array library. + +### Usage + +``` +usage: design_CRISPRi_gRNAs.py [-h] [--processors PROCESSORS] + [--in_name IN_NAME] [--coordinates COORDINATES] + [--folding FOLDING] [--left LEFT] + [--right RIGHT] [--arrays] [--non_targeting] + [--PAM PAM] [--spacer_length SPACER_LENGTH] + [--promoter_length PROMOTER_LENGTH] + [--mutants MUTANTS] --reference REFERENCE + +Designs spacers targeting the input genes. + +optional arguments: + -h, --help show this help message and exit + --processors PROCESSORS, -p PROCESSORS + Number of processors (Default: 1). + --in_name IN_NAME, -i IN_NAME + Input base name of the fasta file with sequences of + the target genes.(Default: target_genes.) + --coordinates COORDINATES, -c COORDINATES + File with coordinates of the target genes. (Default: " + --in_name + _coordinates.txt".) + --folding FOLDING, -f FOLDING + Minimal accepted correct folding probability of the + arrays. Scale: 0-1. (Default: 0.2). + --left LEFT, -lo LEFT + Left overhang for cloning mature spacers. (Default: + atctttgcagtaatttctactgttgtagat). + --right RIGHT, -ro RIGHT + Right overhang for cloning mature spacers. (Default: + ccggcttatcggtcagtttcacctgattta). + --arrays, -a The script attempts to design one array per target + gene if this flag is present. + --non_targeting, -nt The script designs nontargeting gRNAs if this flag is + present. + --PAM PAM, -pam PAM PAM sequence. (Default: TTV). + --spacer_length SPACER_LENGTH, -sl SPACER_LENGTH + Length of each spacer. No more than 26 if the -a flag + is used. (Default: 20). + --promoter_length PROMOTER_LENGTH, -pl PROMOTER_LENGTH + Length of the promoter region. (Default: 50). + --mutants MUTANTS, -m MUTANTS + Number of spacers (including one array if using the -a + flag) to be designed per targeted gene. (Default: 4). + --reference REFERENCE, -r REFERENCE + Folder containing the genome sequence(s). +``` +### Input +* The two files generated by _get_sequences_from_gff_, which are indicated by *in_name* (and *--coordinates*, if the names are different). + +### Output +The following files are generated by the script: +* _constructs_mature.txt_: fasta file with all gRNAs + homology arms. Non-targeting gRNAs are also included if the _-nt_ option is used. +* _mature_oligo_pool2.txt_: text file with all gRNAs + homology arms. Like the previous file, but without sequence ID. +* _nontargeting_oligo_pool2.txt_: only if the _-nt_ option is used. Text file with the non-targeting gRNAs + homology arms. +* _constructs_array.txt_: only if the _-a_ option is used. Fasta file with all arrays. +* _array_oligo_pool_n.txt_: only if the _-a_ option is used. n Text files with the sequences of the oligos needed to construct the array via CRATES. One file is generated per each pool to be ordered. \ No newline at end of file diff --git a/design_CRISPRi_gRNAs.py b/design_CRISPRi_gRNAs.py new file mode 100644 index 0000000..3bcf2ea --- /dev/null +++ b/design_CRISPRi_gRNAs.py @@ -0,0 +1,848 @@ +from Bio import Seq +from Bio import SeqIO +import csv +import itertools +import subprocess +import math +from multiprocessing import Pool +import tempfile +import RNA +import argparse +import os +import re +from datetime import datetime +import sys +import random + +startTime = datetime.now() + +parser = argparse.ArgumentParser(description='Designs spacers ' + 'targeting the input genes.') + +parser.add_argument('--processors', '-p', default=1, type=int, + help='Number of processors (Default: 1).') + +parser.add_argument('--in_name', '-i', type=str, default="target_genes", + help='Input base name of the fasta file with sequences of the target genes.' + '(Default: target_genes.)') + +parser.add_argument('--coordinates', '-c', type=str, + help='File with coordinates of the target genes. (Default: "--in_name + _coordinates.txt".)') + +parser.add_argument('--folding', '-f', default=0.2, type=float, + help='Minimal accepted correct folding probability of the arrays. ' + 'Scale: 0-1. (Default: 0.2).') + +parser.add_argument('--left', '-lo', default="atctttgcagtaatttctactgttgtagat", type=str, + help='Left overhang for cloning mature spacers. (Default: atctttgcagtaatttctactgttgtagat).') + +parser.add_argument('--right', '-ro', default="ccggcttatcggtcagtttcacctgattta", type=str, + help='Right overhang for cloning mature spacers. (Default: ccggcttatcggtcagtttcacctgattta).') + +parser.add_argument('--arrays', '-a', help='The script attempts to design one array per target gene if this flag is present.', + action='store_true') + +parser.add_argument('--non_targeting', '-nt', help='The script designs nontargeting gRNAs if this flag is present.', + action='store_true') + +parser.add_argument('--PAM', '-pam', default="TTV", type=str, + help='PAM sequence. (Default: TTV).') + +parser.add_argument('--spacer_length', '-sl', default="20", type=int, + help='Length of each spacer. No more than 26 if the -a flag is used. (Default: 20).') + +parser.add_argument('--promoter_length', '-pl', default="50", type=int, + help='Length of the promoter region. (Default: 50).') + +parser.add_argument('--mutants', '-m', default="4", type=int, + help='Number of spacers (including one array if using the -a flag) to be designed per targeted gene. (Default: 4).') + +parser.add_argument('--reference', '-r', type=str, required=True, + help='Folder containing the genome sequence(s).') + + +args = parser.parse_args() +processors = args.processors +inname = args.in_name +if args.coordinates is None: + coord_name = inname +else: + coord_name = args.coordinates +fold_threshold = args.folding +left_overh = args.left +right_overh = args.right +arrays_flag = args.arrays +non_targeting_flag = args.non_targeting +PAM = args.PAM +spacer_length = args.spacer_length +promoter_length = args.promoter_length +promoter_length = args.promoter_length +output_no = args.mutants +reference = args.reference + +#Parameters: +diff_crates = 26 - spacer_length #factor to extend target gene coordinates, + #since spacers in arrays to be cloned with the + #CRATES method (PMID:31270316) need to be 26nt long + +overlap_window = 10 +#window 3' and 5' of spacers in which spacers that fall are defined as overlapping +#NB: the windows sum up! So, if this value is 10, two spacers have to be +#>10+10=20 nt separated to be non-overlapping + +repeat = "GUCUAAGAACUUUAAAUAAUUUCUACUGUUGUAGAU" #sequence of the Cas12a CRISPR repeat + +stretches = [base*5 for base in ["A","C","G","T"]] #list of base stretches + +#make a unique multifasta file with all sequences present in the folder specified by the -r flag: +genome_fa_file = open("{0}/genome_sequence.fa".format(reference),mode= "w+") +for filename in os.listdir(reference): + if filename != "genome_sequence.fa": + _file = open("{0}/{1}".format(reference,filename),mode= "r+") + for line in _file: + genome_fa_file.write(line) + genome_fa_file.write("\n") +genome_fa_file.seek(0) + +#create genome_seqs dictionary with sequences of chromosome and plasmids: +genome = SeqIO.parse(genome_fa_file, 'fasta') +genome_seqs = {} +for record in genome: + record_id = re.findall('[^ ]+', record.description)[0] + genome_seqs[record_id] = str(record.seq) + +ambiguous_alph = {"N":["A","C","G","T"], "V":["A","C","G"], "H":["A","C","T"], "D":["A","G","T"], + "B":["C","G","T"], "M":["A","C"], "K":["G","T"], "W":["A","T"], "S":["C","G"], "Y":["C","T"], + "R":["A","G"], "A": ["A"], "C": ["C"], "G": ["G"], "T": ["T"]} + +def remove_ambiguous(sequence): + #returns a list of all possible sequences + #given the ambiguous sequence input (e.g. with N,V, etc. bases) + return ["".join(i) for i in itertools.product(*[ ambiguous_alph[j] for j in sequence ]) ] + +def revcomp(sequence): +# returns reverse complement of sequence + complement = {'A':'T', 'C':'G', 'T':'A', 'G':'C', 'N':'N','Y':'R', + 'R':'Y', 'W':'W', 'S':'S', 'K':'M', 'M':'K', 'D':'H', 'H':'D', + 'V':'B', 'B':'V'} + bases = list(sequence) + bases.reverse() + revcomp = '' + for base in bases: + if base.islower(): + base = base.upper() + revcomp += complement[base].lower() + else: + revcomp += complement[base] + return revcomp + +def GC_count(string): +# returns %GC of string + bases = list(string) + G = bases.count("G") + C = bases.count("C") + GC_content = 100*(G+C)/len(string) + return GC_content + +def find_all(string, substr): +# returns a list of the indexes of all occurrencies of substr in string + listindex=[] + start = 0 + i = string.find(substr, start) + while i >= 0: + listindex.append(i) + i = string.find(substr, i + 1) + return listindex + +def top_spacers(search_area, PAM_sequence): +# returns a list of lists [a,b,c,d,e,f] where a and b are the indexes +# of a spacer targeting the non template (top, +, sense... call it as you wish) +# strand of the search_area string; c is top (to indicate the strand), +# d and e are the spacer indexes + overlap_window and f is the sequence of the spacer. + PAM_indexes = find_all(search_area, revcomp(PAM_sequence)) + spacers_top = [] + if len(PAM_indexes) != 0: + for PAM_index in PAM_indexes: + #check that the index and the subsequent spacer fall within the gene limits + if PAM_index >= spacer_length and PAM_index <= gene_length - diff_crates: + start = PAM_index - spacer_length + if start < diff_crates: + continue + end = PAM_index + start_overlap = start - overlap_window + end_overlap = end + overlap_window + if start_overlap < 0: + start_overlap = 0 #don't allow indexes to go beyond the target's extremities + if end_overlap > gene_length: + end_overlap = gene_length #same as comment above + start_array = start - diff_crates + sequence = revcomp(search_area[start_array:end]) + PAMletter = revcomp(gene_sequence[PAM_index+PAM_length]) + spacers_top.append([start, PAMletter, "top", start_overlap, end_overlap,sequence]) + return spacers_top + +def bottom_spacers(search_area, PAM_sequence): +# returns a list of lists [a,b,c,d,e,f] where a and b are the indexes +# of a spacer targeting the template (bottom, -, antisense... call it as you wish) +# strand of the search_area string; c is 1 (to indicate the strand), +# d and e are the spacer indexes + overlap_window and f is the sequence of the spacer. + PAM_indexes = find_all(search_area, PAM_sequence) + spacers_bottom = [] + if len(PAM_indexes) != 0: + for PAM_index in PAM_indexes: + #check that the index and the subsequent spacer fall within the gene limits + if PAM_index <= gene_length - spacer_length - PAM_length and PAM_index >= diff_crates: + end = PAM_index + PAM_length + spacer_length + if end > gene_length - diff_crates: + continue + start = PAM_index + PAM_length + start_overlap = start - overlap_window + end_overlap = end + overlap_window + if start_overlap < 0: + start_overlap = 0 #don't allow indexes to go beyond the target's extremities + if end_overlap > gene_length: + end_overlap = gene_length #same as comment above + end_array = end + diff_crates + sequence = search_area[start:end_array] + PAMletter = search_area[PAM_index-1] + spacers_bottom.append([start, PAMletter, "bot", start_overlap, end_overlap,sequence]) + return spacers_bottom + +def score_arrays(arrays): +# gives a score to each array in arrays as follows: +# first it calculates the score of the array based on the folding probability, +# then it scores each spacer of the array individually, based on GC content, +# presence/absence of same-base stretches and position within the targeted gene. +# The scores of each feature are multiplied to give the score of the whole spacer. +# The score of the entire array is then obtained by multiplying the scores of +# all spacers that compose it and the folding probability. The array's score is +# then appended to the array list as last element. + scored_arrays = [] + for array in arrays: + prob = array[0] + #initialize scores list with the folding probability score of the whole array: + scores = [calc_arrayScore(prob)] + #score spacers individually and append the score to the scores list: + for spacer in array[1]: + all_stretches = 0 #number of stretches + seq = spacer[5] + shorter_seq = seq[:20] + for stretch in stretches: + n = 0 + indexes = find_all(shorter_seq, stretch) + n += len(indexes) + # count all stretches that are part of a larger stretch as 1 + # (i.e. AAAAAA (six A) counts as only 1, instead of 2) + if len(indexes) > 1: + for i in range(len(indexes)-1): + if indexes[i+1] - indexes[i] == 1: + n-= 1 + all_stretches += n + GC = GC_count(shorter_seq) + position = spacer[0] #start index of the spacer + PAMletter = spacer[1] + score = calc_spacerScore(GC,all_stretches,position,PAMletter) #score of the spacer + scores.append(score) + array_score = 1 + #generate the array score by multiplying the scores of all spacers + #that compose it and the folding probability score: + for x in scores: + array_score = array_score * x + array.append(array_score) + scored_arrays.append(array) + return scored_arrays + +def bestspacers(spacers,n,offt): +# picks the n spacers from the "spacers" list that have the highest scores. +# Scoring system similar to the arrays one: scores the spacer ased on +# GC content, position within the targeted gene and presence/absence of +# same-base stretches. The scores of each feature are multiplied to give +# the score of the whole spacer. + if offt:#if we are picking spacers from the ones with offtargets, just pick from + #spacers that have the highest number of mismatches + #sort by decreasing number of mismatches with the off-targeting sites: + #ordered_spacers = [k for k, v in sorted(spacers.items(), key = lambda item : item[1][1])] + #print("ppp",[key for key,value in spacers.items() if value[1] == max(value[1] for value in spacers.values())]) + if len(spacers) <= n: + return [spacer[0] for spacer in spacers.values()] + new_spacers_dic = {key:value for key,value in spacers.items() if value[1] == max(value[1] for value in spacers.values())} + for spacer in new_spacers_dic: + spacers.pop(spacer) + new_spacers = [value[0] for key,value in new_spacers_dic.items()] + best_spacers = bestspacers(new_spacers,n,False) + new_n = n - len(best_spacers) + if new_n != 0 and len(spacers) > 0: + best_spacers.extend(bestspacers(spacers,new_n,True)) + return best_spacers + best_spacers = [] + if len(spacers) <= n: + return spacers + for spacer in spacers: + #score spacer: + all_stretches = 0 + seq = spacer[5][:20] + for x in stretches: + stretches_no = 0 + indexes = find_all(seq, x) + stretches_no += len(indexes) + # count all stretches that are part of a larger stretch as 1 + # (i.e. AAAAAA (six A) counts as only 1, instead of 2) + if len(indexes) > 1: + for i in range(len(indexes)-1): + if indexes[i+1] - indexes[i] == 1: + stretches_no-= 1 + all_stretches += stretches_no + GC = GC_count(seq) + position = spacer[0] + PAMletter = spacer[1] + score = calc_spacerScore(GC,all_stretches,position,PAMletter) #score of the spacer + spacer.append(score) + spacers = sorted(spacers, key = lambda x: x[-1],reverse=True) #sort by decreasing score + best_spacers.append(spacers[0]) #append spacer with the highest score + i = 1 + while len(best_spacers) < n: + best_spacers.append(spacers[i]) #take spacers from second highest score on + i += 1 + return best_spacers + +def calc_spacerScore(GC,stretch,position,PAMletter): +# calculate score of the spacer + if 0 <= GC < 15: + GC_score = 0.1 + elif 15 <= GC < 20: + GC_score = 0.2 + elif 20 <= GC < 30: + GC_score = 5 + elif 30 <= GC < 35: + GC_score = 8 + elif 35 <= GC < 75: + GC_score = 10 + elif 75 <= GC < 80: + GC_score = 8 + elif 80 <= GC < 90: + GC_score = 4 + elif 90 <= GC: + GC_score = 0.1 + if stretch == 0: + stretch_score = 10 + elif stretch == 1: + stretch_score = 5 + elif stretch > 1: + stretch_score = 0.1 + if 0 <= position < promoter_length + overlap_window + diff_crates: + #if inside the promoter (extended by overlap_window) + position_score = 10 + else: + #calculate position as percentage inside the coding sequence + n = promoter_length + overlap_window + diff_crates + position = (position-n)*100/(gene_length-n) + if 0 <= position <= 50: + position_score = 9 + elif 50 < position <= 60: + position_score = 7 + elif 60 < position <= 70: + position_score = 4 + elif 70 < position <= 80: + position_score = 3 + elif 80 < position <= 90: + position_score = 2 + elif 90 < position <= 100: + position_score = 1 + PAM_score = 5 + if PAMletter == "T": + PAM_score = 10 + scores = [GC_score,stretch_score,position_score,PAM_score] + prod = 1 + #calculate the final score, which is the product of all sub-scores: + for x in scores: + prod = prod * x + return prod + +def calc_arrayScore(prob): +# calculate score of the array/spacer + if .2 <= prob < .3: + prob_score = 6 + elif .3 <= prob < .4: + prob_score = 7 + elif .4 <= prob < .5: + prob_score = 8 + elif prob >= .5: + prob_score = 10 + return prob_score + +def make_arrays(spacers): + selected = [] + if len(spacers) > 0: + #pick (up to) n spacers for gRNAs + n=output_no - 1 + selected = bestspacers(spacers,n,False) + #remove these n spacers from the list of available spacers: + for spacer in selected: + spacers.remove(spacer) + # make 1 array (of n spacers) from the "spacers" list and, if not possible, pick single spacers + if len(spacers) > 2: #try to make array + #make list of all possible combinations (= position doesn't matter) of n spacers + combinations = list(itertools.combinations(spacers,n)) + combinations_cp = combinations[:] + #remove combinations that contain overlapping spacers: + for comb in combinations_cp: + comb_sorted = sorted(comb, key = lambda x: x[0]) #sorts spacers by start index + i = 0 + while i < 2: + if comb_sorted[i][4] > comb_sorted[i+1][3]: #if the two spacers overlap + combinations.remove(comb) + i = 2 + i += 1 + if len(combinations) > 0: + pool = Pool(processors) + pool_results = pool.map(folding_prob, combinations) + pool.close() + pool.join() + permutations = [] #list of permutations (= position does matter) of n spacers + for valid_perm in pool_results: + if len(valid_perm) > 0: + permutations.extend(valid_perm) + if len(permutations) > 0: + permutations = score_arrays(permutations) + permutations = sorted(permutations, key = lambda x: x[-1], reverse=True) #sort by decreasing array score + array = list(permutations[0][1]) #permutation with the highest score + selected.append(array) + for spacer in array: + spacers.remove(spacer) + if len(selected) < output_no and len(spacers) > 1: + n = output_no - len(selected) + for spacer in bestspacers(spacers,n,False): + selected.append(spacer) + return selected + +def offtarget(list_of_PAMs,target_strand,spacer_id): +#checks if the offtarget (starting at index; with PAM coordinates start:end) +#is followed by a PAM and if it's outside a targeted gene. +#If so, it's considered as a bona-fide off target +#and the function returns 1, otherwise returns 0. + global multitargeting + global offtargeting + n = 0 + multitarget = False + if any(genome_seqs[scaffold][start:end] == PAM for PAM in list_of_PAMs):#check if PAM is a valid PAM + if "|" in thisgene: #if it's a group of genes having the same sequence + genes = thisgene.split("|") + for gene in genes: + [thisstart,thisend] = positions[scaffold][gene] + if index in range(thisstart,thisend+1):#if this spacer targets one of these genes, + return #it's not an offtarget + else: + [thisstart,thisend] = positions[scaffold][thisgene] + if not index in range(thisstart,thisend+1): #if doesn't target the current gene (= it' either an offtarget + #or a multitarget) + for gene,[i,k] in positions[scaffold].items(): + if gene not in thisgene: #in case that thisgene is a group of genes having the same sequence + if index in range(i,k+1): #checks if index is inside a target gene + multitarget = True + break + #look for the spacer in the spacers dictionary and remove it + for spacer in all_spacers_dic[thisgene]: + if spacer[5][:20] == seq and spacer[2] == target_strand: + all_spacers_dic[thisgene].remove(spacer) #remove the spacer + if row[5] != "": #if there are mismatches between the offtarget site and the spacer + mismatches = 1/(len(row[5].split(","))+1) + else: + mismatches = 1 + curr_spacer = spacer + #append the spacer to the "gene" key of the "offtargeting_spacers" dict. + #inserts the key into the dict if it doesn't exist: + if not multitarget: #is true offtarget + if thisgene not in offtargeting_spacers: + offtargeting_spacers[thisgene] = {} + if spacer_id not in offtargeting_spacers[thisgene]: + offtargeting_spacers[thisgene][thisspacer] = [curr_spacer,0] + offtargeting_spacers[thisgene][thisspacer][1] += mismatches + offtargeting += 1 + else: # targets another gene + if thisgene not in multitargeting_spacers: + multitargeting_spacers[thisgene] = {} + if spacer_id not in multitargeting_spacers[thisgene]: + multitargeting_spacers[thisgene][thisspacer] = [curr_spacer,0] + multitargeting_spacers[thisgene][thisspacer][1] += mismatches + multitargeting += 1 + break + +def folding_prob(combination): + #returns a list of all permutations (= position DOES matter) of 3 spacers + #whose array has a probability of folding correctly >= fold_threshold + valid_perms = [] + permutations = list(itertools.permutations(combination,3)) + for perm in permutations: + spacers_seqs = [x[5] for x in perm] + prob = folding(spacers_seqs) #folding probability of the array + if prob >= fold_threshold: + valid_perms.append([prob,perm,combination]) + return valid_perms + +def folding(spacers): + #calculate the probability of the array folding in the desired structure + array = ""#array sequence + repeat_c = "...................xx(((((....)))))." #desired folding of the array + #with stem-loop at the 3' end + array_c = "" #array dot-bracket constraint + for spacer in spacers: + array = array + repeat + spacer + spacer_c = "."*len(spacer) + array_c = array_c + repeat_c + spacer_c + R=0.00198 #Boltzmann constant + T=310.15 #37 celsius in Kelvin + # create a fold_compound object for the current sequence: + fc = RNA.fold_compound(array) + # compute the MFE and corresponding structure: + (mfe_struct, mfe) = fc.mfe() + # re-scale the Boltzmann factors for partition function computation + fc.exp_params_rescale(mfe) + # compute partition function: + (pp, Fu) = fc.pf() + # re-create a fold_compound object: + fc = RNA.fold_compound(array) + # add hard constraint from dot-bracket notation: + fc.hc_add_from_db(array_c) + # compute the new MFE and corresponding structure with the constraints: + (mfe_struct, mfeNew) = fc.mfe() + # re-scale the Boltzmann factors for partition function computation: + fc.exp_params_rescale(mfeNew) + # compute partition function: + (pp, Fc) = fc.pf() + #calculate probability of folding within the constraints: + array_prob=math.exp((Fu-Fc)/(R*T)) + return array_prob + +def random_string(length): + return ''.join(random.choice(['A','C','G','T']) for i in range(length)) + +#------------- main: + +#create a dictionary of start and end positions of the targeted genes +#of structure scaffold:[[start1,end1],[start2,end2]]: +positions = {} +if args.coordinates is None: + genes_coord = open("{}_coordinates.txt".format(coord_name),mode="r") +else: + genes_coord = open(coord_name,mode="r") +for line in genes_coord: + if not line.rstrip(): + continue + line = line.split("\t") + scaffold = line[0] + gene_id = line[1] + start = int(line[2]) + end = int(line[3]) + if scaffold not in positions: + positions[scaffold] = {} + positions[scaffold][gene_id] = [start,end] + +#create a dictionary of start and end positions of the targeted genes +#of structure scaffold:[[start1,end1],[start2,end2]]: +selected_per_gene = {} #for storing selected spacers/arrays for each PAM +gene_list = SeqIO.parse("{}_sequences.fasta".format(inname), 'fasta') +gene_seqs = {} #dic of target_gene_id:target_gene_sequence +for gene in gene_list: + gene_seqs[gene.id] = str(gene.seq) + selected_per_gene[gene.id] = [] +all_spacers_dic = {} #all possible spacers for each PAM, before any filtering +all_PAMs = [] #list of all PAMs (without IUPAC ambiguity codes) +all_revcomp_PAMs = [] +PAM_list = remove_ambiguous(PAM) +all_PAMs.extend(PAM_list) +revcomp_PAM_list = [revcomp(PAM) for PAM in PAM_list] +all_revcomp_PAMs.extend(revcomp_PAM_list) +PAM_length = len(PAM) +#index until which to extend the promoter, in order to include the spacers +#that target up to spacer_length/2 nt after the TSS: +lo = promoter_length + int(spacer_length/2) + PAM_length + diff_crates +for gene_id, gene_sequence in gene_seqs.items(): + #print(gene_id) + gene_length = len(gene_sequence) + longer_promoter = gene_sequence[:lo] #to include spacers targeting up to spacer_length/2 nt after TSS + spacers_top = [] + spacers_bottom = [] + for PAM_sequence in PAM_list: + #promoter: + spacers_top.extend(top_spacers(longer_promoter, PAM_sequence)) + #rest of gene: + spacers_bottom.extend(bottom_spacers(gene_sequence, PAM_sequence)) + all_spacers = spacers_top + spacers_bottom + if gene_id not in all_spacers_dic: + all_spacers_dic[gene_id] = [] + all_spacers_dic[gene_id] = all_spacers + +previous_gene = "" +bowtie_in = tempfile.NamedTemporaryFile(mode='w+') #input file for bowtie +previous_gene = "" +for gene_id, selectedspacers in all_spacers_dic.items(): + i = 0 + for spacer in selectedspacers: + seq = spacer[5][:spacer_length] #look only for offtargets in the spacer_length part of the spacer + strand = spacer[2] + bowtie_in.write(">") + bowtie_in.write("{0}_{1}_{2}".format(gene_id, i,strand)) #ID of the fasta entry: targeted gene + _i + _targetedstrand + bowtie_in.write("\n") + bowtie_in.write(seq) #spacer sequence + bowtie_in.write("\n") + i += 1 +bowtie_in.seek(0) + +#make bowtie indexes if not present: +if not os.path.exists("bowtie_files"): + os.mkdir("bowtie_files") +if not os.path.isfile('bowtie_files/*.ebwt'): + cmd = ("bowtie-build --threads {} -r -q -f {}/genome_sequence.fa bowtie_files/genome_index".format(processors,reference)) + try: + subprocess.check_output(cmd, shell=True,stderr=subprocess.STDOUT,cwd=None) + except subprocess.CalledProcessError as e: + print("ERROR: Creating the bowtie index with the command: \n{} \nfailed with this message: \n{}".format(e.cmd,e.output.decode('utf-8'))) + sys.exit() + +#run bowtie: +bowtie_out = tempfile.NamedTemporaryFile(mode='w+') +offtargeting = 0 #no. of spacers discarded because of off-targeting +multitargeting = 0 +cmd = ("bowtie --threads {0} -v 2 -a bowtie_files/genome_index --suppress 6,7 " +"-f {1} > {2}".format(processors,bowtie_in.name,bowtie_out.name)) #matches with up to 3 mismatches +try: + subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT,cwd=None) +except subprocess.CalledProcessError as e: + print("ERROR: Bowtie run with the command: \n{} \nfailed with this message: \n{}".format(e.cmd,e.output.decode('utf-8'))) + sys.exit() + +offtargeting_spacers = {} +multitargeting_spacers = {} + +csv_reader = csv.reader(bowtie_out, delimiter='\t') +rowlist = [] +for row in csv_reader: + rowlist.append(row) +rowlist = sorted(rowlist, key = lambda x: x[0]) # sort alphabetically by spacer sequence, so that + # we can stop at the first offtarget of a spacer +for row in rowlist: + #print(row) + thisspacer = row[0] + (thisgene, _, target_strand) = thisspacer.split("_") + strand = row[1] + scaffold = row[2] #chromosome or plasmid ID + index = int(row[3]) #starting index of off-target + if strand == "-": + seq = revcomp(row[4]) + start = index + spacer_length + end = index + spacer_length + PAM_length + offtarget(all_revcomp_PAMs,target_strand,thisspacer) + else: + seq = row[4] + start = index - PAM_length + end = index + offtarget(all_PAMs,target_strand,thisspacer) +bowtie_in.close() +bowtie_out.close() + +total_mature = 0 +for gene_id, spacers in all_spacers_dic.items(): + print("\n", gene_id,"\n") + gene_sequence = gene_seqs[gene_id][diff_crates:-diff_crates] + gene_length = len(gene_sequence) + offt = False + if arrays_flag == True: + selected = make_arrays(spacers) + else: + selected = bestspacers(spacers,output_no,False) + if len(selected) < output_no: + n = output_no - len(selected) + #if there are not enough (== output_no) selected arrays/spacers, + #pick spacers from the ones that have multipletargets + if gene_id in multitargeting_spacers: + selected.extend(bestspacers(multitargeting_spacers[gene_id],n,True)) + if len(selected) < output_no: + n = output_no - len(selected) + #if there are not enough (== output_no) selected arrays/spacers, + #pick spacers from the ones that have off-targets + if gene_id in offtargeting_spacers: + selected.extend(bestspacers(offtargeting_spacers[gene_id],n,True)) + out = [] + for j in selected: + if len(j) == 3: + out.append([x[5] for x in j]) + else: + seq = j[5] + out.append(seq[:spacer_length]) + selected_per_gene[gene_id] = out + total_mature += len(out) + print(out) + +if non_targeting_flag: + #make non-targeting gRNAs: + + print("making nt grnas") + PAM_no = len(all_PAMs) + + generated_guides = 50000 + min_output_size = max(20,int(total_mature/100)) + limit = 10 #base position up to which the mismathces are checked + + b = 1 + nt_spacers = [] + + with tempfile.NamedTemporaryFile(mode='w+') as outf: + while len(nt_spacers) < min_output_size: + strings = [] + for i in range(generated_guides): + string = random_string(spacer_length) + if 30<= GC_count(string) <= 70: + strings.append(string) + with tempfile.NamedTemporaryFile(mode='w+') as tmp_in: + i = 0 + for gRNA in strings: + tmp_in.write(">" + str(i) + '_' + "lol" + "\n") + tmp_in.write(gRNA + "\n") + i += 1 + tmp_in.flush() + #run bowtie: + cmd = ("bowtie --threads {0} -v 1 -3 8 -a bowtie_files/genome_index --suppress 6,7 " + "-f {1} --un {2}".format(processors,tmp_in.name,outf.name)) #matches with up to 3 mismatches + subprocess.call(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + outf.flush() + out_parse = SeqIO.parse(outf, 'fasta') + nontargeting = [[0, ''] for j in range(generated_guides)] + for record in out_parse: + nt_spacers.append(str(record.seq)) + nt_spacers = list(set(nt_spacers)) # removes duplicates + #print("Cycle run ", b, "times... I found", len(nt_spacers), "non-targeting gRNAs:") + b += 1 + #print("Non targeting gRNAs:",nt_spacers[:min_output_size]) + +#make oligos: + +arrays = [] +mature = {} +noarray = 0 +onearray = 0 +for gene, gRNAs in selected_per_gene.items(): + #print(gene) + #print(gRNAs) + n = 0 + for group in gRNAs: + if len(group) == 3: + arrays.append([gene,group]) + n=1 + else: + if gene in mature: + mature[gene].append(group) + else: + mature[gene] = [group] + if n == 0: + noarray += 1 + else: + onearray += 1 + +#finds spacers that target more sRNAs: +rev_dict = {} +for gene,spacers in mature.items(): + for spacer in spacers: + rev_dict.setdefault(spacer, set()).add(gene) + +#group these spacers under the name srna1_srna2: +for spacer,genes in rev_dict.items(): + if len(genes) > 1: + for gene in genes: + mature[gene].remove(spacer) + mature.setdefault("_".join(genes), set()).add(spacer) + +#remove sRNAs that have all spacers in common with another: +mature = {key:val for key, val in mature.items() if val != []} + + +junctions = ["CTAA", "GACA", "GCAC", "AATC", "GTAA", "TGAA", "ATTA", "CCAG", + "AGGA", "ACAA", "TAGA", "CGGA", "CATA", "CAGC", "AAGT", "CTCC", "AGAT", "ACCA", + "AGTG", "GGTA", "GCGA", "AAAA", "ATGA"] +start_junction = "CCTC" +end_junction = "AACG" +repeat_dna = "gtctaagaactttaaataatttctactgttgtagat" +rev_repeat_dna = revcomp(repeat_dna) + +if arrays_flag == True: + oligos = [[] for i in range(int(len(arrays)/len(junctions)+1))] + out_array_constructs = open("constructs_array.txt",mode="w") + print("Arrays:") + + previous_gene = "" + n = 0 + j = 1 + for group in oligos: + i = 0 + out = open("array_oligo_pool_{0}.txt".format(j),mode="w") + while i < len(junctions) and n < len(arrays): + array = arrays[n][1] + gene = arrays[n][0] + if gene != previous_gene: + print("\n") + print(gene) + junction = junctions[i] + revc_junction = revcomp(junction) + f1 = start_junction + repeat_dna + array[0] + junction + r1 = revcomp(array[0]) + rev_repeat_dna + f2 = repeat_dna + array[1] + r2 = revc_junction + revcomp(array[1]) + rev_repeat_dna + revc_junction + f3 = junction + repeat_dna + array[2] + r3 = revcomp(end_junction) + revcomp(array[2]) + rev_repeat_dna + group.extend([f1,r1,f2,r2,f3,r3]) + print("\n".join([f1,r1,f2,r2,f3,r3])) + i += 1 + n += 1 + out.write("\n".join([f1,r1,f2,r2,f3,r3])) + out.write("\n") + construct="".join([f1,f2,f3,end_junction]) + line = ">" + gene + "_array" + "\n" + out_array_constructs.write(line) + line = construct + "\n" + out_array_constructs.write(line) + j += 1 + print("--------------------------------------------------------------------------------------\n") + +print("Mature gRNAs:\n") +out_mature_constructs = open("constructs_mature.txt",mode="w") + +out = open("mature_oligo_pool2.txt",mode="w") +for gene, spacers in mature.items(): + print(gene) + i = 1 + for spacer in spacers: + oligo = left_overh + spacer + right_overh + print(oligo) + out.write(oligo) + out.write("\n") + line = ">" + gene + "_mature_" + str(i) + "\n" + out_mature_constructs.write(line) + line = oligo + "\n" + out_mature_constructs.write(line) + i += 1 + print("\n") + +if non_targeting_flag: + print("\nMature gRNAs (non-targeting):") + out = open("nontargeting_oligo_pool2.txt",mode="w") + i = 1 + for nt_gRNA in nt_spacers[:min_output_size]: + oligo = left_overh + nt_gRNA[:spacer_length] + right_overh + print(oligo) + out.write(oligo) + out.write("\n") + line = "non_targeting_" + str(i) + "\n" + out_mature_constructs.write(line) + line = oligo + "\n" + out_mature_constructs.write(line) + i += 1 + +if arrays_flag == True: + if not any(oligos): #if the list is empty, therefore no arrays have been designed + print("\nNo arrays could be designed for the input targets.") + else: + print("\nNumber of array-cloning reactions:", len(oligos)) + print("\nTarget genes with:\n0 arrays: {0}\n1 array: {1}".format(noarray, onearray)) +print("\ntot mature: {0}\n".format(total_mature)) + +print("Done!") +print(datetime.now() - startTime) diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..4b1a34d --- /dev/null +++ b/environment.yml @@ -0,0 +1,46 @@ +name: CRISPRi_design +channels: + - anaconda + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - biopython=1.79=py36h8f6f2f9_0 + - bowtie=1.3.1=py36h7144ea4_5 + - ca-certificates=2023.05.30=h06a4308_0 + - certifi=2020.6.20=pyhd3eb1b0_3 + - icu=72.1=hcb278e6_0 + - ld_impl_linux-64=2.40=h41732ed_0 + - libblas=3.9.0=16_linux64_openblas + - libcblas=3.9.0=16_linux64_openblas + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=12.2.0=h65d4601_19 + - libgfortran-ng=12.2.0=h69a702a_19 + - libgfortran5=12.2.0=h337968e_19 + - libgomp=12.2.0=h65d4601_19 + - libhwloc=2.9.1=nocuda_h7313eea_6 + - libiconv=1.17=h166bdaf_0 + - liblapack=3.9.0=16_linux64_openblas + - libnsl=2.0.0=h7f98852_0 + - libopenblas=0.3.21=pthreads_h78a6416_3 + - libstdcxx-ng=12.2.0=h46fd767_19 + - libxml2=2.11.4=h0d562d8_0 + - libzlib=1.2.13=h166bdaf_4 + - ncurses=6.4=h6a678d5_0 + - numpy=1.19.5=py36hfc0c790_2 + - openssl=1.1.1u=hd590300_0 + - perl=5.32.1=2_h7f98852_perl5 + - pip=20.0.2=py36_1 + - python=3.6.15=hb7a2778_0_cpython + - python_abi=3.6=2_cp36m + - readline=8.2=h8228510_1 + - setuptools=49.6.0=py36h5fab9bb_3 + - sqlite=3.41.1=h5eee18b_0 + - tbb=2021.9.0=hf52228f_0 + - tk=8.6.12=h27826a3_0 + - viennarna=2.5.1=py36pl5321hcac48a8_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - xz=5.2.10=h5eee18b_1 + - zlib=1.2.13=h166bdaf_4 diff --git a/get_sequences_from_gff.py b/get_sequences_from_gff.py new file mode 100644 index 0000000..acc585b --- /dev/null +++ b/get_sequences_from_gff.py @@ -0,0 +1,119 @@ +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from collections import defaultdict +import argparse +import sys + +parser = argparse.ArgumentParser(description='Extract gene sequences from gff and genome files.') + +parser.add_argument('gff_file', type=str, + help='Input annotation (gff) file with sequences of the target genes.') + +parser.add_argument('fasta_file', type=str, + help='Input genome file.') + +parser.add_argument('--outname', '-o', type=str, default="target_genes", + help='Base name of the output files. The .fasta extension will be appended to this. ' + '(Default: target_genes.') + +parser.add_argument('--promoter', '-p', default=50, type=int, + help='Length before the gene start to be considered as the promoter. (Default: 50)') + +parser.add_argument('--genetype', '-t', type=str, + help='Consider only genes of this type (Third column of the gff file).') + +parser.add_argument('--attribute', '-a', type=str, + help='Consider only genes that have this attribute (ninth column of the gff file).' + 'E.g. if you want to filter for the attribute "sRNA_type=Intergenic" write' + 'sRNA_type=Intergenic.') + +parser.add_argument('--name', '-n', type=str, default="locus_tag", + help='Take gene name from this attribute of the gff attribute column (ninth column of the gff file)') + +args = parser.parse_args() +gff_file = args.gff_file +genome_file = args.fasta_file +outname = args.outname +promoter_length = args.promoter +genetype = args.genetype +attribute = args.attribute +gene_name = args.name + +def revcomp(sequence): +#reverse complements sequence + basecomplement = {'A':'T', 'C':'G', 'T':'A', 'G':'C'} + letters = list(sequence) + letters.reverse() + dna = '' + for base in letters: + dna += basecomplement[base] + return dna + +##### extend sRNA coordinates to add promoter_length before the start: +input_file = open(gff_file, mode="r") +genes = {} +for line in input_file: + linelist = line.split("\t") + if line.startswith("#") or not line.rstrip(): + continue + if genetype is not None: + if linelist[2] != genetype: + continue + if attribute is not None: + if attribute not in linelist[8]: + continue + seqname = linelist[0] + attributes = {} + for attr in linelist[8].split(";"): + attr = attr.split("=") + attributes[attr[0]] = attr[1] + try: + gene = attributes[gene_name].rstrip() + except KeyError: + print("ERROR: this line of the gff file: \n\n{}\ndoesn't have the " + "desired attribute '{}' which is set by the -n flag.".format(line,gene_name)) + sys.exit() + strand = linelist[6] + if strand == "+": + start = str(int(linelist[3]) - promoter_length - 6) + end = str(int(linelist[4]) + 6) + elif strand == "-": + end = str(int(linelist[4]) + promoter_length + 6) + start = str(int(linelist[3]) - 6) + genes[gene] = [seqname,start,end,strand] + +with open("{}_coordinates.txt".format(outname), mode="w") as outf: + for gene,[scaffold,start,end,strand] in genes.items(): + line = "\t".join([scaffold,gene,start,end,strand]) + outf.write(line) + outf.write("\n") + +##### make a fasta file with gene name and sequence: +input_file = open(genome_file) +#make a dictionary out of the genome: +assembly_seqs = SeqIO.to_dict(SeqIO.parse(input_file, "fasta")) +#make a list of SeqRecord items of each gene: +genes_sr = [] +for gene,coord in genes.items(): + [seqname,start,end,strand] = coord + gene_sequence = assembly_seqs[seqname][int(start)-1:int(end)].seq.upper() + if strand == "-": + gene_sequence = revcomp(gene_sequence) + gene_sr = SeqRecord(gene_sequence, id=gene, description ="") + genes_sr.append(gene_sr) + genes[gene].append(gene_sequence) + +#make a dictionary of sequence:geneID for each gene. +#this is to capture genes that have the same sequence +#and store their IDs as a list in the dictionary value +seq_dict = defaultdict(list) +for gene in genes_sr: + seq_dict[str(gene.seq)].append(gene.id) + +#write the output fasta file: +out_fasta = open("{}_sequences.fasta".format(outname), 'w') +for seq, IDs in seq_dict.items(): + #Join the IDs of genes that have the same sequence as gene1|gene2: + ID = '|'.join(IDs) + out_fasta.write(">{}\n".format(ID)) + out_fasta.write(seq + "\n")