-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparseGFF.py
102 lines (79 loc) · 4.25 KB
/
parseGFF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#! /usr/bin/env python3
import csv
import argparse #used to make arguments in terminal much nicer
from Bio import SeqIO
from collections import defaultdict # don't need value in before creating key. Avoids key errors
import re # regex
# inputs: 1) GFF file (.gff), corresponding genome sequence (FASTA format)
# output: the CDS for each transcriptional gene present
# create an argument parser object
parser = argparse.ArgumentParser(description='This script will parse a GFF file and extract whole exon sequences from the genome fasta file')
# add positional arguments (called so b/c their order matters)
parser.add_argument("gff", help='name of the GFF file')
parser.add_argument("fasta", help='name of the FASTA file')
parser.add_argument("feature_type_arg", help='feature type that contains exons')
# parse the arguments
args = parser.parse_args()
# read in FASTA file
genome = SeqIO.read(args.fasta.rstrip(),'fasta')
# create a function that reverse complements sequences of gff file that are on the - strand:
def rev_comp(gene_seq, strand):
# determine whether to reverse complement
if strand == "-": # if -, need to reverse and complement the strand
# print the reverse complement of nucleotides in the sequence
return(gene_seq.reverse_complement())
else:
# print the sequence of the gene
return(gene_seq)
## Extracting the [CDS] or feature_type_arg regions:
# declare dictionary: key = gene, value = list of exon(s) for that gene
gene_dict = defaultdict(dict)
# open and read in GFF file
with open(args.gff, 'r') as gff_in:
# create a csv reader object
reader = csv.reader(gff_in, delimiter = '\t')
# loop over all lines in reader object (i.e., the parsed file)
for line in reader:
# skip header line
if re.search('^#', line[0]):
continue
else:
# identify important components from gff, assign variables.
start = int(line[3]) # which column corresponds to start location of gene in genome
end = int(line[4]) # which column corresponds to end location of gene in genome
strand = line[6] # which strand is the gene on
header_info = line[-1] #last column of gff is information about the entry
feature_type = line[2].strip() #type of feature, used to get CDS sequences
species = line[0] # species from which genes are taken
exon_number = re.search("exon ?=?(\d)", header_info, re.I) # find the exon number
# create a variable that later serves as fasta header: (gene_key)
# only works if gene name is second object in description file, not generalizable
gene_key = ">" + species.replace(" ", "_") + "_" + header_info.split()[1]
# select gff objects that are `feature_type_arg` (e.g. CDS) features
if feature_type == args.feature_type_arg:
# add features with only one exon to the dictionary
if not exon_number:
# Assign the value (sequence, reverse complemented if needed) to dictionary
gene_dict[gene_key] = rev_comp(genome.seq[start-1:end], strand)
# for genes that are >1 exon but not in dictionary yet:
elif not gene_dict[gene_key]:
#create an entry in dict for new gene that is all null values
#NOTE: created 250 because no known CDS has more than 234 exons. would be better to find max of number in gff file, but I am unsure how to properly implement.
gene_dict[gene_key] = ['']*250
# Assign the sequence of exon number to the appropriate index in the list for that dictionary key
gene_dict[gene_key][int(exon_number[1])-1] = rev_comp(genome.seq[start-1:end], strand)
# for genes with >1 exon that are already have a gene_dict entry
else:
# add sequence entry to correct place in list
gene_dict[gene_key][int(exon_number[1])-1] = rev_comp(genome.seq[start-1:end], strand)
# convert the sequences from lists to a string with .join()
for key in gene_dict:
gene_dict[key] = ''.join(str(exon) for exon in gene_dict[key])
# print output
for key, value in gene_dict.items():
# print the key (which is gene_key, already in fasta format above)
print(key)
# print the sequence
print(value)#''.join(str(exon) for exon in j).strip())
# add spacing for easier formatting.
print()