Skip to content

Commit

Permalink
Add CDS as output
Browse files Browse the repository at this point in the history
  • Loading branch information
signalbash committed Aug 24, 2021
1 parent 929850d commit c9e2026
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
36 changes: 32 additions & 4 deletions borf/borf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import sys
import re
from Bio import SeqIO
from .get_orfs import get_orfs, write_orf_fasta, write_orf_data, batch_iterator
import pandas as pd
from .get_orfs import get_orfs, write_orf_fasta, write_orf_data, write_orf_cds, batch_iterator


def main():
Expand Down Expand Up @@ -35,14 +36,23 @@ def main():

output_path_pep = output_path + '.pep'
output_path_txt = output_path + '.txt'
output_path_cds = output_path + '.cds'

# check if files exist already
if os.path.isfile(output_path_pep) or os.path.isfile(output_path_txt):
if os.path.isfile(output_path_pep) or os.path.isfile(output_path_txt) or os.path.isfile(output_path_cds):

if os.path.isfile(output_path_pep) and os.path.isfile(output_path_txt):
if os.path.isfile(output_path_pep) and os.path.isfile(output_path_txt) and os.path.isfile(output_path_cds):
print(output_path_pep + ", " + output_path_txt + " and " + output_path_cds + " already exist")
elif os.path.isfile(output_path_pep) and os.path.isfile(output_path_txt):
print(output_path_pep + " and " + output_path_txt + " already exist")
elif os.path.isfile(output_path_pep) and os.path.isfile(output_path_cds):
print(output_path_pep + " and " + output_path_cds + " already exist")
elif os.path.isfile(output_path_cds) and os.path.isfile(output_path_txt):
print(output_path_txt + " and " + output_path_cds + " already exist")
elif os.path.isfile(output_path_pep):
print(output_path_pep + " already exists")
elif os.path.isfile(output_path_cds):
print(output_path_cds + " already exists")
else:
print(output_path_txt + " already exists")

Expand All @@ -56,12 +66,16 @@ def main():
os.remove(output_path_pep)
if os.path.isfile(output_path_txt):
os.remove(output_path_txt)
if os.path.isfile(output_path_cds):
os.remove(output_path_cds)
else:
print('Overwriting files')
if os.path.isfile(output_path_pep):
os.remove(output_path_pep)
if os.path.isfile(output_path_txt):
os.remove(output_path_txt)
if os.path.isfile(output_path_cds):
os.remove(output_path_cds)

# number of sequences
n_seqs = 0
Expand Down Expand Up @@ -121,15 +135,29 @@ def main():
min_upstream_length=args.upstream_incomplete_length,
genetic_code=args.genetic_code)

# extract nt seqs at CDS
nucleotide_seq = []
nucleotide_id = []
for seq_string in all_sequences:
nucleotide_seq.append(str(seq_string.seq))
nucleotide_id.append(str(seq_string.id))
seq_df = pd.DataFrame(list(zip(nucleotide_id, nucleotide_seq)), columns=['id', 'nt_seq'])

# merge orfs with all_sequences
orf_data = pd.merge(seq_df, orf_data, on='id', how='right')
orf_data['cds_seq'] = orf_data.apply(lambda x: x['nt_seq'][(x['start_site_nt']-1):x['stop_site_nt']], axis=1)


write_orf_data(orf_data, output_path_txt)
write_orf_fasta(orf_data, output_path_pep)
write_orf_cds(orf_data, output_path_cds)

start_seq_n = (i*batch_size) + 1
end_seq_n = min(start_seq_n + (batch_size - 1), n_seqs)
print("Processed sequences " + str(start_seq_n) + " to " + str(end_seq_n) + " of " + str(n_seqs))

print("Done with borf.")
print("Results in " + output_path_pep + " and " + output_path_txt)
print("Results in " + output_path_pep + " and " + output_path_txt + " and " + output_path_cds)

if strand_warning == True:
print("This data caused a warning based on strandedness. Please check the top of the log for details and rerun with appropriate flags if neccessary.")
15 changes: 15 additions & 0 deletions borf/get_orfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,21 @@ def write_orf_fasta(orf_df, file_out):
orf_df['fasta_id'] = '>' + orf_df.fasta_id
orf_df.to_csv(file_out, mode = 'a', index=False, sep='\n', header=False, columns=['fasta_id', 'orf_sequence'])

def write_orf_cds(orf_df, file_out):
    """
    Append ORF CDS (nucleotide) sequences to a fasta file.

    Each row is written as two lines: a header line made of '>' followed by
    the row's ``fasta_id``, then the row's ``cds_seq``.

    Parameters
    ----------
    orf_df : DataFrame
        orf_df DataFrame; must contain the columns 'fasta_id' and 'cds_seq'.
        It is not modified by this function.
    file_out : str
        path to file to write fasta sequences (opened in append mode)
    """
    # Build the headers on a separate Series rather than assigning back into
    # orf_df: write_orf_fasta prefixes 'fasta_id' with '>' in place, so when
    # the same frame is passed here afterwards a second prefix would produce
    # '>>id' headers. Stripping any existing marker first guarantees exactly
    # one '>' whichever writer ran before this one.
    headers = '>' + orf_df['fasta_id'].astype(str).str.lstrip('>')

    # Missing sequences are written as an empty line, matching the default
    # na_rep of DataFrame.to_csv that was used previously.
    sequences = orf_df['cds_seq'].fillna('').astype(str)

    # Write the records directly instead of going through to_csv(sep='\n'),
    # so the csv dialect never quotes identifiers or rejects the newline
    # delimiter.
    with open(file_out, 'a') as handle:
        handle.writelines(
            '{}\n{}\n'.format(header, sequence)
            for header, sequence in zip(headers, sequences)
        )

def batch_iterator(iterator, batch_size):
"""Returns lists of length batch_size.
Expand Down

0 comments on commit c9e2026

Please sign in to comment.