-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathrefresh.py
30 lines (23 loc) · 814 Bytes
/
refresh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
import sys
import Bio
import subprocess
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
# Predict protein-coding genes on the input contigs with Prodigal.
#   -i  nucleotide FASTA to read
#   -a  FASTA file that receives the protein translations
#   -f  output format for the gene coordinates (gff)
#   -p  procedure; "meta" selects Prodigal's metagenomic mode
# The command is an argument list so it is executed directly instead of being
# re-parsed by a shell; no shell quoting rules apply to the paths.
prodigal_cmd = [
    'prodigal',
    '-i', 'dataset/nucl.fasta',
    '-a', 'dataset/protein.fasta',
    '-f', 'gff',
    '-p', 'meta',
]
print("Running prodigal...")
# check_call raises CalledProcessError on a non-zero exit status (and
# FileNotFoundError if the executable is not on PATH), so the script stops
# here rather than continuing with a missing or partial protein file.
subprocess.check_call(prodigal_cmd)
# Build the protein -> contig lookup table from the translated protein FASTA.
# One row is produced per FASTA record, in file order.
proteins = [record.id for record in SeqIO.parse('dataset/protein.fasta', 'fasta')]

# The contig name is the protein id with its last "_"-separated field removed
# (presumably ids look like "<contig>_<gene index>" -- confirm against the
# gene caller's output).
contigs = [protein_id.rsplit("_", 1)[0] for protein_id in proteins]

# Every protein gets the same placeholder annotation.
keywords = ['hypothetical protein'] * len(proteins)

gene2genome_df = pd.DataFrame(
    {
        'protein_id': proteins,
        'contig_id': contigs,
        'keywords': keywords,
    }
)
# index=False keeps the pandas row index out of the CSV.
gene2genome_df.to_csv('dataset/database_gene_to_genome.csv', index=False)