def _collect_annotations(fields, target, wanted_id=None):
    '''
    Add the ``id_type:identifier`` fields found after the '|' column to target.

    :param fields: list with the tab-separated columns of one line
    :param target: dictionary (id type -> set of identifiers) updated in place
    :param wanted_id: id type to retrieve, e.g., 'kegg_ko'; a falsy value keeps every id type
    :raises ValueError: if the line has no '|' column
    '''
    separator = fields.index('|')
    for db_annot in fields[separator + 1:]:
        # split on the first ':' only, to avoid bad splitting when dealing with descriptions
        db, _, annot = db_annot.partition(':')
        if wanted_id and wanted_id != db:
            continue
        target.setdefault(db, set()).add(annot)


def parse_integrated_tsv(input_file, wanted_id=None, wanted_ref=None, wanted_seq=None):
    '''
    Parse a tab-separated integrated annotation file into nested dictionaries.

    Each non-blank line is read as: sequence ID in the first column, reference
    in the second column, a '|' column, and then one ``id_type:identifier``
    field per remaining column. Blank lines are skipped.

    :param input_file: integrated_annotation.tsv from mantis
    :param wanted_id: id type to retrieve, e.g., 'kegg_ko'
    :param wanted_ref: reference database to retrieve data from, e.g., 'kofam_merged'
    :param wanted_seq: sequence ID to retrieve, e.g., 'protein1'
    :return: dictionary with sequence IDs as keys (e.g., protein1). Each sequence is linked to another dictionary
    with the references (e.g., 'kofam_merged') as keys. These in turn are linked to yet another dictionary with
    the id types as keys (e.g., 'kegg_ko'). This last dictionary contains a set of identifiers (e.g., 'K0001').
    A sequence that passes the wanted_seq filter always gets an entry, which stays empty if wanted_ref
    discards all of its lines.
    :raises IndexError: if a non-blank line has fewer than two columns
    :raises ValueError: if a retained line has no '|' column
    '''
    res = {}
    with open(input_file) as file:
        for line in file:
            line = line.strip('\n')
            # blank lines (e.g., at the end of the file) carry no data
            if not line.strip():
                continue
            fields = line.split('\t')
            seq_id = fields[0]
            ref_db = fields[1]
            if wanted_seq and wanted_seq != seq_id:
                continue
            ref_hits = res.setdefault(seq_id, {})
            if wanted_ref and wanted_ref != ref_db:
                continue
            _collect_annotations(fields, ref_hits.setdefault(ref_db, {}), wanted_id)
    return res


def parse_consensus_tsv(input_file, wanted_id=None, wanted_seq=None):
    '''
    Parse a tab-separated consensus annotation file into nested dictionaries.

    Each non-blank line is read as: sequence ID in the first column, a '|'
    column, and then one ``id_type:identifier`` field per remaining column.
    Blank lines are skipped.

    :param input_file: consensus_annotation.tsv from mantis
    :param wanted_id: id type to retrieve, e.g., 'kegg_ko'
    :param wanted_seq: sequence ID to retrieve, e.g., 'protein1'. Parsing stops at the first line with this
    sequence ID.
    :return: dictionary with sequence IDs as keys (e.g., protein1). These in turn are linked to another
    dictionary with the id types as keys (e.g., 'kegg_ko'). This last dictionary contains a set of
    identifiers (e.g., 'K0001').
    :raises ValueError: if a retained line has no '|' column
    '''
    res = {}
    with open(input_file) as file:
        for line in file:
            line = line.strip('\n')
            # blank lines (e.g., at the end of the file) carry no data
            if not line.strip():
                continue
            fields = line.split('\t')
            seq_id = fields[0]
            if wanted_seq and wanted_seq != seq_id:
                continue
            _collect_annotations(fields, res.setdefault(seq_id, {}), wanted_id)
            # reaching this point with a filter set means the wanted sequence was just read
            if wanted_seq:
                return res
    return res


if __name__ == '__main__':
    integrated_file = 'integrated_annotation.tsv'
    integrated_annotations = parse_integrated_tsv(integrated_file, wanted_seq='IICINOJM_154481')
    print(integrated_annotations)
    consensus_file = 'consensus_annotation.tsv'
    consensus_annotations = parse_consensus_tsv(consensus_file, wanted_seq='IICINOJM_154481')
    print(consensus_annotations)