def parse_integrated_tsv(input_file, wanted_id=None, wanted_ref=None, wanted_seq=None):
    '''
    Parse a tab-separated integrated annotation file into nested dictionaries.

    Each row is expected to hold the sequence ID in the first column, the
    reference database in the second column, and the annotations
    (formatted as ``id_type:identifier``) in the columns that follow a
    column containing only ``|``.

    :param input_file: integrated_annotation.tsv from mantis
    :param wanted_id: id type to retrieve, e.g., 'kegg_ko'. Falsy values disable the filter.
    :param wanted_ref: reference database to retrieve data from, e.g., 'kofam_merged'.
        Falsy values disable the filter.
    :param wanted_seq: sequence ID to retrieve, e.g., 'protein1'. Falsy values disable the filter.
    :return:
    dictionary with sequence IDs as keys (e.g., protein1).
    Each sequence will then be linked to another dictionary with the reference (e.g., 'kofam_merged') as keys.
    These in turn will be linked to yet another dictionary with the id types as keys (e.g., 'kegg_ko').
    This last dictionary will contain a set of identifiers (e.g., 'K0001').

    A sequence that passes ``wanted_seq`` is registered in the result even when
    none of its rows match ``wanted_ref``; it is then linked to an empty dictionary.
    Blank lines and lines with fewer than two columns are skipped. A row without
    the ``|`` separator is kept as a hit without annotations.
    '''
    # NOTE(review): a header row, if present, is parsed like any other row -
    # confirm whether it should be skipped for mantis output.
    res = {}
    with open(input_file) as file:
        for line in file:
            # blank lines carry no data and would otherwise break the column lookups
            if not line.strip():
                continue
            fields = line.strip('\n').split('\t')
            # a row needs at least the sequence ID and reference columns
            if len(fields) < 2:
                continue
            seq_id, ref_db = fields[0], fields[1]
            if wanted_seq and wanted_seq != seq_id:
                continue
            # the sequence is registered before the reference filter is applied
            seq_refs = res.setdefault(seq_id, {})
            if wanted_ref and wanted_ref != ref_db:
                continue
            ref_ids = seq_refs.setdefault(ref_db, {})
            # no separator column means this row has no annotations to read
            if '|' not in fields:
                continue
            for db_annot in fields[fields.index('|') + 1:]:
                # split on the first ':' only, to avoid bad splitting when dealing with descriptions
                db, _, annot = db_annot.partition(':')
                if wanted_id and wanted_id != db:
                    continue
                ref_ids.setdefault(db, set()).add(annot)
    return res

def parse_consensus_tsv(input_file, wanted_id=None, wanted_seq=None):
    '''
    Parse a tab-separated consensus annotation file into nested dictionaries.

    Each row is expected to hold the sequence ID in the first column and the
    annotations (formatted as ``id_type:identifier``) in the columns that
    follow a column containing only ``|``.

    :param input_file: consensus_annotation.tsv from mantis
    :param wanted_id: id type to retrieve, e.g., 'kegg_ko'. Falsy values disable the filter.
    :param wanted_seq: sequence ID to retrieve, e.g., 'protein1'. Falsy values disable the filter.
        When given, parsing stops right after the first row with this sequence ID.
    :return:
    dictionary with sequence IDs as keys (e.g., protein1).
    These in turn will be linked to yet another dictionary with the id types as keys (e.g., 'kegg_ko').
    This last dictionary will contain a set of identifiers (e.g., 'K0001').

    Blank lines are skipped. A row without the ``|`` separator is kept as a
    sequence without annotations.
    '''
    # NOTE(review): a header row, if present, is parsed like any other row -
    # confirm whether it should be skipped for mantis output.
    res = {}
    with open(input_file) as file:
        for line in file:
            # blank lines carry no data and would otherwise be stored under an empty ID
            if not line.strip():
                continue
            fields = line.strip('\n').split('\t')
            seq_id = fields[0]
            if wanted_seq and wanted_seq != seq_id:
                continue
            seq_ids = res.setdefault(seq_id, {})
            # no separator column means this row has no annotations to read
            if '|' in fields:
                for db_annot in fields[fields.index('|') + 1:]:
                    # split on the first ':' only, to avoid bad splitting when dealing with descriptions
                    db, _, annot = db_annot.partition(':')
                    if wanted_id and wanted_id != db:
                        continue
                    seq_ids.setdefault(db, set()).add(annot)
            # the requested sequence was found, so the rest of the file is not read
            if wanted_seq:
                return res
    return res


if __name__ == '__main__':
    # Example run: print the annotations of one sequence from each of the two
    # annotation files, which are looked up in the current working directory.
    target_seq='IICINOJM_154481'
    example_runs=(
        (parse_integrated_tsv,'integrated_annotation.tsv'),
        (parse_consensus_tsv,'consensus_annotation.tsv'),
    )
    for parser,tsv_path in example_runs:
        print(parser(tsv_path,wanted_seq=target_seq))