From 267909f3c54cf914ed814d688106c61c2491d441 Mon Sep 17 00:00:00 2001 From: Pedro Q Date: Sat, 2 Jul 2022 18:06:38 +0200 Subject: [PATCH] updated tcdb download to adjust for updated uniprot api calls --- mantis/assembler.py | 772 ++++++++++++++ mantis/consensus.py | 736 ++++++++++++++ mantis/database_generator.py | 1434 +++++++++++++++++++++++++++ mantis/exceptions.py | 28 + mantis/homology_processor.py | 762 ++++++++++++++ mantis/mantis.py | 550 ++++++++++ mantis/metadata.py | 519 ++++++++++ mantis/metadata_sqlite_connector.py | 197 ++++ mantis/multiprocessing.py | 661 ++++++++++++ mantis/taxonomy_sqlite_connector.py | 344 +++++++ mantis/unifunc_wrapper.py | 30 + 11 files changed, 6033 insertions(+) create mode 100644 mantis/assembler.py create mode 100644 mantis/consensus.py create mode 100644 mantis/database_generator.py create mode 100644 mantis/exceptions.py create mode 100644 mantis/homology_processor.py create mode 100644 mantis/mantis.py create mode 100644 mantis/metadata.py create mode 100644 mantis/metadata_sqlite_connector.py create mode 100644 mantis/multiprocessing.py create mode 100644 mantis/taxonomy_sqlite_connector.py create mode 100644 mantis/unifunc_wrapper.py diff --git a/mantis/assembler.py b/mantis/assembler.py new file mode 100644 index 0000000..1d24a00 --- /dev/null +++ b/mantis/assembler.py @@ -0,0 +1,772 @@ +try: + from mantis.database_generator import * + from mantis.taxonomy_sqlite_connector import Taxonomy_SQLITE_Connector + from mantis.metadata_sqlite_connector import Metadata_SQLITE_Connector +except: + from database_generator import * + from taxonomy_sqlite_connector import Taxonomy_SQLITE_Connector + from metadata_sqlite_connector import Metadata_SQLITE_Connector + + +def setup_databases(chunk_size=None, no_taxonomy=False, mantis_config=None, cores=None): + print_cyan('Setting up databases') + if chunk_size: + chunk_size = int(chunk_size) + if cores: + cores = int(cores) + mantis = Assembler(hmm_chunk_size=chunk_size, + mantis_config=mantis_config, + user_cores=cores, + no_taxonomy=no_taxonomy) + mantis.setup_databases() + + +def check_installation(mantis_config=None, no_taxonomy=False, check_sql=False): + yellow('Checking installation') + mantis = Assembler(mantis_config=mantis_config, no_taxonomy=no_taxonomy) + mantis.check_installation(check_sql=check_sql) + + +class Assembler(Database_generator, Taxonomy_SQLITE_Connector): + def __init__(self, verbose=True, + redirect_verbose=None, + no_taxonomy=False, + mantis_config=None, + hmm_chunk_size=None, + keep_files=False, + user_cores=None): + self.redirect_verbose = redirect_verbose + self.keep_files = keep_files + self.verbose = verbose + if no_taxonomy: + self.use_taxonomy = False + else: + self.use_taxonomy = True + self.mantis_config = mantis_config + self.user_cores = user_cores + self.broken_merged_hmms = set() + self.clean_merged_hmms = set() + self.start_time = time() + # to speed up hmm search we split very big hmms into smaller chunks - better job distribution + if hmm_chunk_size == 0: + self.hmm_chunk_size = None + elif hmm_chunk_size is None: + self.hmm_chunk_size = 5000 + else: + self.hmm_chunk_size = hmm_chunk_size + self.read_config_file() + + # I use manager instead of queue since I need to be able to add records to the end and start of the 'queue' (actually a list) which is not possible with the multiprocessing.Queue + # we need manager.list because a normal list cant communicate during multiprocessing + self.manager = Manager() + self.queue = self.manager.list() + + def __str__(self): + custom_refs = self.get_custom_refs_paths(folder=True) + custom_refs_str = '' + custom_res = '' + for cref in custom_refs: + custom_refs_str += cref + '\n' + if custom_refs_str: + custom_res = f'# Custom references:\n{custom_refs_str}' + + res = [] + if hasattr(self, 'output_folder'): + res.append(f'Output folder:\nself.output_folder') + res.append(f'Default references folder:\n{self.mantis_paths["default"]}') + res.append(f'Resources folder:\n{self.mantis_paths["resources"]}') + res.append(f'Custom references folder:\n{self.mantis_paths["custom"]}') + if self.mantis_paths['NOG'][0:2] != 'NA': + res.append(f'TAX NOG references folder:\n{self.mantis_paths["NOG"]}') + if self.mantis_paths['NCBI'][0:2] != 'NA': + res.append(f'TAX NCBI references folder:\n{self.mantis_paths["NCBI"]}') + if self.mantis_paths['pfam'][0:2] != 'NA': + res.append(f'Pfam reference folder:\n{self.mantis_paths["pfam"]}') + if self.mantis_paths['kofam'][0:2] != 'NA': + res.append(f'KOfam reference folder:\n{self.mantis_paths["kofam"]}') + if self.mantis_paths['tcdb'][0:2] != 'NA': + res.append(f'TCDB reference folder:\n{self.mantis_paths["tcdb"]}') + res.append('------------------------------------------') + res = '\n'.join(res) + if custom_res: res += '\n' + custom_res + ref_weights = ', '.join([f'{i}:{self.mantis_ref_weights[i]}' for i in self.mantis_ref_weights if i != 'else']) + if ref_weights: + res += f'# Weights:\n{ref_weights}\n' + nog_tax = ', '.join([i for i in self.mantis_nogt_tax]) + if nog_tax: + res += f'\n# NOG tax IDs:\n{nog_tax}\n' + + return res + + def check_internet_connection(self): + try: + requests.get("http://www.google.com") + return True + except requests.ConnectionError: + print( + "Could not connect to internet!\nIf you would like to run offline make sure you introduce organism NCBI IDs instead of synonyms!") + return False + + def set_nogt_line(self, line_path): + if line_path: + res = set() + tax_ids = [i for i in line_path.split(',')] + if tax_ids: + for t_id in tax_ids: + try: + ncbi_taxon_id = int(t_id) + organism_lineage = self.fetch_ncbi_lineage(ncbi_taxon_id) + res.update(organism_lineage) + except: + ncbi_taxon_id = self.get_taxa_ncbi(t_id) + if ncbi_taxon_id: + organism_lineage = self.fetch_ncbi_lineage(ncbi_taxon_id) + res.update(organism_lineage) + for i in res: + self.mantis_nogt_tax.add(str(i)) + + def setup_paths_config_file(self): + self.mantis_paths = {'default': None, + 'resources': None, + 'custom': None, + 'NOG': None, + 'pfam': None, + 'kofam': None, + 'NCBI': None, + 'tcdb': None, + } + + self.nog_db = 'dmnd' + nogt_line = None + with open(self.config_file) as file: + for line in file: + line = line.strip('\n') + if not line.startswith('#') and line: + # data sources configuration + if line.startswith('default_ref_folder='): + line_path = add_slash(line.replace('default_ref_folder=', '')) + if line_path: + self.mantis_paths['default'] = line_path + + elif line.startswith('resources_folder='): + line_path = add_slash(line.replace('resources_folder=', '')) + if line_path: + self.mantis_paths['resources'] = line_path + + elif line.startswith('custom_ref_folder='): + line_path = add_slash(line.replace('custom_ref_folder=', '')) + if line_path: + self.mantis_paths['custom'] = line_path + + elif line.startswith('nog_ref_folder='): + line_path = add_slash(line.replace('nog_ref_folder=', '')) + if line_path: + self.mantis_paths['NOG'] = line_path + + # taxa ids list for only downloading nogt specific to lineage + elif line.startswith('nog_tax='): + nogt_line = line.replace('nog_tax=', '') + + elif line.startswith('pfam_ref_folder='): + line_path = add_slash(line.replace('pfam_ref_folder=', '')) + if line_path: + self.mantis_paths['pfam'] = line_path + + elif line.startswith('kofam_ref_folder='): + line_path = add_slash(line.replace('kofam_ref_folder=', '')) + if line_path: + self.mantis_paths['kofam'] = line_path + + elif line.startswith('ncbi_ref_folder='): + line_path = add_slash(line.replace('ncbi_ref_folder=', '')) + if line_path: + self.mantis_paths['NCBI'] = line_path + + elif line.startswith('tcdb_ref_folder='): + line_path = add_slash(line.replace('tcdb_ref_folder=', '')) + if line_path: + self.mantis_paths['tcdb'] = line_path + + elif '_weight=' in line: + ref_source, weight = line.split('_weight=') + self.mantis_ref_weights[ref_source] = float(weight) + + elif line.startswith('nog_ref='): + nog_db = line.replace('nog_ref=', '').split()[0] + if nog_db.lower() not in ['dmnd', 'hmm']: + kill_switch(InvalidNOGType) + else: + self.nog_db = nog_db + if not self.mantis_paths['default']: + self.mantis_paths['default'] = add_slash(MANTIS_FOLDER + 'References') + default_ref_path = self.mantis_paths['default'] + + if not self.mantis_paths['resources']: + self.mantis_paths['resources'] = add_slash(MANTIS_FOLDER + 'Resources') + + if not self.mantis_paths['custom']: + self.mantis_paths['custom'] = add_slash(default_ref_path + 'Custom_references') + + if not self.mantis_paths['NOG']: + self.mantis_paths['NOG'] = add_slash(default_ref_path + 'NOG') + + if not self.mantis_paths['pfam']: + self.mantis_paths['pfam'] = add_slash(default_ref_path + 'pfam') + + if not self.mantis_paths['kofam']: + self.mantis_paths['kofam'] = add_slash(default_ref_path + 'kofam') + + if not self.mantis_paths['NCBI']: + self.mantis_paths['NCBI'] = add_slash(default_ref_path + 'NCBI') + + if not self.mantis_paths['tcdb']: + self.mantis_paths['tcdb'] = add_slash(default_ref_path + 'tcdb') + + # setting up which taxa we need to have references for + Taxonomy_SQLITE_Connector.__init__(self, resources_folder=self.mantis_paths['resources']) + if self.use_taxonomy: + if nogt_line: + if self.launch_taxonomy_connector(): + self.set_nogt_line(nogt_line) + + def read_config_file(self): + self.mantis_ref_weights = {'else': 0.7} + self.mantis_nogt_tax = set() + + if self.mantis_config: + green(f'Using custom MANTIS.cfg: {self.mantis_config}', flush=True, file=self.redirect_verbose) + self.config_file = self.mantis_config + else: + if not os.path.isdir(MANTIS_FOLDER): + print( + 'Make sure you are calling the folder to run this package, like so:\n python mantis/ \n ', + flush=True, file=self.redirect_verbose) + raise FileNotFoundError + self.config_file = f'{MANTIS_FOLDER}config{SPLITTER}MANTIS.cfg' + green(f'Using default MANTIS.cfg: {self.config_file}', flush=True, file=self.redirect_verbose) + + try: + open(self.config_file, 'r') + except: + red('MANTIS.cfg file has been deleted or moved, make sure you keep it in the root of the project!', + flush=True, file=self.redirect_verbose) + raise FileNotFoundError + + self.setup_paths_config_file() + if not self.use_taxonomy: + self.mantis_paths['NOG'] = f'NA{SPLITTER}' + self.mantis_paths['NCBI'] = f'NA{SPLITTER}' + if not os.path.isdir(self.mantis_paths['custom']): + Path(self.mantis_paths['custom']).mkdir(parents=True, exist_ok=True) + if self.verbose: print(self, flush=True, file=self.redirect_verbose) + + def order_by_size_descending(self, refs_list): + res = {} + for ref in refs_list: + res[ref] = os.stat(ref).st_size + # mixing big and low size HMMs so that we try not to run out of memory, might lead to more idle time. + sorted_res = sorted(res, key=res.get, reverse=True) + resorted_res = [] + c = 1 + while sorted_res: + if c == 1: + resorted_res.append(sorted_res.pop(0)) + c = -1 + elif c == -1: + resorted_res.append(sorted_res.pop(-1)) + c = 1 + return resorted_res + + def compile_refs_list(self, folder=False): + # doesnt include NOG or NCBI + refs_list = [] + default_list = [ + get_ref_in_folder(self.mantis_paths['pfam']) if not folder else self.mantis_paths['pfam'], + get_ref_in_folder(self.mantis_paths['kofam']) if not folder else self.mantis_paths['kofam'], + get_ref_in_folder(self.mantis_paths['tcdb']) if not folder else self.mantis_paths['tcdb'], + ] + for ref_path in self.get_custom_refs_paths(folder): + if ref_path[0:2] != 'NA': + refs_list.append(ref_path) + for ref_path in default_list: + if ref_path and ref_path[0:2] != 'NA': + refs_list.append(ref_path) + return refs_list + + #####SETTING UP DATABASE##### + + def get_path_default_ref(self, database, taxon_id=None): + target_file = None + if 'kofam' in database.lower(): + target_file = get_ref_in_folder(self.mantis_paths['kofam']) + elif 'pfam' in database.lower(): + target_file = get_ref_in_folder(self.mantis_paths['pfam']) + elif 'tcdb' in database.lower(): + target_file = get_ref_in_folder(self.mantis_paths['tcdb']) + elif 'NOG'.lower() in database.lower(): + if not taxon_id: taxon_id = 'NOGG' + target_file = get_ref_in_folder(self.mantis_paths['NOG'] + taxon_id) + elif 'NCBI'.lower() in database.lower(): + if not taxon_id: taxon_id = 'NCBIG' + target_file = get_ref_in_folder(self.mantis_paths['NCBI'] + taxon_id) + return target_file + + def check_reference_exists(self, database, taxon_id=None): + ncbi_resources = add_slash(self.mantis_paths['resources'] + 'NCBI') + + if database == 'ncbi_res': + if file_exists(ncbi_resources + 'gc.prt') and \ + file_exists(ncbi_resources + 'gc.prt'): + return True + elif database == 'taxonomy': + taxonomy_db = self.mantis_paths['resources'] + 'Taxonomy.db' + if file_exists(taxonomy_db): + return True + elif database == 'NOGSQL': + if file_exists(self.mantis_paths['NOG'] + 'eggnog.db'): + return True + elif database == 'tcdb': + if file_exists(self.mantis_paths['tcdb'] + 'tcdb.dmnd'): + return True + elif database == 'NOG_DMND': + if file_exists(self.mantis_paths['NOG'] + 'eggnog_proteins.dmnd'): + return True + target_file = self.get_path_default_ref(database, taxon_id) + if target_file: + if target_file.endswith('.dmnd'): + if not file_exists(target_file): + return False + else: + for extension in ['', '.h3f', '.h3i', '.h3m', '.h3p']: + if not file_exists(target_file + extension): + return False + else: + return False + return True + + #####LISTING HMMS DATABASE##### + + def check_installation_extras(self, res, verbose=True): + ncbi_resources = add_slash(self.mantis_paths['resources'] + 'NCBI') + essential_genes = f'{MANTIS_FOLDER}Resources{SPLITTER}essential_genes/essential_genes.txt' + taxonomy_db = self.mantis_paths['resources'] + 'Taxonomy.db' + + if verbose: + yellow('Checking extra files', flush=True, file=self.redirect_verbose) + + if not file_exists(essential_genes): + red('Essential genes list is missing, it should be in the github repo!') + if verbose: + red(f'Failed installation check on [files missing]: {essential_genes}', + flush=True, + file=self.redirect_verbose) + res.append(self.mantis_paths['resources'] + 'essential_genes/') + else: + if verbose: + green(f'Passed installation check on: {self.mantis_paths["resources"]}essential_genes', + flush=True, file=self.redirect_verbose) + + if not file_exists(ncbi_resources + 'gc.prt'): + if verbose: + red(f'Failed installation check on [files missing]: {ncbi_resources}gc.prt.dmp', + flush=True, + file=self.redirect_verbose) + res.append(ncbi_resources) + else: + if verbose: + green(f'Passed installation check on: {ncbi_resources}gc.prt.dmp', + flush=True, + file=self.redirect_verbose) + + if self.use_taxonomy: + if not file_exists(taxonomy_db): + if verbose: + red(f'Failed installation check on [files missing]: {taxonomy_db}', + flush=True, + file=self.redirect_verbose) + res.append(taxonomy_db) + else: + if verbose: + green(f'Passed installation check on: {taxonomy_db}', + flush=True, + file=self.redirect_verbose) + return res + + def check_chunks_dir(self, chunks_dir): + all_chunks = [] + for hmm in os.listdir(chunks_dir): + if hmm.endswith('.hmm'): + all_chunks.append(hmm) + for hmm in all_chunks: + if not self.check_missing_chunk_files(hmm, chunks_dir): + return False + return True + + def check_missing_chunk_files(self, hmm, chunks_dir): + missing_files = ['.h3f', '.h3i', '.h3m', '.h3p'] + res = 0 + for inner_file in os.listdir(chunks_dir): + for mf in missing_files: + if inner_file == f'{hmm}{mf}': + res += 1 + if res == len(missing_files): + return True + red(f'Failed installation check on [files missing]: {hmm} in chunks folder: {chunks_dir}', + flush=True, + file=self.redirect_verbose) + return False + + def check_installation_folder(self, ref_folder_path, res, verbose=True, extra_requirements=[]): + missing_files = set(extra_requirements) + try: + files_dir = os.listdir(ref_folder_path) + except: + if verbose: + red(f'Failed installation check on [path unavailable]: {ref_folder_path}', + flush=True, + file=self.redirect_verbose) + res.append(ref_folder_path) + self.passed_check = False + return + ref_type = None + for file in files_dir: + if file.endswith('.dmnd'): + ref_type = 'dmnd' + missing_files.update(['.dmnd']) + elif file.endswith('.hmm'): + ref_type = 'hmm' + missing_files.update(['.hmm', '.h3f', '.h3i', '.h3m', '.h3p']) + + if not ref_type: + if verbose: + red(f'Failed installation check on [invalid referecence type]: {ref_folder_path}', + flush=True, + file=self.redirect_verbose) + res.append(ref_folder_path) + self.passed_check = False + return + check = len(missing_files) + + if 'chunks' in files_dir: + if not self.check_chunks_dir(f'{ref_folder_path}chunks'): + self.passed_check = False + return + else: + missing_files = set(extra_requirements) + check = len(missing_files) + for file in files_dir: + if ref_type == 'hmm': + if file.endswith('.hmm'): + if '.hmm' in missing_files: + check -= 1 + missing_files.remove('.hmm') + elif file.endswith('.h3f'): + if '.h3f' in missing_files: + check -= 1 + missing_files.remove('.h3f') + elif file.endswith('.h3i'): + if '.h3i' in missing_files: + check -= 1 + missing_files.remove('.h3i') + elif file.endswith('.h3m'): + if '.h3m' in missing_files: + check -= 1 + missing_files.remove('.h3m') + elif file.endswith('.h3p'): + if '.h3p' in missing_files: + check -= 1 + missing_files.remove('.h3p') + elif ref_type == 'dmnd': + if file.endswith('.dmnd'): + check -= 1 + missing_files.remove('.dmnd') + if file in extra_requirements: + check -= 1 + missing_files.remove(file) + if check != 0: + missing_files_str = '; '.join(missing_files) + red(f'Failed installation check on [files missing]: {ref_folder_path}\n{missing_files_str}', + flush=True, + file=self.redirect_verbose) + res.append(ref_folder_path) + else: + if verbose: + green(f'Passed installation check on: {ref_folder_path}', + flush=True, + file=self.redirect_verbose) + + def compile_sql_metadata(self): + all_files = set() + for ref in self.compile_refs_list(folder=True): + metadata_file = f'{ref}metadata.tsv' + all_files.add(metadata_file) + if self.mantis_paths['NCBI'][0:2] != 'NA': + ncbi_tax = self.get_taxon_refs('NCBI', folder=True) + for ref in ncbi_tax: + metadata_file = f'{ref}metadata.tsv' + all_files.add(metadata_file) + if self.mantis_paths['NOG'][0:2] != 'NA': + nog_tax = self.get_taxon_refs('NOG', folder=True) + for ref in nog_tax: + metadata_file = f'{ref}metadata.tsv' + all_files.add(metadata_file) + for metadata_file in all_files: + + if not file_exists(metadata_file.replace('.tsv', '.db')): + cursor = Metadata_SQLITE_Connector(metadata_file) + cursor.close_sql_connection() + + def check_sql_databases(self, ref_dbs): + broken_refs = set() + broken_ids = {} + for db in ref_dbs: + yellow(f'Checking {db}metadata.db', flush=True, file=self.redirect_verbose) + cursor = Metadata_SQLITE_Connector(f'{db}metadata.tsv') + db_res = cursor.test_database() + if db_res: broken_refs.add(db) + if db_res: broken_ids[db] = db_res + cursor.close_sql_connection() + for db in broken_ids: + red(f'Failed SQL check in {db} for the following IDs:\n{broken_ids[db]}', flush=True, + file=self.redirect_verbose) + if not broken_refs: + green('------------------------------------------', flush=True, file=self.redirect_verbose) + green('-------------SQL CHECK PASSED-------------', flush=True, file=self.redirect_verbose) + green('------------------------------------------', flush=True, file=self.redirect_verbose) + else: + red('------------------------------------------', flush=True, file=self.redirect_verbose) + red('-------------SQL CHECK FAILED-------------', flush=True, file=self.redirect_verbose) + red('------------------------------------------', flush=True, file=self.redirect_verbose) + raise SQLCheckNotPassed + + def check_installation(self, verbose=True, check_sql=False): + # we use the verbose mode when running the check_installation directly + self.compile_sql_metadata() + self.passed_check = True + ref_dbs = set() + if not cython_compiled(): + self.passed_check = False + if verbose: + red('Cython needs to be compiled!', flush=True, file=self.redirect_verbose) + else: + if verbose: + green('Cython correctly compiled!', flush=True, file=self.redirect_verbose) + res = [] + res = self.check_installation_extras(res, verbose) + + if verbose: + yellow('Checking references installation', flush=True, file=self.redirect_verbose) + requirements = { + self.mantis_paths['pfam']: ['metadata.tsv'], + self.mantis_paths['tcdb']: ['metadata.tsv'], + self.mantis_paths['kofam']: ['metadata.tsv'], + } + # per tax level FOR EGGNOG + if self.mantis_paths['NOG'][0:2] != 'NA': + tax_refs = self.get_taxon_refs(db='NOG', folder=True) + if not tax_refs: + if verbose: + red(F'Failed installation check on [path unavailable]: {self.mantis_paths["NOG"]}', + flush=True, + file=self.redirect_verbose) + res.append(self.mantis_paths['NOG']) + for tax_ref_folder in tax_refs: + self.check_installation_folder(tax_ref_folder, res, verbose=False, extra_requirements=['metadata.tsv']) + ref_dbs.add(tax_ref_folder) + nogt_check = [i for i in res if self.mantis_paths['NOG'] in i] + if not nogt_check: + if verbose: green('Passed installation check on: ' + self.mantis_paths['NOG'], flush=True, + file=self.redirect_verbose) + + # per tax level FOR NCBI + if self.mantis_paths['NCBI'][0:2] != 'NA': + # checking those already present + tax_refs = self.get_taxon_refs(db='NCBI', folder=True) + if not tax_refs: + if verbose: + red(F'Failed installation check on [path unavailable]: self.mantis_paths["NCBI"]', + flush=True, + file=self.redirect_verbose) + res.append(self.mantis_paths['NCBI']) + for tax_ref_folder in tax_refs: + # we skip the taxon 1 since it has no hmms + self.check_installation_folder(tax_ref_folder, res, verbose=False, extra_requirements=['metadata.tsv']) + ref_dbs.add(tax_ref_folder) + + ncbi_check = [i for i in res if self.mantis_paths['NCBI'] in i] + if not ncbi_check: + if verbose: green('Passed installation check on: ' + self.mantis_paths['NCBI'], flush=True, + file=self.redirect_verbose) + for ref_folder in self.compile_refs_list(folder=True): + if ref_folder in requirements: + self.check_installation_folder(ref_folder, res, verbose, extra_requirements=requirements[ref_folder]) + else: + self.check_installation_folder(ref_folder, res, verbose) + ref_dbs.add(ref_folder) + if res: + self.passed_check = False + fail_res = '' + for i in res: + fail_res += f'{i}\n' + if verbose: + red(f'Installation check failed on:\n{fail_res}', flush=True, file=self.redirect_verbose) + if self.passed_check: + if verbose: + green('------------------------------------------', flush=True, file=self.redirect_verbose) + green('--------INSTALLATION CHECK PASSED!--------', flush=True, file=self.redirect_verbose) + green('------------------------------------------', flush=True, file=self.redirect_verbose) + else: + print_cyan('Installation check passed', flush=True, file=self.redirect_verbose) + + else: + if verbose: + yellow('------------------------------------------', flush=True, file=self.redirect_verbose) + red('--------INSTALLATION CHECK FAILED!--------', flush=True, file=self.redirect_verbose) + yellow('------------------------------------------', flush=True, file=self.redirect_verbose) + else: + print_cyan('Installation check failed', flush=True, file=self.redirect_verbose) + raise InstallationCheckNotPassed + if check_sql: self.check_sql_databases(ref_dbs) + + def get_custom_refs_paths(self, folder=False): + try: + custom_refs_folders = os.listdir(self.mantis_paths['custom']) + for potential_ref_folder in custom_refs_folders: + try: + files = os.listdir(self.mantis_paths['custom'] + potential_ref_folder) + for potential_file in files: + if potential_file.endswith('.hmm') or potential_file.endswith('.dmnd'): + if folder: + try: + yield add_slash(self.mantis_paths['custom'] + potential_ref_folder) + except GeneratorExit: + return '' + else: + try: + yield add_slash(self.mantis_paths['custom'] + potential_ref_folder) + potential_file + except GeneratorExit: + return '' + except: + pass + except: + print( + 'Custom references folder is missing, did you correctly set the path? If path is not set make sure you didn\'t delete the custom_ref folder!', + flush=True, file=self.redirect_verbose) + self.passed_check = False + return + with open(self.config_file, 'r') as file: + line = file.readline() + while line: + if line[0] != '#': + if 'custom_ref=' in line: + line = line.strip('\n') + ref_path = line.replace('custom_ref=', '') + if not (ref_path.endswith('.hmm') or ref_path.endswith('.dmnd')): + if os.path.isdir(ref_path): + for inner_file in os.listdir(ref_path): + if inner_file.endswith('.hmm') or inner_file.endswith('.dmnd'): + ref_path = add_slash(ref_path) + inner_file + if folder: + try: + yield add_slash(SPLITTER.join(ref_path.split(SPLITTER)[:-1])) + except GeneratorExit: + return '' + else: + try: + yield ref_path + except GeneratorExit: + return '' + line = file.readline() + + def get_taxon_ref_path(self, taxon_id, db): + tax_refs = self.get_local_ref_taxon_ids(db=db) + if taxon_id in tax_refs: + if db == 'NOG' and self.nog_db == 'dmnd': + return add_slash(self.mantis_paths[db] + taxon_id) + f'{taxon_id}.dmnd' + else: + return add_slash(self.mantis_paths[db] + taxon_id) + f'{taxon_id}_merged.hmm' + else: + return None + + def get_ref_taxon_ids(self, db): + res = set() + if not file_exists(self.mantis_paths[db]): return res + if db == 'NOG': + available_taxon_ids = self.get_taxon_ids_eggNOG() + if self.mantis_nogt_tax: + for tax_id in self.mantis_nogt_tax: + if tax_id in available_taxon_ids: + res.add(tax_id) + return res + else: + return set(available_taxon_ids) + else: + for i in os.listdir(self.mantis_paths[db]): + if re.search('\d+', i): res.add(i) + return res + + def get_local_ref_taxon_ids(self, db): + res = set() + if file_exists(self.mantis_paths[db]): + if db == 'NOG': + if self.mantis_nogt_tax: + for i in self.mantis_nogt_tax: + res.add(i) + for i in os.listdir(self.mantis_paths[db]): + if re.search('\d+', i): res.add(i) + return res + + def get_taxon_refs(self, db, folder=False): + # locally available taxon ids + local_taxon_ids = self.get_local_ref_taxon_ids(db) + # all taxon ids + taxon_ids = self.get_ref_taxon_ids(db) + res = [] + for t in taxon_ids: + if t in local_taxon_ids: + if folder: + res.append(add_slash(self.mantis_paths[db] + t)) + else: + if self.nog_db == 'hmm': + res.append(add_slash(self.mantis_paths[db] + t) + f'{t}_merged.hmm') + else: + res.append(add_slash(self.mantis_paths[db] + t) + f'{t}_merged.dmnd') + global_folder = add_slash(self.mantis_paths[db] + db + 'G') + if folder: + if file_exists(global_folder): + res.append(global_folder) + else: + if self.nog_db == 'hmm': + if file_exists(f'{global_folder}{db}G_merged.hmm'): + res.append(f'{global_folder}{db}G_merged.hmm') + else: + if file_exists(f'{global_folder}{db}G_merged.dmnd'): + res.append(f'{global_folder}{db}G_merged.dmnd') + + return res + + def processes_handler(self, target_worker_function, worker_count, add_sentinels=True): + ''' + this will first generate one process per worker, then we add sentinels to the end of the list which will basically tell us when the queue is empty + if we need to add new work (e.g. when doing taxa annotation) we just add the new work to the start of the list + ''' + # os.getpid to add the master_pid + processes = [Process(target=target_worker_function, + args=(self.queue, os.getpid(),)) for _ in range(worker_count)] + # adding sentinel record since queue can be signaled as empty when its really not + if add_sentinels: + for _ in range(worker_count): self.queue.append(None) + for process in processes: + process.start() + # we could manage the processes memory here with a while cycle + for process in processes: + process.join() + # exitcode 0 for sucessful exists + if process.exitcode != 0: + sleep(5) + print('Ran into an issue, check the log for details. Exitting!') + os._exit(1) + + +if __name__ == '__main__': + p = Assembler(mantis_config='/media/HDD/data/mantis_references/MANTIS.cfg') diff --git a/mantis/consensus.py b/mantis/consensus.py new file mode 100644 index 0000000..a26e41b --- /dev/null +++ b/mantis/consensus.py @@ -0,0 +1,736 @@ +try: + from mantis.assembler import * +except: + from assembler import * + +try: + from mantis.cython_src.get_non_overlapping_hits import get_non_overlapping_hits +except: + if not cython_compiled(): + compile_cython() + try: + from mantis.cython_src.get_non_overlapping_hits import get_non_overlapping_hits + except: + kill_switch(CythonNotCompiled, f'{MANTIS_FOLDER}mantis{SPLITTER}utils.py') + + +class Consensus(UniFunc_wrapper): + + def __init__(self): + UniFunc_wrapper.__init__(self) + + def get_ref_weight(self, ref): + ''' + NCBI + NOG + Pfam-A + kofam + ''' + if re.search('NCBI[GT]', ref): + corrected_ref = 'ncbi' + elif re.search('NOG[GT]', ref): + corrected_ref = 'nog' + elif 'Pfam-A' in ref: + corrected_ref = 'pfam' + elif ref == 'kofam_merged': + corrected_ref = 'kofam' + else: + corrected_ref = str(ref) + + if corrected_ref in self.mantis_ref_weights: + return self.mantis_ref_weights[corrected_ref] + else: + return self.mantis_ref_weights['else'] + + # this will add the descriptions from the respective GO IDs, but we only add them if we dont have too many GO IDs + def add_from_go_obo(self, query_dict): + for ref_file in query_dict: + for ref_hit in query_dict[ref_file]: + clean_query_identifiers = self.remove_unspecific_identifiers( + query_dict[ref_file][ref_hit]['identifiers']) + temp_ref_hit_dict_identifiers = set(query_dict[ref_file][ref_hit]['identifiers']) + temp_ref_hit_dict_description = set(query_dict[ref_file][ref_hit]['description']) + for id_str in clean_query_identifiers: + if 'go:' in id_str[0:3]: + if id_str in self.go_dict: + extra_info = self.go_dict[id_str] + for go_id in extra_info['identifiers']: + temp_ref_hit_dict_identifiers.add(go_id) + for syn in extra_info['synonyms']: + temp_ref_hit_dict_description.add(syn) + # we use the temp info to help matching annotations, but we dont want to add them to the final annotations + query_dict[ref_file][ref_hit]['temp_identifiers'] = set(temp_ref_hit_dict_identifiers) + for id_str in temp_ref_hit_dict_identifiers: + # some annotations have unspecific ec numbers, if so, we remove them from the similarity analysis + if 'enzyme_ec:' in id_str and '-' in id_str: + query_dict[ref_file][ref_hit]['temp_identifiers'].remove(id_str) + query_dict[ref_file][ref_hit]['temp_description'] = temp_ref_hit_dict_description + + def read_interpreted_annotation(self, interpreted_annotation_tsv): + with open(interpreted_annotation_tsv) as file: + line = file.readline() + line = file.readline() + dict_annotations = {} + while line: + line = line.strip('\n') + line = line.strip() + line = line.split('\t') + query, ref_file, ref_hit, ref_hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line[0:13] + annotations = line[14:] + if query not in dict_annotations: dict_annotations[query] = {'query_len': int(query_len), + 'ref_files': {}} + # since the same hmm file can have different hits for the same query + if ref_file not in dict_annotations[query]['ref_files']: + dict_annotations[query]['ref_files'][ref_file] = {} + dict_annotations[query]['ref_files'][ref_file][ref_hit] = {'identifiers': set(), 'description': set(), + 'evalue': float(evalue), + 'bitscore': float(bitscore), + 'direction': direction, + 'ref_hit_accession': ref_hit_accession, + 'query_start': int(query_start), + 'query_end': int(query_end), + 'ref_start': int(ref_start), + 'ref_end': int(ref_end), + 'ref_len': int(ref_len), + } + for annot in annotations: + if 'description:' in annot[0:len('description:')]: + annot_text = annot.split(':')[1:] + annot_text = ' '.join(annot_text) + dict_annotations[query]['ref_files'][ref_file][ref_hit]['description'].add(annot_text) + elif 'is_essential_gene:' in annot: + dict_annotations[query]['is_essential_gene'] = True + else: + dict_annotations[query]['ref_files'][ref_file][ref_hit]['identifiers'].add(annot) + line = file.readline() + for query in dict_annotations: + self.add_from_go_obo(dict_annotations[query]['ref_files']) + return dict_annotations + + def generate_gff_line_consensus(self, query, + dict_annotations, + is_essential, + consensus_hits, + total_hits, + ref_files_consensus, + ref_names_consensus): + # https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + # verified with http://genometools.org/cgi-bin/gff3validator.cgi + for ref_file in dict_annotations: + ref_annotations = dict_annotations[ref_file] + for ref_id in ref_annotations: + ref_id_annotations = dict_annotations[ref_file][ref_id] + query_start = str(ref_id_annotations['query_start']) + query_end = str(ref_id_annotations['query_end']) + evalue = str(ref_id_annotations['evalue']) + direction = ref_id_annotations['direction'] + ref_start = ref_id_annotations['ref_start'] + ref_end = ref_id_annotations['ref_end'] + ref_len = ref_id_annotations['ref_len'] + hit_accession = ref_id_annotations['ref_hit_accession'] + hit_identifiers = ref_id_annotations['identifiers'] + hit_description = ref_id_annotations['description'] + gff_line = '\t'.join([query, 'Mantis', 'CDS', query_start, query_end, evalue, direction, '0']) + '\t' + if hit_accession != '-': + attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end};Alias={hit_accession}' + else: + attributes = f'Name={ref_id};Target={ref_id} {ref_start} {ref_end}' + notes = f'Note=ref_file:{ref_file},ref_len:{ref_len}' + descriptions = [] + for d in hit_description: + descriptions.append(f'description:{d}') + if descriptions: + notes += ',' + ','.join(descriptions) + if is_essential: + notes += f',is_essential_gene:True' + + dbxref = [] + ontology_terms = [] + for i in hit_identifiers: + if not i.startswith('go:'): + dbxref.append(i) + else: + ontology_terms.append(i) + all_annotations = None + if dbxref and ontology_terms: + all_annotations = 'Dbxref=' + ','.join(dbxref) + ';' + 'Ontology_term=' + ','.join(ontology_terms) + elif dbxref and not ontology_terms: + all_annotations = 'Dbxref=' + ','.join(dbxref) + elif not dbxref and ontology_terms: + all_annotations = 'Ontology_term=' + ','.join(ontology_terms) + if all_annotations: + gff_line += ';'.join([attributes, notes, all_annotations]) + else: + gff_line += ';'.join([attributes, notes]) + + yield gff_line + + def yield_seq_regions(self, interpreted_annotation_tsv): + already_added = set() + with open(interpreted_annotation_tsv) as file: + file.readline() + line = file.readline() + while line: + line = line.strip('\n') + line = line.strip() + line = line.split('\t') + query, ref_file, ref_hit, ref_hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line[0:13] + if query not in already_added: + already_added.add(query) + yield f'##sequence-region {query} 1 {query_len}\n' + line = file.readline() + + def generate_consensus(self, interpreted_annotation_tsv, stdout_file_path=None): + dict_annotations = self.read_interpreted_annotation(interpreted_annotation_tsv) + self.query_match_hits = {} + for query in dict_annotations: + hits = [] + query_dict = dict_annotations[query] + query_len = query_dict['query_len'] + ref_files = query_dict['ref_files'] + for ref_file in ref_files: + for ref_hit in ref_files[ref_file]: + hits.append([ref_file, ref_hit, ref_files[ref_file][ref_hit]]) + if self.domain_algorithm == 'heuristic': + best_hits = self.get_best_hits_approximation(list(hits), sorting_class='consensus', + sorting_type=self.sorting_type) + elif self.domain_algorithm == 'bpo': + best_hits = self.get_lowest_hit(list(hits), sorting_class='consensus', sorting_type=self.sorting_type) + else: + best_hits = self.get_best_cython_consensus(list(hits), + query_len, + query, + stdout_file_path=stdout_file_path) + # expanding hits + consensus_hits, total_hits, ref_files_consensus, ref_names_consensus = self.expand_best_combination(best_hits, dict_annotations[query]['ref_files']) + if 'is_essential_gene' in dict_annotations[query]: + is_essential = True + else: + is_essential = False + consensus_line = self.generate_consensus_line(query, + dict_annotations[query]['ref_files'], + is_essential, + consensus_hits, + total_hits, + ref_files_consensus, + ref_names_consensus) + gff_lines = [] + if self.output_gff: + gff_lines = self.generate_gff_line_consensus(query, + dict_annotations[query]['ref_files'], + is_essential, + consensus_hits, + total_hits, + ref_files_consensus, + ref_names_consensus) + yield consensus_line, gff_lines + + # same method as non_overlapping hits from Homology_processor , with some minor alterations to account for consensus + # @timeit_function + def get_best_cython_consensus(self, hits, query_len, query, stdout_file_path=None): + try: + best_hits = self.get_best_hits_Consensus(list(hits), query_len) + except (TimeoutError, RecursionError): + best_hits = self.get_best_hits_approximation(list(hits), sorting_class='consensus', sorting_type='evalue') + stdout_file = open(stdout_file_path, 'a+') + print(f'Query {query} was approximated during consensus generation', flush=True, file=stdout_file) + stdout_file.close() + return best_hits + + def query_hits_to_cython_Consensus(self, query_hits): + conversion_dict = {} + res = set() + for hit_i in range(len(query_hits)): + ref_file = query_hits[hit_i][0] + ref_hit = query_hits[hit_i][1] + hit_info = query_hits[hit_i][2] + query_start, query_end = recalculate_coordinates(hit_info['query_start'], + hit_info['query_end'], + self.overlap_value) + res.add(tuple([ + hit_i, + query_start, + query_end, + ref_hit + ])) + conversion_dict[hit_i] = [ref_file, ref_hit, hit_info] + return res, conversion_dict + + # this is for heuristic and bpo + def sort_scaled_hits(self, query_hits, sorting_type): + if not query_hits: + return query_hits + self.add_scaled_values(query_hits) + # this sorting is similar to self.sort_hits but is a bit more specific + sorted_hits = sorted(query_hits, key=lambda k: k[2][f'scaled_{sorting_type}'], reverse=True) + res = [] + # then we separate by sorting value + sorted_hits_groups = [] + c = 0 + for i in sorted_hits: + hit_value = i[2][f'scaled_{sorting_type}'] + if not sorted_hits_groups: + sorted_hits_groups.append([]) + current = hit_value + if hit_value == current: + sorted_hits_groups[c].append(i) + else: + sorted_hits_groups.append([i]) + c += 1 + current = hit_value + sec_sorting_type = 'bitscore' if sorting_type == 'evalue' else 'evalue' + for sg in sorted_hits_groups: + temp = sorted(sg, key=lambda k: k[2][f'scaled_{sec_sorting_type}'], reverse=True) + res.extend(temp) + for i in res: + i[2].pop('scaled_evalue') + i[2].pop('scaled_bitscore') + return res + + def get_min_max_alt_alg(self, query_hits): + all_bitscore, all_evalue = [], [] + for hit in query_hits: + ref_file, ref_hit, hit_info = hit + if hit_info['evalue']: + all_evalue.append(hit_info['evalue']) + if hit_info['bitscore']: + all_bitscore.append(hit_info['bitscore']) + min_val_bitscore, max_val_bitscore = log10(min(all_bitscore)), log10(max(all_bitscore)) + if all_evalue: + max_val_evalue, min_val_evalue = log10(min(all_evalue)), log10(max(all_evalue)) + else: + min_val_evalue, max_val_evalue = 0, 0 + return min_val_evalue, max_val_evalue, min_val_bitscore, max_val_bitscore + + def add_scaled_values(self, query_hits): + min_val_evalue, max_val_evalue, min_val_bitscore, max_val_bitscore = self.get_min_max_alt_alg(query_hits) + for hit in query_hits: + ref_file, ref_hit, hit_info = hit + hit_weight = self.get_ref_weight(ref_file) + hit_annotation_score = self.get_annotation_score(hit_info) + if hit_info['evalue']: + hit_info['scaled_evalue'] = min_max_scale(log10(hit_info['evalue']), + min_val_evalue, + max_val_evalue) + 0.01 + else: + hit_info['scaled_evalue'] = 2 + hit_info['scaled_bitscore'] = min_max_scale(log10(hit_info['bitscore']), + min_val_bitscore, + max_val_bitscore) + 0.01 + + hit_info['scaled_evalue'] *= (hit_annotation_score + hit_weight) / 2 + hit_info['scaled_bitscore'] *= (hit_annotation_score + hit_weight) / 2 + + def get_annotation_score(self, hit_info): + if hit_info['temp_identifiers'] and hit_info['temp_description']: + annotation_score = 1 + elif hit_info['temp_identifiers']: + annotation_score = 0.75 + elif hit_info['temp_description']: + annotation_score = 0.5 + else: + annotation_score = 0.25 + return annotation_score + + def get_intersecting_sources(self, combo, query_hits): + res = 0 + for hit in combo: + hit_source, hit_name, hit_info = hit + for q_hit in query_hits: + q_hit_source, q_hit_name, q_hit_info = q_hit + if hit_source != q_hit_source: + if self.is_hit_match(hit_info, hit_name, hit_source, q_hit_info, q_hit_name, q_hit_source): + res += 1 + return (res + len(combo)) / len(query_hits) + + def get_combo_score_Consensus(self, combo, query_length, min_val, max_val, ref_sources): + ''' + ref_sources= number of intersecting sources divided by number of hits from all sources + metrics for consensus: + intersecting sources = amount of different sources that intersect with the combo + combo coverage = percentage of the sequence covered by the combo + combo evalue = summed scaled evalue of the combo. Is averaged + combo weight = summed ref weight of the combo. Better sources should give better combinations of hits . Is averaged + combo annotation score = score of the annotation (0.25 if just the ref name, 0.5 if just the description, 0.75 if just identifiers, 1 if identifiers and description + ''' + average_value = 0 + average_hit_coverage = 0 + total_weight = 0 + annotation_score = 0 + hit_ranges = [] + + for hit in combo: + ref_file, ref_hit, hit_info = hit + hit_start, hit_end, ref_start, ref_end = hit_info['query_start'], hit_info['query_end'], \ + hit_info['ref_start'], hit_info['ref_end'] + ref_len = hit_info['ref_len'] + hit_evalue = hit_info['evalue'] + hit_bitscore = hit_info['bitscore'] + hit_coverage = (hit_end - hit_start + 1) / query_length + # ref_coverage = (ref_end - ref_start + 1) / ref_len + average_hit_coverage += hit_coverage + # lower is best + if self.sorting_type == 'evalue': + if hit_evalue: + scaled_value = min_max_scale(log10(hit_evalue), min_val, max_val) + 0.01 + else: + scaled_value = 2 + # higher is best + elif self.sorting_type == 'bitscore': + if hit_bitscore: + scaled_value = min_max_scale(log10(hit_bitscore), min_val, max_val) + 0.01 + else: + scaled_value = 0 + average_value += scaled_value + + hit_ranges.append([hit_start, hit_end]) + total_weight += self.get_ref_weight(ref_file) + annotation_score += self.get_annotation_score(hit_info) + + # sources score + sources_score = (ref_sources + annotation_score / len(combo) + total_weight / len(combo)) / 3 + # sequence hit quality + average_value = average_value / len(combo) + # average coverage of all hits + average_hit_coverage /= len(combo) + # coverage of combination + combination_coverage = get_combination_ranges(hit_ranges) + combination_coverage /= query_length + if self.best_combo_formula == 1: # default + combo_score = sources_score * average_value + elif self.best_combo_formula == 2: + combo_score = sources_score * average_value * combination_coverage * average_hit_coverage + elif self.best_combo_formula == 3: + combo_score = (sources_score * average_value * combination_coverage * average_hit_coverage) ** (1 / 4) + elif self.best_combo_formula == 4: + combo_score = sources_score * average_value * combination_coverage + elif self.best_combo_formula == 5: + combo_score = (sources_score * average_value * combination_coverage) ** (1 / 3) + elif self.best_combo_formula == 6: + combo_score = (sources_score * average_value + average_hit_coverage + combination_coverage) / 3 + elif self.best_combo_formula == 7: + combo_score = (sources_score * 2 * average_value + average_hit_coverage + combination_coverage) / 4 + elif self.best_combo_formula == 8: + combo_score = (sources_score * 2 * average_value + combination_coverage) / 3 + elif self.best_combo_formula == 9: + combo_score = (sources_score * 3 * average_value + average_hit_coverage + combination_coverage) / 5 + elif self.best_combo_formula == 10: + combo_score = (sources_score * 3 * average_value + combination_coverage) / 4 + elif self.best_combo_formula == 11: + combo_score = average_value * sources_score * (combination_coverage) ** 2 * (average_hit_coverage) ** 2 + elif self.best_combo_formula == 12: + combo_score = average_value * sources_score * (combination_coverage) ** 2 + + return combo_score + + def get_best_hits_Consensus(self, query_hits, query_length): + cython_hits, conversion_dict = self.query_hits_to_cython_Consensus(query_hits) + cython_possible_combos = get_non_overlapping_hits(cython_hits, time_limit=self.time_limit) + # sometimes this calculation is not feasible (when we have too many small hits and the query sequence is too long, the calculation of conbinations would take forever) + if not cython_possible_combos: return None + best_combo_score = 0 + best_combo = None + min_val, max_val = self.get_min_max_dfs(cython_possible_combos, conversion_dict, sorting_class='consensus') + for len_combo in cython_possible_combos: + if len_combo: + for cython_combo in cython_possible_combos[len_combo]: + combo = self.cython_to_query_hits(cython_combo, conversion_dict) + # we benefit the combo which intersect with other sources + ref_sources = self.get_intersecting_sources(combo, query_hits) + combo_score = self.get_combo_score_Consensus(combo, query_length, min_val, max_val, ref_sources) + if best_combo is None: + best_combo = combo + best_combo_score = combo_score + elif combo_score > best_combo_score: + best_combo_score = combo_score + best_combo = combo + return best_combo + + def is_overlap_Consensus(self, temp_queries, current_query): + # the coordinates here already take into account the overlap value, so even if the y set is small or empty, it doesnt matter + if not temp_queries or not current_query: + return False + y_start, y_end = recalculate_coordinates(current_query[2]['query_start'], + current_query[2]['query_end'], + self.overlap_value) + y = set(range(y_start, y_end)) + for t in temp_queries: + if t[1] == current_query[1]: + return True + x_start, x_end = recalculate_coordinates(t[2]['query_start'], + t[2]['query_end'], + self.overlap_value) + x = set(range(x_start, x_end)) + res = x.intersection(y) + if res: + return True + return False + + # @timeit_function + def expand_best_combination(self, best_hits, query_dict): + hits_merged = set() + ref_files_consensus = [] + ref_names_consensus = [] + for best_hit in best_hits: + best_hit_file, best_hit_name, best_hit_info = best_hit + hits_merged.add(best_hit_name) + ref_files_consensus.append(best_hit_file) + ref_names_consensus.append(best_hit_name) + temp_best_hit_info = dict(best_hit_info) + first_check = False + # iterations might change already iterated over best_hit_info, this way we check if there is any info added, if so, we repeat the cycle one more otherwise we return + while temp_best_hit_info != best_hit_info or not first_check: + for ref_file in query_dict: + for hit_to_test in query_dict[ref_file]: + if self.is_hit_match(query_dict[ref_file][hit_to_test], hit_to_test, ref_file, best_hit_info, + best_hit_name, best_hit_file): + self.add_to_hit(query_dict[ref_file][hit_to_test], best_hit_info) + hits_merged.add(hit_to_test) + ref_files_consensus.append(ref_file) + ref_names_consensus.append(hit_to_test) + first_check = True + temp_best_hit_info = dict(best_hit_info) + # checking how many hits we managed to merge out of all the hits - coverage_consensus + total_hits = 0 + for ref_file in query_dict: + for i in query_dict[ref_file]: + total_hits += 1 + return str(len(hits_merged)), str(total_hits), ref_files_consensus, ref_names_consensus + + def is_nlp_match(self, hit1_info_description, hit2_info_description): + if self.no_unifunc: + return False + for hit1_d in hit1_info_description: + for hit2_d in hit2_info_description: + score = self.get_similarity_score(hit1_d, hit2_d, only_return=True, verbose=False) + if score > self.nlp_threshold: + return True + return False + + # eggnog is not specific enough with the go identifiers, so if there is more than one, we dont use it for the consensus + def remove_unspecific_identifiers(self, identifiers, specificity_threshold=10): + res = set() + dict_counter = {} + for i in identifiers: + link_type = i.split(':')[0] + if link_type not in dict_counter: dict_counter[link_type] = 0 + dict_counter[link_type] += 1 + for i in identifiers: + link_type = i.split(':')[0] + if dict_counter[link_type] <= specificity_threshold: + res.add(i) + return res + + def is_overlap_coordinates(self, hit1, hit2): + if hit1[1] - hit1[0] < hit2[1] - hit2[0]: + smaller_hit = hit1 + else: + smaller_hit = hit2 + min_len = smaller_hit[1] - smaller_hit[0] + 1 + hit1_set = set(range(hit1[0], hit1[1] + 1)) + hit2_set = set(range(hit2[0], hit2[1] + 1)) + intersection_hits = hit1_set.intersection(hit2_set) + if len(intersection_hits) / min_len > self.minimum_consensus_overlap: + return True + return False + + # this makes mantis more efficient since it saves matches in memory, however it also makes it more ram intensive + def is_hit_match(self, hit1_info, hit1_name, hit1_source, hit2_info, hit2_name, hit2_source): + if hit1_source == hit2_source and hit1_name == hit2_name: + return False + if self.no_consensus_expansion: + return False + # cleaning up memory + if float(psutil.virtual_memory().percent) > 95: self.query_match_hits = {} + # to avoid adding duplicate entries + sorted_hits = sorted([hit1_source, hit2_source]) + if hit1_source == sorted_hits[0]: + first_hit_source = hit1_source + first_hit_name = hit1_name + second_hit_source = hit2_source + second_hit_name = hit2_name + else: + first_hit_source = hit2_source + first_hit_name = hit2_name + second_hit_source = hit1_source + second_hit_name = hit1_name + dict_key = first_hit_source + '_' + first_hit_name + '__' + second_hit_source + '_' + second_hit_name + sufficient_coord_overlap = self.is_overlap_coordinates([hit1_info['query_start'], hit1_info['query_end']], + [hit2_info['query_start'], hit2_info['query_end']]) + if dict_key in self.query_match_hits: + if sufficient_coord_overlap: + return self.query_match_hits[dict_key] + else: + return False + # now to actually check + hit1_info_description = hit1_info['temp_description'] + hit1_info_identifiers = hit1_info['temp_identifiers'] + hit2_info_description = hit2_info['temp_description'] + hit2_info_identifiers = hit2_info['temp_identifiers'] + temp_hit1_info_identifiers = self.remove_unspecific_identifiers(hit1_info_identifiers) + temp_hit2_info_identifiers = self.remove_unspecific_identifiers(hit2_info_identifiers) + + if sufficient_coord_overlap: + if temp_hit1_info_identifiers.intersection(temp_hit2_info_identifiers): + self.query_match_hits[dict_key] = True + return True + elif self.is_nlp_match(hit1_info_description, hit2_info_description): + self.query_match_hits[dict_key] = True + return True + else: + self.query_match_hits[dict_key] = False + return False + else: + return False + + def add_to_hit(self, hit_to_test_info, best_hit_info): + best_hit_info['identifiers'].update(hit_to_test_info['identifiers']) + best_hit_info['description'].update(hit_to_test_info['description']) + + # to remove bad descriptions + def remove_trash_descriptions(self, all_descriptions): + res = set() + for d in all_descriptions: + current_d = d.strip().lower() + if d.strip().lower() not in [ + 'enzyme', + 'domain', + 'protein', + 'unknown function', + 'domain of unknown function', + 'protein of unknown function', + 'uncharacterised protein family', + 'unknown protein family', + 'uncharacterized protein', + 'uncharacterised protein', + 'uncharacterised conserved protein', + 'uncharacterized conserved protein', + 'hypothetical protein', + ]: + if re.search('(protein|domain|domian|family|repeat|short repeats|region) (of|with) (unknown|unknwon) function(\s\(?[dp]uf\d{2,}\)?)?', current_d): + pass + else: + res.add(d) + return res + + # to remove redundant descriptions + def remove_redundant_descriptions(self, all_descriptions): + res = set() + already_added = set() + unspecific_tokens = ['enzyme', 'protein', 'domain'] + all_descriptions = sorted(all_descriptions) + for d in all_descriptions: + test = d.lower() + for p in set(punctuation): + test = test.replace(p, '') + test = test.replace('\'', '') + test = test.replace('\"', '') + test = test.strip() + test = test.split(' ') + test = [i.strip() for i in test if i not in unspecific_tokens] + test = ' '.join(test) + + if test not in already_added: + res.add(d) + already_added.add(test) + res = sorted(res) + return res + + def sort_ref_files_and_hits(self, ref_files_consensus, ref_names_consensus): + res = {} + for i in range(len(ref_files_consensus)): + r_file = ref_files_consensus[i] + if r_file not in res: res[r_file] = set() + res[r_file].add(ref_names_consensus[i]) + ref_files = [] + ref_hits = [] + + for r_file in sorted(res): + for r_hit in sorted(res[r_file]): + ref_files.append(r_file) + ref_hits.append(r_hit) + + ref_files = ';'.join(ref_files) + ref_hits = ';'.join(ref_hits) + return ref_files, ref_hits + + def generate_consensus_line(self, query, query_dict, is_essential, consensus_hits, total_hits, ref_files_consensus, + ref_names_consensus): + ref_files, ref_hits = self.sort_ref_files_and_hits(ref_files_consensus, ref_names_consensus) + # consensus_coverage is a str with consensus sources/all sources, better as a str instead of float as its easier to understand + row_start = [query, ref_files, ref_hits, consensus_hits, total_hits, '|'] + row_start = [str(i) for i in row_start] + all_descriptions = set() + all_identifiers = set() + for ref_file in ref_files_consensus: + for ref_hit_name in ref_names_consensus: + if ref_hit_name in query_dict[ref_file]: + description = query_dict[ref_file][ref_hit_name]['description'] + identifiers = query_dict[ref_file][ref_hit_name]['identifiers'] + all_identifiers.update(identifiers) + all_descriptions.update(description) + res = [] + sorted_identifiers = sorted(all_identifiers) + for link in sorted_identifiers: + if 'enzyme_ec' in link: + if link not in res: + res.append(link) + # so that description always comes in the end + for link in sorted_identifiers: + if 'kegg_map_lineage' not in link: + if link not in res: + res.append(link) + for link in sorted_identifiers: + if 'kegg_map_lineage' in link: + if link not in res: + res.append(link) + if is_essential: + res.append('is_essential_gene:True') + clean_descriptions = self.remove_redundant_descriptions(all_descriptions) + clean_descriptions = self.remove_trash_descriptions(clean_descriptions) + for link in clean_descriptions: + if 'description:' + link not in res: + res.append('description:' + link) + res = sorted(res) + res = row_start + res + return res + + def generate_consensus_output(self, interpreted_annotation_tsv, consensus_annotation_tsv, stdout_file_path=None): + first_line = ['Query', + 'Ref_Files', + 'Ref_Hits', + 'Consensus_hits', + 'Total_hits', + '|', + 'Links', + ] + first_line = '\t'.join(first_line) + output_file = open(consensus_annotation_tsv, 'w+') + output_file.write(first_line + '\n') + + gff_file = None + gff_file_path = consensus_annotation_tsv.replace('.tsv', '.gff') + if self.output_gff: + gff_file = open(gff_file_path, 'w+') + gff_file.write('##gff-version 3' + '\n') + + consensus_annotation = self.generate_consensus(interpreted_annotation_tsv, stdout_file_path=stdout_file_path) + if self.output_gff: + seq_regions = self.yield_seq_regions(interpreted_annotation_tsv) + for sr in seq_regions: + gff_file.write(sr) + + for consensus_query, gff_lines in consensus_annotation: + line = '\t'.join(consensus_query) + output_file.write(line + '\n') + if self.output_gff: + for gff_l in gff_lines: + gff_file.write(gff_l + '\n') + + if gff_file: + gff_file.close() + output_file.close() + + +if __name__ == '__main__': + m = Consensus() diff --git a/mantis/database_generator.py b/mantis/database_generator.py new file mode 100644 index 0000000..c65db50 --- /dev/null +++ b/mantis/database_generator.py @@ -0,0 +1,1434 @@ +import re + +try: + from mantis.exceptions import * + from mantis.utils import * + from mantis.unifunc_wrapper import UniFunc_wrapper + from mantis.uniprot_api import (submit_id_mapping, + check_id_mapping_results_ready, + get_id_mapping_results_link, + get_id_mapping_results_search) +except: + from exceptions import * + from utils import * + from unifunc_wrapper import UniFunc_wrapper + +if not unifunc_downloaded(): + download_unifunc() + + +class Database_generator(UniFunc_wrapper): + + ##################### Main function + @timeit_class + def setup_databases(self): + if not cython_compiled(): + compile_cython() + if not unifunc_downloaded(): + download_unifunc() + self.output_folder = f'{MANTIS_FOLDER}setup_databases/' + self.mantis_out = f'{self.output_folder}Mantis.out' + if file_exists(self.mantis_out): + os.remove(self.mantis_out) + Path(self.mantis_paths['default']).mkdir(parents=True, exist_ok=True) + Path(self.output_folder).mkdir(parents=True, exist_ok=True) + dbs_list = [ + 'ncbi_res', + 'pfam' if self.mantis_paths['pfam'][0:2] != 'NA' else None, + 'kofam' if self.mantis_paths['kofam'][0:2] != 'NA' else None, + 'tcdb' if self.mantis_paths['tcdb'][0:2] != 'NA' else None, + 'NCBI' if self.mantis_paths['NCBI'][0:2] != 'NA' else None, + ] + if self.use_taxonomy: + dbs_list.insert(0, 'taxonomy') + # DOWNLOADING + self.prepare_queue_setup_databases(dbs_list) + # for unzipping tax specific hmms + passed_tax_check = self.prepare_queue_setup_databases_tax() + worker_count = estimate_number_workers_setup_database(len(self.queue), user_cores=self.user_cores) + if worker_count: + print(f'Database will be setup with {worker_count} workers!', + flush=True, + file=self.redirect_verbose) + + self.processes_handler(self.worker_setup_databases, worker_count) + print_cyan('Finished downloading all data!', flush=True, file=self.redirect_verbose) + + if not passed_tax_check: + # METADATA for NOG + if self.nog_db == 'hmm': + # for NOG HMM we just recompile incomplete ones + self.prepare_queue_extract_metadata_NOG_HMM() + worker_count = estimate_number_workers_setup_database(len(self.queue), + minimum_jobs_per_worker=2, + user_cores=self.user_cores) + if worker_count: print(f'NOG metadata will be extracted with {worker_count} workers!', + flush=True, + file=self.redirect_verbose) + self.processes_handler(self.worker_extract_NOG_metadata_HMM, worker_count) + print('NOGG will now be compiled', flush=True, file=self.redirect_verbose) + self.compile_NOGG_HMM() + remove_file(self.mantis_paths['NOG'] + 'Pfam-A.hmm.dat') + for file in ['eggnog.db', 'Pfam-A.hmm.dat']: + file_path = self.mantis_paths['NOG'] + file + remove_file(file_path) + else: + # for NOG diamond we recompile everything + seqs_taxons = self.compile_NOG_DMND() + self.prepare_queue_extract_fastas_DMND(seqs_taxons) + worker_count = estimate_number_workers_setup_database(len(self.queue), + minimum_jobs_per_worker=2, + user_cores=self.user_cores) + if worker_count: print(f'NOG diamond fastas will be extracted with {worker_count} workers!', + flush=True, + file=self.redirect_verbose) + self.processes_handler(self.worker_extract_fastas_DMND, worker_count) + print('Creating NOGT diamond databases', flush=True, file=self.redirect_verbose) + self.compile_NOGT_DMND() + self.compile_NOGG_DMND() + for file in ['eggnog_proteins.dmnd', 'eggnog_proteins.dmnd.gz', 'eggnog.db', 'eggnog_seqs.faa', + 'metadata.tsv', 'Pfam-A.hmm.dat']: + file_path = self.mantis_paths['NOG'] + file + remove_file(file_path) + # we also remove the 1 folder since it doesn't actually exist in NCBI, this taxa is just a general taxon which we already added to NOGG anyway + if file_exists(self.mantis_paths['NOG'] + '1/'): + shutil.rmtree(self.mantis_paths['NOG'] + '1/') + + # SPLITTING + if self.hmm_chunk_size: + print_cyan('Will now split data into chunks!', flush=True, file=self.redirect_verbose) + self.prepare_queue_split_hmms() + worker_count = estimate_number_workers_setup_database(len(self.queue), user_cores=self.user_cores) + print(f'Database will be split with {worker_count} workers!', flush=True, file=self.redirect_verbose) + self.processes_handler(self.worker_split_hmms, worker_count) + self.prepare_queue_press_custom_hmms() + worker_count = estimate_number_workers_setup_database(len(self.queue), user_cores=self.user_cores) + print(f'HMMs will be pressed with {worker_count} workers!', flush=True, file=self.redirect_verbose) + self.processes_handler(self.worker_press_custom_hmms, worker_count) + print('Preparing NLP Resources!', flush=True, file=self.redirect_verbose) + UniFunc_wrapper.__init__(self) + print_cyan('Finished setting up databases!', flush=True, file=self.redirect_verbose) + + ##################### Filling queue with jobs + + def prepare_queue_setup_databases(self, dbs_list): + for database in dbs_list: + if database: + self.queue.append([database, self.mantis_out]) + + def prepare_queue_setup_databases_tax(self): + if not self.use_taxonomy: + return True + stdout_file = open(self.mantis_out, 'a+') + passed_tax_check = True + if self.mantis_paths['NOG'][0:2] != 'NA': + Path(self.mantis_paths['NOG']).mkdir(parents=True, exist_ok=True) + list_taxon_ids = self.get_taxon_for_queue_NOGT() + if not file_exists(self.mantis_paths['NOG']): passed_tax_check = False + for taxon_id in list_taxon_ids: + if taxon_id != '1': + if not self.check_reference_exists('NOGT', taxon_id=taxon_id): + passed_tax_check = False + break + if self.nog_db == 'hmm': + if not passed_tax_check: + for taxon_id in list_taxon_ids: + self.queue.append(['NOG_HMM', taxon_id, self.mantis_out]) + else: + if not passed_tax_check: + self.queue.append(['NOG_DMND', self.mantis_out]) + + if not passed_tax_check: + if file_exists(self.mantis_paths['NOG']): + shutil.rmtree(self.mantis_paths['NOG']) + Path(self.mantis_paths['NOG']).mkdir(parents=True, exist_ok=True) + with open(self.mantis_paths['NOG'] + 'readme.md', 'w+') as file: + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + file.write(f'This data was downloaded on {datetime_str}') + self.download_and_unzip_eggnogdb() + self.download_pfam_id_to_acc() + + return passed_tax_check + + def prepare_queue_split_hmms(self): + print('Checking which HMMs need to be split, this may take a while...', flush=True, file=self.redirect_verbose) + taxon_nog_hmms = self.get_taxon_for_queue_NOG_split_hmms() + taxon_ncbi_hmms = self.get_taxon_for_queue_ncbi_split_hmms() + general_hmms = self.get_hmm_for_queue_split_hmms() + print('Will split: ', [get_path_level(i) for i in taxon_nog_hmms + taxon_ncbi_hmms + general_hmms], + flush=True, + file=self.redirect_verbose) + for hmm_path in taxon_nog_hmms + taxon_ncbi_hmms + general_hmms: + self.queue.append([hmm_path, self.mantis_out]) + + def prepare_queue_press_custom_hmms(self): + print('Checking which custom hmms need to be pressed', flush=True, file=self.redirect_verbose) + hmms_list = [] + for hmm_path in self.get_custom_refs_paths(folder=False): + if hmm_path.endswith('.hmm'): + hmm_folder = add_slash(SPLITTER.join(hmm_path.split(SPLITTER)[:-1])) + hmm_name = hmm_path.split(SPLITTER)[-1] + if 'chunks' not in os.listdir(hmm_folder): + if hmm_folder[0:2] != 'NA': + if not self.check_missing_chunk_files(hmm_name, hmm_folder): + hmms_list.append(hmm_path) + if hmms_list: + print(f'Will hmmpress: {hmms_list}', flush=True, file=self.redirect_verbose) + for hmm_path in hmms_list: + self.queue.append([hmm_path, self.mantis_out]) + + def prepare_queue_extract_metadata_NOG_HMM(self): + if self.mantis_paths['NOG'][0:2] != 'NA': + print_cyan('Will now extract metadata from NOG!', flush=True, file=self.redirect_verbose) + stdout_file = open(self.mantis_out, 'a+') + for taxon_id in os.listdir(self.mantis_paths['NOG']): + if os.path.isdir(self.mantis_paths['NOG'] + taxon_id) and taxon_id != 'NOGG': + target_annotation_file = add_slash( + self.mantis_paths['NOG'] + taxon_id) + f'{taxon_id}_annotations.tsv' + target_sql_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + 'metadata.tsv' + if file_exists(target_sql_file): + if len(self.get_hmms_annotation_file(target_sql_file, hmm_col=0)) != len( + self.get_hmms_annotation_file(target_annotation_file, hmm_col=1)): + self.queue.append([target_sql_file, target_annotation_file, taxon_id, self.mantis_out]) + else: + print(f'Skipping metadata extraction for NOGT {taxon_id}', flush=True, file=stdout_file) + else: + self.queue.append([taxon_id, self.mantis_out]) + stdout_file.close() + + ##################### Executing queue jobs + + def worker_setup_databases(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: + break + if len(record) == 2: + database, stdout_path = record + taxon_id = None + else: + database, taxon_id, stdout_path = record + self.download_database(database, taxon_id, stdout_path) + if not self.check_reference_exists(database, taxon_id): + stdout_file = open(stdout_path, 'a+') + if taxon_id: + print(f'Setup failed on {database} {taxon_id}', flush=True, file=stdout_file) + else: + print(f'Setup failed on {database}', flush=True, file=stdout_file) + stdout_file.close() + queue.insert(0, record) + + def worker_split_hmms(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: + break + hmm_path, stdout_path = record + self.split_hmm_into_chunks(hmm_path, stdout_path) + + def worker_press_custom_hmms(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + hmm_path, stdout_path = record + self.press_custom_hmms(hmm_path, stdout_path) + + def worker_extract_NOG_metadata_HMM(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + taxon_id, stdout_path = record + self.get_metadata_hmms(taxon_id=taxon_id, stdout_path=stdout_path) + + ##################### Main download function + + def download_database(self, database, taxon_id=None, stdout_path=None): + stdout_file = open(stdout_path, 'a+') + if database == 'taxonomy': + self.download_taxonomy_resources(stdout_file=stdout_file) + elif database == 'ncbi_res': + self.download_ncbi_resources(stdout_file=stdout_file) + elif database == 'pfam': + self.download_pfam(stdout_file=stdout_file) + elif database == 'kofam': + self.download_kofam(stdout_file=stdout_file) + elif database == 'tcdb': + self.download_tcdb(stdout_file=stdout_file) + elif database == 'NCBI': + self.download_NCBI(stdout_file=stdout_file) + elif database == 'NOG_HMM': + self.download_NOGT(taxon_id=taxon_id, stdout_file=stdout_file) + elif database == 'NOG_DMND': + self.download_NOG_DMND(stdout_file=stdout_file) + if taxon_id: + print(f'Finished downloading {database} with taxon {taxon_id}', flush=True, file=stdout_file) + else: + print(f'Finished downloading {database}', flush=True, file=stdout_file) + stdout_file.close() + + def download_ncbi_resources(self, stdout_file=None): + ncbi_resources = add_slash(self.mantis_paths['resources'] + 'NCBI') + Path(ncbi_resources).mkdir(parents=True, exist_ok=True) + if file_exists(ncbi_resources + 'gc.prt'): + print('Translation tables already exist! Skipping...', flush=True, file=stdout_file) + return + try: + os.remove(ncbi_resources + 'gc.prt') + except: + pass + translation_tables_url = 'https://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt' + with open(ncbi_resources + 'readme.md', 'w+') as file: + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + file.write( + f'These files were downloaded on {datetime_str} from:\n{translation_tables_url}\nThey are used to translate CDS') + print_cyan('Downloading and unzipping NCBI resources', flush=True, file=stdout_file) + for url in [translation_tables_url]: + download_file(url, output_folder=ncbi_resources, stdout_file=stdout_file) + + def download_taxonomy_resources(self, stdout_file=None): + if not file_exists(self.mantis_paths['resources'] + 'Taxonomy.db'): + self.launch_taxonomy_connector() + self.create_taxonomy_db() + self.close_taxonomy_connection() + + def get_common_links_metadata(self, string, res): + ec = find_ecs(string) + if ec: + if 'enzyme_ec' not in res: res['enzyme_ec'] = set() + res['enzyme_ec'].update(ec) + tc = find_tcdb(string) + if tc: + if 'tcdb' not in res: res['tcdb'] = set() + res['tcdb'].update(tc) + tigr = find_tigrfam(string) + if tigr: + if 'tigrfam' not in res: res['tigrfam'] = set() + res['tigrfam'].update(tigr) + ko = find_ko(string) + if ko: + if 'kegg_ko' not in res: res['kegg_ko'] = set() + res['kegg_ko'].update(ko) + pfam = find_pfam(string) + if pfam: + if 'pfam' not in res: res['pfam'] = set() + res['pfam'].update(pfam) + cog = find_cog(string) + if cog: + if 'cog' not in res: res['cog'] = set() + res['cog'].update(cog) + arcog = find_arcog(string) + if arcog: + if 'arcog' not in res: res['arcog'] = set() + res['arcog'].update(cog) + go = find_go(string) + if go: + if 'go' not in res: res['go'] = set() + res['go'].update(go) + + def write_metadata(self, metadata, metadata_file): + with open(metadata_file, 'w+') as file: + for seq in metadata: + link_line = f'{seq}\t|' + for link_type in metadata[seq]: + for inner_link in metadata[seq][link_type]: + if inner_link: + link_line += f'\t{link_type}:{inner_link}' + link_line += '\n' + file.write(link_line) + + ##################### PFAM + + def read_pfam2go(self): + res = {} + with open(self.mantis_paths['pfam'] + 'pfam2go') as pfam2go_file: + line = pfam2go_file.readline() + while line: + line = line.strip('\n') + if '!' not in line[0]: + line = line.split('>') + line = [i.strip() for i in line] + pfam_id = line[0].split()[0].replace('Pfam:', '') + go_annots = line[1].split(';') + go_description = [i.replace('GO:', '').strip() for i in go_annots if not re.search('GO:\d{3,}', i)] + go_ids = [i.replace('GO:', '').strip() for i in go_annots if re.search('GO:\d{3,}', i)] + if pfam_id not in res: + res[pfam_id] = {'go': set(go_ids), 'description': set(go_description)} + else: + res[pfam_id]['go'].update(go_ids) + res[pfam_id]['description'].update(go_description) + line = pfam2go_file.readline() + return res + + def is_good_description(self, hmm, row_description): + temp = [i.lower() for i in row_description.split()] + if hmm.lower() in temp and 'protein' in temp and len(temp) == 2: + return False + if re.search(' [uU]nknown [Ff]unction', row_description): return False + return True + + def build_pfam_line(self, hmm, metadata): + link_line = '' + pfam_ids = set(metadata['pfam']) + metadata['pfam'] = set() + for p_id in pfam_ids: + metadata['pfam'].add(p_id.split('.')[0]) + for link_type in metadata: + for inner_link in metadata[link_type]: + link_line += f'\t{link_type}:{inner_link}' + return hmm + f'\t|{link_line}\n' + + def get_hmm_info_pfam(self, line, file, pfam2go): + if line.startswith('#=GF ID'): + hmm = line.replace('#=GF ID', '').strip('\n').strip() + if hmm: + line = file.readline() + if line.startswith('#=GF AC'): + pfam_accession = line.replace('#=GF AC', '').strip('\n').strip().split('.')[0] + line = file.readline() + if line.startswith('#=GF DE'): + hmm_description = line.replace('#=GF DE', '').strip('\n').strip() + + if pfam_accession in pfam2go: + current_metadata = pfam2go[pfam_accession] + else: + current_metadata = {'description': set()} + + current_metadata['pfam'] = set() + current_metadata['pfam'].add(pfam_accession) + current_metadata['pfam'].add(hmm) + if self.is_good_description(hmm, hmm_description): + current_metadata['description'].add(hmm_description) + get_common_links_metadata(hmm_description, current_metadata) + metadata_line = self.build_pfam_line(hmm, current_metadata) + return metadata_line + + def compile_pfam_metadata(self): + pfam2go = self.read_pfam2go() + with open(self.mantis_paths['pfam'] + 'metadata.tsv', 'w+') as pfam_metadata_file: + with open(self.mantis_paths['pfam'] + 'Pfam-A.hmm.dat') as pfam_dat_file: + for line in pfam_dat_file: + line = line.strip('\n') + if line.startswith('#=GF ID'): + metadata_line = self.get_hmm_info_pfam(line, pfam_dat_file, pfam2go) + pfam_metadata_file.write(metadata_line) + remove_file(self.mantis_paths['pfam'] + 'Pfam-A.hmm.dat') + remove_file(self.mantis_paths['pfam'] + 'pfam2go') + + def download_pfam(self, stdout_file=None): + Path(self.mantis_paths['pfam']).mkdir(parents=True, exist_ok=True) + if self.check_reference_exists('pfam') and \ + file_exists(self.mantis_paths['pfam'] + 'metadata.tsv'): + print('Pfam hmm already exists! Skipping...', flush=True, file=stdout_file) + return + pfam_hmm = 'http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz' + pfam_metadata = 'http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz' + pfam2go = 'http://current.geneontology.org/ontology/external2go/pfam2go' + with open(self.mantis_paths['pfam'] + 'readme.md', 'w+') as file: + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + file.write( + f'This hmm was downloaded on {datetime_str} from:\n{pfam_hmm}\nMetadata was downloaded from:\n{pfam_metadata}\nPfam2GO was downloaded from:\n{pfam2go}') + print_cyan('Downloading and unzipping Pfam hmms ', flush=True, file=stdout_file) + to_download = [] + to_unzip = [] + if not file_exists(self.mantis_paths['pfam'] + 'Pfam-A.hmm'): + to_download.append(pfam_hmm) + to_unzip.append('Pfam-A.hmm.gz') + if not file_exists(self.mantis_paths['pfam'] + 'metadata.tsv'): + to_unzip.append('Pfam-A.hmm.dat.gz') + to_download.append(pfam_metadata) + to_download.append(pfam2go) + for url in to_download: + download_file(url, output_folder=self.mantis_paths['pfam'], stdout_file=stdout_file) + for file_to_unzip in to_unzip: + uncompress_archive(source_filepath=self.mantis_paths['pfam'] + file_to_unzip, stdout_file=stdout_file, + remove_source=True) + if not self.check_reference_exists('pfam'): + run_command('hmmpress ' + self.mantis_paths['pfam'] + 'Pfam-A.hmm', stdout_file=stdout_file) + self.compile_pfam_metadata() + + ##################### KOFAM + + def compile_kofam_metadata(self): + metadata_to_write = {} + self.get_link_kofam_ko_list(metadata_to_write) + self.get_link_kofam_ko_to_binary(metadata_to_write, target_file='ko2cog.xl') + self.get_link_kofam_ko_to_binary(metadata_to_write, target_file='ko2go.xl') + self.get_link_kofam_ko_to_binary(metadata_to_write, target_file='ko2tc.xl') + self.get_link_kofam_ko_to_binary(metadata_to_write, target_file='ko2cazy.xl') + self.write_metadata(metadata_to_write, self.mantis_paths['kofam'] + 'metadata.tsv') + remove_file(self.mantis_paths['kofam'] + 'ko_list') + remove_file(self.mantis_paths['kofam'] + 'ko2cazy.xl') + remove_file(self.mantis_paths['kofam'] + 'ko2cog.xl') + remove_file(self.mantis_paths['kofam'] + 'ko2go.xl') + remove_file(self.mantis_paths['kofam'] + 'ko2tc.xl') + + def get_link_kofam_ko_list(self, res): + file_path = self.mantis_paths['kofam'] + 'ko_list' + with open(file_path) as file: + line = file.readline() + while line: + line = line.strip('\n').split('\t') + ko, description = line[0], line[-1] + if ko not in res: res[ko] = {} + if '[EC:' in description: + description, temp_links = description.split('[EC:') + else: + temp_links = description + get_common_links_metadata(temp_links, res[ko]) + if 'kegg_ko' not in res[ko]: res[ko]['kegg_ko'] = set() + res[ko]['kegg_ko'].add(ko) + if 'description' not in res[ko]: + res[ko]['description'] = set() + res[ko]['description'].add(description) + line = file.readline() + + def get_link_kofam_ko_to_binary(self, res, target_file): + file_path = self.mantis_paths['kofam'] + target_file + if 'ko2tc' in target_file: + target_link = 'tcdb' + else: + target_link = target_file.replace('ko2', '').replace('.xl', '') + with open(file_path) as file: + line = file.readline() + line = file.readline() + while line: + line = line.strip('\n').split('\t') + ko, link = line + link = link.strip('[]').split(':')[1].split() + if ko not in res: res[ko] = {} + if target_link not in res[ko]: res[ko][target_link] = set() + res[ko][target_link].update(link) + line = file.readline() + + def download_kofam(self, stdout_file=None): + Path(self.mantis_paths['kofam']).mkdir(parents=True, exist_ok=True) + if self.check_reference_exists('kofam') and \ + file_exists(self.mantis_paths['kofam'] + 'metadata.tsv'): + print('KOfam HMM already exists! Skipping...', flush=True, file=stdout_file) + return + kofam_hmm = 'https://www.genome.jp/ftp/db/kofam/profiles.tar.gz' + ko_list = 'https://www.genome.jp/ftp/db/kofam/ko_list.gz' + ko_to_cog = 'https://www.kegg.jp/kegg/files/ko2cog.xl' + ko_to_go = 'https://www.kegg.jp/kegg/files/ko2go.xl' + ko_to_tc = 'https://www.kegg.jp/kegg/files/ko2tc.xl' + ko_to_cazy = 'https://www.kegg.jp/kegg/files/ko2cazy.xl' + with open(self.mantis_paths['kofam'] + 'readme.md', 'w+') as file: + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + file.write( + f'This hmm was downloaded on {datetime_str} from:\n{kofam_hmm}\nMetadata was downloaded from:\n{ko_list}\n{ko_to_cog}\n{ko_to_go}\n{ko_to_tc}\n{ko_to_cazy}') + print_cyan('Downloading and unzipping KOfam hmms ', flush=True, file=stdout_file) + for url in [kofam_hmm, ko_list, ko_to_cog, ko_to_go, ko_to_tc, ko_to_cazy]: + download_file(url, output_folder=self.mantis_paths['kofam'], stdout_file=stdout_file) + uncompress_archive(source_filepath=self.mantis_paths['kofam'] + 'profiles.tar.gz', + extract_path=self.mantis_paths['kofam'], stdout_file=stdout_file, remove_source=True) + uncompress_archive(source_filepath=self.mantis_paths['kofam'] + 'ko_list.gz', stdout_file=stdout_file, + remove_source=True) + merge_profiles(self.mantis_paths['kofam'] + 'profiles/', self.mantis_paths['kofam'] + 'kofam_merged.hmm', + stdout_file=stdout_file) + run_command('hmmpress ' + self.mantis_paths['kofam'] + 'kofam_merged.hmm', stdout_file=stdout_file) + self.compile_kofam_metadata() + + ##################### NCBI + + def download_NCBI(self, stdout_file=None): + Path(self.mantis_paths['NCBI']).mkdir(parents=True, exist_ok=True) + ncbi_hmm = 'https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.HMM.tgz' + metadata = 'https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv' + + # we cant verify a priori which foulders we should have, so you need to delete the folder to restart + + if self.check_reference_exists('NCBI') and \ + file_exists(self.mantis_paths['NCBI'] + 'readme.md'): + print('NCBI hmm folder already exists! Skipping...', flush=True, file=stdout_file) + return + + with open(self.mantis_paths['NCBI'] + 'readme.md', 'w+') as file: + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + file.write( + f'This hmm was downloaded on {datetime_str} from:\n{ncbi_hmm}\nMetadata was downloaded from:\n{metadata}') + + print_cyan('Downloading and unzipping NCBI hmms ', flush=True, file=stdout_file) + for url in [ncbi_hmm, metadata]: + download_file(url, output_folder=self.mantis_paths['NCBI'], stdout_file=stdout_file) + move_file(self.mantis_paths['NCBI'] + 'hmm_PGAP.HMM.tgz', self.mantis_paths['NCBI'] + 'hmm_PGAP.HMM.tar.gz') + uncompress_archive(source_filepath=self.mantis_paths['NCBI'] + 'hmm_PGAP.HMM.tar.gz', + extract_path=self.mantis_paths['NCBI'], stdout_file=stdout_file, remove_source=True) + self.compile_hmms_NCBI(stdout_file=stdout_file) + remove_file(self.mantis_paths['NCBI'] + 'hmm_PGAP.tsv') + if file_exists(self.mantis_paths['NCBI'] + 'hmm_PGAP'): + shutil.rmtree(self.mantis_paths['NCBI'] + 'hmm_PGAP') + + # sorting profiles by the taxonomic id from ncbi metadata file + def compile_hmms_NCBI(self, stdout_file=None): + sorted_metadata = self.sort_hmms_NCBI() + self.write_metadata_ncbi(sorted_metadata) + self.assign_hmm_profiles(sorted_metadata, stdout_file=stdout_file) + + def assign_hmm_profiles(self, sorted_metadata, stdout_file=None): + for taxa in sorted_metadata: + for hmm, hmm_label, description, enzyme_ec, go_terms, common_links in sorted_metadata[taxa]: + try: + copy_file(add_slash(self.mantis_paths['NCBI'] + 'hmm_PGAP') + hmm + '.HMM', + add_slash(add_slash(self.mantis_paths['NCBI'] + taxa) + 'to_merge') + hmm + '.hmm') + except: + pass + if os.listdir(add_slash(add_slash(self.mantis_paths['NCBI'] + taxa) + 'to_merge')): + merge_profiles(add_slash(add_slash(self.mantis_paths['NCBI'] + taxa) + 'to_merge'), + add_slash(self.mantis_paths['NCBI'] + taxa) + taxa + '_merged.hmm') + run_command('hmmpress ' + add_slash(self.mantis_paths['NCBI'] + taxa) + taxa + '_merged.hmm', + stdout_file=stdout_file) + else: + shutil.rmtree(self.mantis_paths['NCBI'] + taxa) + + def write_metadata_ncbi(self, sorted_metadata): + for taxa in sorted_metadata: + Path(self.mantis_paths['NCBI'] + taxa).mkdir(parents=True, exist_ok=True) + Path(add_slash(self.mantis_paths['NCBI'] + taxa) + 'to_merge').mkdir(parents=True, exist_ok=True) + with open(add_slash(self.mantis_paths['NCBI'] + taxa) + 'metadata.tsv', 'w+') as metadata_file: + for hmm, hmm_label, description, enzyme_ec, go_terms, common_links in sorted_metadata[taxa]: + line = [hmm_label, '|', f'description:{description}'] + + for db in common_links: + for db_id in common_links[db]: + if f'{db}:{db_id}' not in line: + line.append(f'{db}:{db_id}') + for ec in enzyme_ec: + if f'enzyme_ec:{ec}' not in line: + line.append(f'enzyme_ec:{ec}') + for go_term in go_terms: + if f'go:{go_term}' not in line: + line.append(f'go:{go_term}') + # ncbi also contains tigrfam hmms + if hmm_label.startswith('TIGR'): + if f'tigrfam:{hmm_label}' not in line: + line.append(f'tigrfam:{hmm_label}') + + line = '\t'.join(line) + '\n' + metadata_file.write(line) + + def get_ncbi_domains(self): + # this is from NCBI's top level taxonomy page + return [ + '1', # top level in NOG + '2157', # Archaea + '2', # Bacteria + '2759', # Eukaryota + '10239', # Viruses + '28384', # Others + '12908', # Unclassified + ] + + def sort_hmms_NCBI(self): + general_taxon_ids = self.get_ncbi_domains() + res = {'NCBIG': []} + already_added_NCBIG = set() + metadata = self.mantis_paths['NCBI'] + 'hmm_PGAP.tsv' + with open(metadata) as file: + line = file.readline() + line = file.readline() + while line: + line = line.strip('\n') + line = line.split('\t') + hmm, hmm_label, description, enzyme_ec, go_terms, taxa_id = line[0], line[2], line[10], line[12], line[ + 13], line[15] + common_links = {} + get_common_links_metadata(description, common_links) + for db in common_links: + for db_id in common_links[db]: + description = description.replace(db_id, '').strip() + description = description.replace('(Provisional)', '') + description = description.strip() + enzyme_ec = [i for i in enzyme_ec.split(',') if i] + go_terms = [i.replace('GO:', '') for i in go_terms.split(',') if i] + line = file.readline() + if taxa_id: + if taxa_id not in res: res[taxa_id] = [] + res[taxa_id].append([hmm, hmm_label, description, enzyme_ec, go_terms, common_links]) + if taxa_id in general_taxon_ids: + if hmm not in already_added_NCBIG: + res['NCBIG'].append([hmm, hmm_label, description, enzyme_ec, go_terms, common_links]) + already_added_NCBIG.add(hmm) + else: + if hmm not in already_added_NCBIG: + res['NCBIG'].append([hmm, hmm_label, description, enzyme_ec, go_terms, common_links]) + already_added_NCBIG.add(hmm) + return res + + ##################### TCDB + + def parse_tsv_tcdb(self, file_path, key_col, val_col, res_key, res, val_clean_function=None): + with open(file_path) as file: + line = file.readline() + while line: + line = line.strip('\n') + line = line.split('\t') + if line[key_col] not in res: res[line[key_col]] = {} + if res_key not in res[line[key_col]]: res[line[key_col]][res_key] = set() + if val_clean_function: + line[val_col] = val_clean_function(line[val_col]) + res[line[key_col]][res_key].add(line[val_col]) + line = file.readline() + + def read_tcdb_headers(self): + file_path = self.mantis_paths['tcdb'] + 'tcdb' + res = {} + with open(file_path) as file: + line = file.readline() + while line: + line = line.strip('\n') + if '>' in line: + line = line.split('|') + uniprot_accession = line[2] + tc_id = line[3].split()[0] + description = line[3].replace(tc_id, '').strip() + if re.search('([Hh]ypothetical|[Uu]ncharacterized|[Uu]ndetermined)', description): description = '' + description = description.split('[')[0] + if re.search('[A-Z]+=', description): + description = description.split(re.search('[A-Z]+=', description).group())[0] + if ' - ' in description: + description = description.split(' - ')[0] + description = description.strip() + if description: + res[uniprot_accession] = {'tcdb': tc_id, 'description': {description.strip()}} + else: + res[uniprot_accession] = {'tcdb': tc_id} + line = file.readline() + return res + + def remove_bad_entries(self, all_seqs): + # some proteins are not in uniprot, but are in some other dbs.... Im not sure if they should be removed, but we will consider uniprot as the central repo, so we will remove them + chunks_post = chunk_generator(all_seqs, 500) + all_found = set() + for seqs_chunk in chunks_post: + job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=seqs_chunk) + if check_id_mapping_results_ready(job_id): + link = get_id_mapping_results_link(job_id) + results = get_id_mapping_results_search(link) + if 'failedIds' in results: + failed_ids = results['failedIds'] + else: + failed_ids = [] + for seq in seqs_chunk: + if seq not in failed_ids: + all_found.add(seq) + return all_found + + def yield_tcdb_seqs(self, tcdb_fasta, seqs_found): + tcdb_seqs = read_protein_fasta_generator(tcdb_fasta) + for seq_id, seq in tcdb_seqs: + uniprot_accession = seq_id.split('|')[2] + if uniprot_accession in seqs_found: + yield uniprot_accession, seq + + def generate_tcdb_fasta(self, tcdb_fasta, seqs_found): + write_fasta_generator(self.yield_tcdb_seqs(tcdb_fasta, seqs_found), self.mantis_paths['tcdb'] + 'tcdb.faa') + + def compile_tcdb_metadata(self): + # acession will be the key + all_seqs = self.read_tcdb_headers() + seqs_found = self.remove_bad_entries(list(all_seqs.keys())) + self.generate_tcdb_fasta(self.mantis_paths['tcdb'] + 'tcdb', seqs_found) + # here tc ids will be keys + metadata = {} + self.parse_tsv_tcdb(self.mantis_paths['tcdb'] + 'go.py', 1, 0, 'go', metadata, + val_clean_function=lambda a: a.replace('GO:', '')) + self.parse_tsv_tcdb(self.mantis_paths['tcdb'] + 'pfam.py', 1, 0, 'pfam', metadata) + # we now add the tc specific metadata to each acession + metadata_to_write = {} + for seq in seqs_found: + tc_id = all_seqs[seq]['tcdb'] + if tc_id in metadata: + metadata_to_write[seq] = metadata[tc_id] + else: + metadata_to_write[seq] = {} + if 'description' in all_seqs[seq]: metadata_to_write[seq]['description'] = all_seqs[seq]['description'] + metadata_to_write[seq]['tcdb'] = {tc_id} + + self.write_metadata(metadata_to_write, self.mantis_paths['tcdb'] + 'metadata.tsv') + + def download_tcdb(self, stdout_file=None): + Path(self.mantis_paths['tcdb']).mkdir(parents=True, exist_ok=True) + if self.check_reference_exists('tcdb') and \ + file_exists(self.mantis_paths['tcdb'] + 'metadata.tsv'): + print('TCDB sequences already exists! Skipping...', flush=True, file=stdout_file) + return + tcdb_seqs = 'http://www.tcdb.org/public/tcdb' + tcdb_go = 'https://www.tcdb.org/cgi-bin/projectv/public/go.py' + tcdb_pfam = 'https://www.tcdb.org/cgi-bin/projectv/public/pfam.py' + + with open(self.mantis_paths['tcdb'] + 'readme.md', 'w+') as file: + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + file.write(f'These sequences were downloaded on {datetime_str} from:\n{tcdb_seqs}\n' + f'Metadata was downloaded from:\n{tcdb_go}\n{tcdb_pfam}\n') + print_cyan('Downloading and unzipping TCDB sequences', flush=True, file=stdout_file) + for link in [tcdb_seqs, + tcdb_go, + tcdb_pfam, + ]: + download_file(link, output_folder=self.mantis_paths['tcdb'], stdout_file=stdout_file) + self.compile_tcdb_metadata() + + if not file_exists(self.mantis_paths['tcdb'] + 'tcdb.dmnd'): + run_command(f'diamond makedb --in ' + self.mantis_paths['tcdb'] + 'tcdb.faa -d ' + self.mantis_paths[ + 'tcdb'] + 'tcdb', stdout_file=stdout_file) + + ##################### NOG + + def check_completeness_NOGG(self, nogg_file, list_file_paths): + if not file_exists(nogg_file): + return False + nogg_hmms = set() + nogt_hmms = set() + with open(nogg_file) as file: + line = file.readline() + while line: + hmm_name = line.split('\t')[0] + nogg_hmms.add(hmm_name) + line = file.readline() + for f in list_file_paths: + with open(f) as file: + line = file.readline() + while line: + hmm_name = line.split('\t')[0] + if hmm_name not in nogg_hmms: + return False + nogt_hmms.add(hmm_name) + line = file.readline() + if not nogg_hmms == nogt_hmms: + return False + return True + + def compile_NOGG_HMM(self): + # this is not feasible for all HMMs, so we just select the most general taxa (i.e., domains) + if self.mantis_paths['NOG'][0:2] != 'NA': + stdout_file = open(self.mantis_out, 'a+') + target_annotation_file = add_slash(self.mantis_paths['NOG'] + 'NOGG') + 'metadata.tsv' + target_merged_hmm = add_slash(self.mantis_paths['NOG'] + 'NOGG') + 'NOGG_merged.hmm' + all_sql = set() + all_hmm = set() + for taxon_id in self.get_ncbi_domains(): + if taxon_id != 'NOGG': + if os.path.isdir(self.mantis_paths['NOG'] + taxon_id): + target_hmm_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + f'{taxon_id}_merged.hmm' + target_sql_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + 'metadata.tsv' + all_sql.add(target_sql_file) + all_hmm.add(target_hmm_file) + + if not self.check_completeness_NOGG(target_annotation_file, all_sql) or \ + not self.check_reference_exists('NOGG'): + if file_exists(self.mantis_paths['NOG'] + 'NOGG'): shutil.rmtree(self.mantis_paths['NOG'] + 'NOGG') + Path(self.mantis_paths['NOG'] + 'NOGG').mkdir(parents=True, exist_ok=True) + else: + print('NOGG already compiled, skipping...', flush=True, file=stdout_file) + return + print_cyan('Compiling global NOG hmms ', flush=True, file=stdout_file) + # now to merge all tshmms into one + merge_redundant_profiles(output_file=target_merged_hmm, list_file_paths=all_hmm, stdout_file=stdout_file) + merge_redundant_sql_annotations(output_file=target_annotation_file, list_file_paths=all_sql, + stdout_file=stdout_file) + run_command('hmmpress ' + target_merged_hmm, stdout_file=stdout_file) + stdout_file.close() + + def download_NOGT(self, taxon_id, stdout_file=None): + + folder_path = add_slash(self.mantis_paths['NOG'] + taxon_id) + Path(folder_path).mkdir(parents=True, exist_ok=True) + + target_annotation_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + f'{taxon_id}_annotations.tsv' + target_merged_hmm = add_slash(self.mantis_paths['NOG'] + taxon_id) + f'{taxon_id}_merged.hmm' + eggnog_downloads_page = f'http://eggnog5.embl.de/download/latest/per_tax_level/{taxon_id}/' + + for file in ['_annotations.tsv.gz', '_hmms.tar.gz']: + url = f'{eggnog_downloads_page}{taxon_id}{file}' + download_file(url, output_folder=folder_path, stdout_file=stdout_file) + if file_exists(f'{folder_path}profiles'): shutil.rmtree(f'{folder_path}profiles') + uncompress_archive(source_filepath=f'{folder_path}{taxon_id}_hmms.tar.gz', + extract_path=f'{folder_path}profiles', stdout_file=stdout_file, remove_source=True) + uncompress_archive(source_filepath=f'{folder_path}{taxon_id}_annotations.tsv.gz', + stdout_file=stdout_file, + remove_source=True) + # removing gz extension + hmm_files = os.listdir(folder_path + 'profiles') + for hmm_profile in hmm_files: + if '.hmm' in hmm_profile: move_file(hmm_profile, hmm_profile.strip('.gz')) + merge_profiles(f'{folder_path}profiles/{taxon_id}', f'{folder_path}{taxon_id}_merged.hmm', + stdout_file=stdout_file) + if file_exists(f'{folder_path}profiles'): shutil.rmtree(f'{folder_path}profiles') + run_command(f'hmmpress {target_merged_hmm}', stdout_file=stdout_file) + + def download_and_unzip_eggnogdb(self, stdout_file=None): + Path(self.mantis_paths['default']).mkdir(parents=True, exist_ok=True) + if file_exists(self.mantis_paths['NOG'] + 'eggnog.db'): + print('eggnog.db already exists! Skipping...', flush=True, file=stdout_file) + return + else: + if file_exists(self.mantis_paths['default'] + 'eggnog.NOG'): + remove_file(self.mantis_paths['NOG'] + 'eggnog.db') + url = 'http://eggnogdb.embl.de/download/emapperdb-' + self.get_latest_version_eggnog() + '/eggnog.db.gz' + download_file(url, output_folder=self.mantis_paths['NOG'], stdout_file=stdout_file) + uncompress_archive(source_filepath=self.mantis_paths['NOG'] + 'eggnog.db.gz', + stdout_file=stdout_file, + remove_source=True) + + ### Support functions for setting up queue to download NOGT hmms + + def get_latest_version_eggnog(self): + url = 'http://eggnog5.embl.de/download/' + webpage = None + c = 0 + while not webpage and c <= 10: + try: + req = requests.get(url) + webpage = req.text + except: + c += 1 + versions_id = re.findall('href="emapperdb-.*"', webpage) + res = [] + for v in versions_id: + version_id = v.replace('href="emapperdb-', '') + version_id = version_id.replace('/"', '') + res.append(version_id) + if res: + return max(res) + else: + # hard coded just in case the user doesnt have access to internet, should be updated + current_version = '5.0.2' + print( + f'eggNOG database version retrieval failed, so returning hardcoded eggnog database version {current_version}, please change if a newer version is available') + return current_version + + def get_taxon_ids_eggNOG(self): + # this will get all available taxon ids from the eggnog webpage + url = 'http://eggnog5.embl.de/download/latest/per_tax_level/' + webpage = None + c = 0 + while not webpage and c <= 10: + try: + req = requests.get(url) + webpage = req.text + except: + c += 1 + if isinstance(webpage, str): + taxons_search = re.findall('href="\d+/"', webpage) + taxons = [re.search('\d+', i).group() for i in taxons_search] + else: + taxons = [] + # if no connection is established, we just return the local taxon ids (will be empty if no connection is available during setup) + if not taxons: return self.get_local_ref_taxon_ids('NOG') + return taxons + + def get_taxon_for_queue_NOGT(self, stdout_file=None): + # we will download tax specific hmms, which can be used for more specific hmmscans + taxon_ids = self.get_ref_taxon_ids('NOG') + res = [] + for taxon_id in taxon_ids: + target_annotation_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + taxon_id + '_annotations.tsv' + if self.check_reference_exists('NOG', taxon_id) and file_exists(target_annotation_file): + hmm_path = get_ref_in_folder(self.mantis_paths['NOG'] + taxon_id) + profile_count = get_hmm_profile_count(hmm_path) + annotations_count = len(self.get_hmms_annotation_file(target_annotation_file, 1)) + # should be the same but some HMMs dont have an annotation (probably an error from NOG) + if profile_count in range(annotations_count - 10, annotations_count + 10 + 1): + print( + f'Tax {taxon_id} hmm and annotations.tsv files already exist, and they were merged correctly. Skipping setup...', + flush=True, file=stdout_file) + else: + print(f'Tax {taxon_id} hmm already exists but was not merged correctly!', flush=True, + file=stdout_file) + res.append(taxon_id) + else: + res.append(taxon_id) + print('Will check data for the following NOGT:\n' + ','.join(res), flush=True, file=stdout_file) + return res + + ### Support functions for setting up queue for split hmms + + def get_taxon_for_queue_NOG_split_hmms(self): + res = [] + if self.mantis_paths['NOG'][0:2] != 'NA' and self.nog_db == 'hmm': + stdout_file = open(self.mantis_out, 'a+') + + for taxon_id in self.get_ref_taxon_ids('NOG'): + if os.path.isdir(self.mantis_paths['NOG'] + taxon_id): + hmm_path = get_ref_in_folder(self.mantis_paths['NOG'] + taxon_id) + print('Checking NOG for splitting:', hmm_path, flush=True, file=stdout_file) + if 'chunks' not in os.listdir(self.mantis_paths['NOG'] + taxon_id): + profile_count = get_hmm_profile_count(hmm_path) + if profile_count > self.hmm_chunk_size: + res.append(hmm_path) + else: + print(f'NOG already split: {hmm_path} {taxon_id}', flush=True, file=stdout_file) + else: + print(f'NOG already split: {hmm_path} {taxon_id}', flush=True, file=stdout_file) + stdout_file.close() + return res + + def get_taxon_for_queue_ncbi_split_hmms(self): + res = [] + stdout_file = open(self.mantis_out, 'a+') + if self.mantis_paths['NCBI'][0:2] != 'NA': + for taxon_id in self.get_ref_taxon_ids('NCBI'): + if os.path.isdir(self.mantis_paths['NCBI'] + taxon_id): + hmm_path = get_ref_in_folder(self.mantis_paths['NCBI'] + taxon_id) + print(f'Checking NCBI for splitting: {hmm_path} {taxon_id}', flush=True, file=stdout_file) + if 'chunks' not in os.listdir(self.mantis_paths['NCBI'] + taxon_id): + profile_count = get_hmm_profile_count(hmm_path) + if profile_count > self.hmm_chunk_size: + res.append(hmm_path) + else: + print(f'NCBI already split: {hmm_path} {taxon_id}', flush=True, file=stdout_file) + else: + print(f'NCBI already split: {hmm_path} {taxon_id}', flush=True, file=stdout_file) + + stdout_file.close() + return res + + def get_hmm_for_queue_split_hmms(self): + hmms_list = self.compile_refs_list() + res = [] + for hmm in hmms_list: + hmm_folder = get_folder(hmm) + if 'chunks' not in os.listdir(hmm_folder): + hmm_profile_count = get_hmm_profile_count(hmm) + if hmm_profile_count > self.hmm_chunk_size: + res.append(hmm) + return res + + ### Support functions for splitting hmms + + def split_hmm_into_chunks(self, hmm_path, stdout_path=None): + # we dont really load balance the profiles based on profile length because we would need to keep too much data in memory + # so we just balance out the number of profiles + stdout_file = open(stdout_path, 'a+') + print(f'Counting profiles in {hmm_path}', flush=True, file=stdout_file) + profile_count = get_hmm_profile_count(hmm_path, stdout=stdout_file) + print(f'{hmm_path} has {profile_count} profiles', flush=True, file=stdout_file) + hmm_folder = get_folder(hmm_path) + hmm_chunks_folder = f'{hmm_folder}chunks/' + if file_exists(hmm_chunks_folder): + shutil.rmtree(hmm_chunks_folder) + Path(hmm_chunks_folder).mkdir(parents=True, exist_ok=True) + print('Load balancing chunks', flush=True, file=stdout_file) + load_balanced_chunks = chunk_generator_load_balanced([i for i in range(profile_count)], + get_hmm_chunk_size(total_profiles=profile_count, + current_chunk_size=self.hmm_chunk_size, + max_chunks=100), time_limit=None) + print(f'Splitting {hmm_path} into {len(load_balanced_chunks)} chunks.', flush=True, file=stdout_file) + chunks_name = get_path_level(hmm_path, remove_extension=True) + '_chunk_' + with open(hmm_path) as hmm_file: + for chunk_i in range(len(load_balanced_chunks)): + chunk_file_path = f'{hmm_chunks_folder}{chunks_name}{chunk_i}.hmm' + with open(chunk_file_path, 'w+') as chunk_file: + for profile_i in range(len(load_balanced_chunks[chunk_i])): + profile = ''.join(read_profile(hmm_file)) + chunk_file.write(profile) + chunks_dir = os.listdir(hmm_chunks_folder) + for chunk in chunks_dir: + run_command(f'hmmpress {hmm_chunks_folder}{chunk}', stdout_file=stdout_file) + stdout_file.close() + + def press_custom_hmms(self, hmm_path, stdout_path=None): + stdout_file = open(stdout_path, 'a+') + hmm_folder = add_slash(SPLITTER.join(hmm_path.split(SPLITTER)[:-1])) + old_files = ['h3f', 'h3i', 'h3m', 'h3p'] + for inner_file in os.listdir(hmm_folder): + inner_file_ending = inner_file.split('.')[-1] + if inner_file_ending in old_files: + os.remove(hmm_folder + inner_file) + run_command(f'hmmpress {hmm_path}', stdout_file=stdout_file) + stdout_file.close() + + ### Support functions for extracting metadata + + def get_hmms_annotation_file(self, annotations_file, hmm_col): + res = set() + if not file_exists(annotations_file): return res + with open(annotations_file, 'r') as file: + line = file.readline() + while line: + line = line.split('\t') + res.add(line[hmm_col]) + line = file.readline() + return res + + def download_pfam_id_to_acc(self): + if not file_exists(self.mantis_paths['NOG'] + 'Pfam-A.hmm.dat'): + pfam_metadata = 'http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz' + download_file(pfam_metadata, output_folder=self.mantis_paths['NOG']) + uncompress_archive(source_filepath=self.mantis_paths['NOG'] + 'Pfam-A.hmm.dat.gz', remove_source=True) + + def pfam_id_to_acc(self): + res = {} + with open(self.mantis_paths['NOG'] + 'Pfam-A.hmm.dat') as pfam_dat_file: + line = pfam_dat_file.readline() + while line: + line = line.strip('\n').split(' ') + if len(line) == 2: + row_header, row_description = line + if row_header == '#=GF ID': + hmm = str(row_description) + elif row_header == '#=GF AC': + pfam_accession = str(row_description) + pfam_accession = pfam_accession.split('.')[0].strip() + res[hmm] = pfam_accession + line = pfam_dat_file.readline() + return res + + def clean_up_sql_results_description(self, sql_row, ids_res): + og, level, description = sql_row + description = description.strip() + if description: + if og not in ids_res: ids_res[og] = {'description': set(), 'cog': set(), 'arcog': set()} + ids_res[og]['description'].add(description) + get_common_links_metadata(description, res=ids_res[og]) + + def clean_up_sql_results_ids_hmm(self, sql_row, taxon_id, pfam_id_to_acc): + codes_to_exclude = ['IEA', 'ND'] + res = { + 'go': set(), + 'pfam': set(), + 'kegg_ko': set(), + 'cog': set(), + 'arcog': set(), + 'enzyme_ec': set(), + 'kegg_pathway': set(), + 'kegg_module': set(), + 'kegg_reaction': set(), + 'kegg_rclass': set(), + 'kegg_brite': set(), + 'cazy': set(), + 'bigg_reaction': set(), + 'description': set(), + 'eggnog': set(), + 'tcdb': set(), + } + eggnog_hmms = set() + ogs, gene_ontology_gos, pfam_ids, kegg_ko, kegg_cog, kegg_ec, kegg_brite, kegg_rclass, kegg_tc, kegg_cazy, kegg_pathway, kegg_module, kegg_reaction, kegg_go = sql_row + for query_hmm_taxon in ogs.split(','): + query_hmm, query_taxon = query_hmm_taxon.split('@') + if int(taxon_id) == int(query_taxon): + eggnog_hmms.add(query_hmm) + if eggnog_hmms: + if gene_ontology_gos: + # http://geneontology.org/docs/guide-go-evidence-codes/ + gene_ontology_gos_copy = str(gene_ontology_gos) + gene_ontology_gos_copy = gene_ontology_gos_copy.split(',') + for go_group in gene_ontology_gos_copy: + if '|' not in go_group: print(ids_command, '\n', go_group) + _, go, evidence_code = go_group.split('|') + if evidence_code not in codes_to_exclude: + res['go'].add(go.split(':')[1]) + if pfam_ids: + temp_pfam_ids = pfam_ids.split(',') + for tpi in temp_pfam_ids: + if tpi in pfam_id_to_acc: + res['pfam'].add(pfam_id_to_acc[tpi]) + if kegg_ko: + kegg_ko = kegg_ko.replace('ko:', '') + res['kegg_ko'].update(kegg_ko.split(',')) + if kegg_cog: res['cog'].update(kegg_cog.split(',')) + if kegg_ec: res['enzyme_ec'].update(kegg_ec.split(',')) + if kegg_brite: res['kegg_brite'].update(kegg_brite.split(',')) + if kegg_rclass: res['kegg_rclass'].update(kegg_rclass.split(',')) + if kegg_tc: res['tcdb'].update(kegg_tc.split(',')) + if kegg_cazy: res['cazy'].update(kegg_cazy.split(',')) + if kegg_pathway: res['kegg_pathway'].update(kegg_pathway.split(',')) + if kegg_module: res['kegg_module'].update(kegg_module.split(',')) + if kegg_reaction: res['kegg_reaction'].update(kegg_reaction.split(',')) + if kegg_go: res['go'].update(kegg_go.split(',')) + return eggnog_hmms, res + + def yield_sql_rows(self, cursor, step_size=1000): + while True: + results = cursor.fetchmany(step_size) + if not results: + break + for result in results: + yield result + + def fetch_eggnog_metadata_hmm(self, cursor, taxon_id, ids_command, description_command, pfam_id_to_acc): + res = {} + cursor.execute(ids_command) + rows = self.yield_sql_rows(cursor) + for row in rows: + eggnog_hmms, processed_row = self.clean_up_sql_results_ids_hmm(row, taxon_id, pfam_id_to_acc) + for hmm in eggnog_hmms: + if hmm not in res: + res[hmm] = dict(processed_row) + res[hmm]['eggnog'].add(hmm) + if find_cog(hmm): res[hmm]['cog'].add(hmm) + if find_arcog(hmm): res[hmm]['arcog'].add(hmm) + for link_type in res[hmm]: + res[hmm][link_type].update(processed_row[link_type]) + cursor.execute(description_command) + rows = self.yield_sql_rows(cursor) + for row in rows: + self.clean_up_sql_results_description(row, res) + return res + + def get_metadata_hmms(self, taxon_id, stdout_path=None): + stdout_file = open(stdout_path, 'a+') + + print(f'Exporting metadata for NOG {taxon_id}', flush=True, file=stdout_file) + metadata_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + 'metadata.tsv' + eggnog_db_path = self.mantis_paths['NOG'] + "eggnog.db" + connection = sqlite3.connect(eggnog_db_path) + cursor = connection.cursor() + pfam_id_to_acc = self.pfam_id_to_acc() + ids_command = f'SELECT ogs, gos, pfam, kegg_ko,kegg_cog,kegg_ec,kegg_brite,kegg_rclass,kegg_tc,kegg_cazy,kegg_pathway,kegg_module,kegg_reaction,kegg_go FROM prots WHERE ogs LIKE "%@{taxon_id}%";' + description_command = f'SELECT og,level,description FROM og WHERE level="{taxon_id}"'; + taxon_metadata = self.fetch_eggnog_metadata_hmm(cursor, taxon_id, ids_command, description_command, + pfam_id_to_acc) + + with open(metadata_file, "a+") as file: + for hmm in taxon_metadata: + link_line = '' + for link_type in taxon_metadata[hmm]: + for inner_link in taxon_metadata[hmm][link_type]: + inner_link = inner_link.strip() + if inner_link: + link_line += '\t' + link_type + ':' + inner_link + file.write(hmm + '\t|' + link_line + '\n') + print(f'Finished exporting metadata for NOGT {taxon_id}', flush=True, file=stdout_file) + stdout_file.close() + + ###### NOG diamond (not implemented yet) + + def download_NOG_DMND(self, stdout_file=None): + folder_path = add_slash(self.mantis_paths['NOG']) + if file_exists(folder_path + 'eggnog_proteins.dmnd'): + print('eggnog_proteins.dmnd already exists! Skipping...', flush=True, file=stdout_file) + return + url = 'http://eggnogdb.embl.de/download/emapperdb-' + self.get_latest_version_eggnog() + '/eggnog_proteins.dmnd.gz' + download_file(url, output_folder=folder_path, stdout_file=stdout_file) + uncompress_archive(source_filepath=f'{folder_path}eggnog_proteins.dmnd.gz', extract_path=folder_path, + stdout_file=stdout_file, remove_source=False) + + def merge_fasta_files(self, target_merged_faa, all_faa): + already_added = set() + with open(target_merged_faa, 'w+') as file: + for faa_file in all_faa: + for query, seq in read_protein_fasta_generator(faa_file): + if query not in already_added: + line = f'>{query}\n{seq}\n' + file.write(line) + already_added.add(query) + + def merge_metadata_files(self, target_metadata_file, all_metadata): + already_added = set() + with open(target_merged_faa, 'w+') as outfile: + for metadata_file in all_metadata: + with open(metadata_file) as infile: + for line in infile: + query = line.split('\t')[0] + if query not in already_added: + outfile.write(line) + already_added.add(query) + + def compile_NOGT_DMND(self): + if self.mantis_paths['NOG'][0:2] != 'NA': + for taxon in self.get_ref_taxon_ids('NOG'): + taxon_folder = self.mantis_paths['NOG'] + taxon + SPLITTER + taxon_fasta = f'{taxon_folder}{taxon}_merged.faa' + taxon_dmnd = f'{taxon_folder}{taxon}' + if not file_exists(f'{taxon_dmnd}.dmnd') and file_exists(taxon_fasta): + run_command(f'diamond makedb --in {taxon_fasta} -d {taxon_dmnd}') + + def compile_NOGG_DMND(self): + if self.mantis_paths['NOG'][0:2] != 'NA': + stdout_file = open(self.mantis_out, 'a+') + nogg_folder_path = add_slash(self.mantis_paths['NOG'] + 'NOGG') + target_metadata_file = f'{nogg_folder_path}metadata.tsv' + target_merged_faa = f'{nogg_folder_path}NOGG_merged.faa' + all_metadata = set() + all_faa = set() + for taxon_id in self.get_ncbi_domains(): + if taxon_id != 'NOGG': + if os.path.isdir(self.mantis_paths['NOG'] + taxon_id): + taxon_faa_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + f'{taxon_id}_merged.faa' + all_faa.add(taxon_faa_file) + taxon_metadata_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + 'metadata.tsv' + all_metadata.add(taxon_metadata_file) + + if not self.check_reference_exists('NOGG'): + if file_exists(nogg_folder_path): shutil.rmtree(nogg_folder_path) + Path(nogg_folder_path).mkdir(parents=True, exist_ok=True) + else: + print('NOGG already compiled, skipping...', flush=True, file=stdout_file) + return + print_cyan('Compiling global NOG diamond database ', flush=True, file=stdout_file) + self.merge_fasta_files(target_merged_faa, all_faa) + concat_files(target_metadata_file, all_metadata) + + nogg_dmnd = f'{nogg_folder_path}NOGG_merged' + if not file_exists(f'{nogg_dmnd}.dmnd'): + run_command(f'diamond makedb --in {target_merged_faa} -d {nogg_dmnd}', stdout_file=stdout_file) + + stdout_file.close() + + def compile_NOG_DMND(self): + if self.mantis_paths['NOG'][0:2] != 'NA': + folder_path = add_slash(self.mantis_paths['NOG']) + diamond_db = f'{folder_path}eggnog_proteins' + eggnog_proteins_path = f'{folder_path}eggnog_seqs.faa' + extract_seqs_command = f'diamond getseq -d {diamond_db} > {eggnog_proteins_path}' + if not file_exists(eggnog_proteins_path): + print('Extracting sequences from NOG Diamond database', flush=True, file=self.redirect_verbose) + run_command(extract_seqs_command, join_command=True, shell=True) + return self.create_fastas_NOG_DMND() + + def create_fastas_NOG_DMND(self, stdout_path=None): + eggnog_db_path = self.mantis_paths['NOG'] + "eggnog.db" + all_metadata = self.mantis_paths['NOG'] + 'metadata.tsv' + stdout_file = open(self.mantis_out, 'a+') + print('Generating metadata tsv', flush=True, file=stdout_file) + seqs_taxons = self.generate_metadata_diamond(eggnog_db_path, all_metadata) + print('Generating taxon specific metadata tsvs', flush=True, file=stdout_file) + stdout_file.close() + return seqs_taxons + + def clean_up_sql_results_ids_dmnd(self, sql_row, pfam_id_to_acc): + codes_to_exclude = ['IEA', 'ND'] + res = { + 'go': set(), + 'pfam': set(), + 'kegg_ko': set(), + 'cog': set(), + 'arcog': set(), + 'enzyme_ec': set(), + 'kegg_pathway': set(), + 'kegg_module': set(), + 'kegg_reaction': set(), + 'kegg_rclass': set(), + 'kegg_brite': set(), + 'cazy': set(), + 'bigg_reaction': set(), + 'description': set(), + 'eggnog': set(), + 'tcdb': set(), + } + seq_name, ogs, gene_ontology_gos, pfam_ids, kegg_ko, kegg_cog, kegg_ec, kegg_brite, kegg_rclass, kegg_tc, kegg_cazy, kegg_pathway, kegg_module, kegg_reaction, kegg_go = sql_row + if seq_name == '#member': return None, None, None + eggnog_hmms = set() + cogs = set() + arcogs = set() + taxons = set() + if ogs: + for query_hmm_taxon in ogs.split(','): + query_hmm, query_taxon = query_hmm_taxon.split('@') + eggnog_hmms.add(query_hmm) + query_cogs = find_cog(query_hmm) + query_arcogs = find_arcog(query_hmm) + if query_cogs: cogs.update(query_cogs) + if query_arcogs: arcogs.update(query_arcogs) + taxons.add(query_taxon) + if gene_ontology_gos: + # http://geneontology.org/docs/guide-go-evidence-codes/ + gene_ontology_gos_copy = str(gene_ontology_gos) + gene_ontology_gos_copy = gene_ontology_gos_copy.split(',') + for go_group in gene_ontology_gos_copy: + if '|' not in go_group: print(ids_command, '\n', go_group) + _, go, evidence_code = go_group.split('|') + if evidence_code not in codes_to_exclude: + res['go'].add(go.split(':')[1]) + if pfam_ids: + temp_pfam_ids = pfam_ids.split(',') + for tpi in temp_pfam_ids: + if tpi in pfam_id_to_acc: + res['pfam'].add(pfam_id_to_acc[tpi]) + if kegg_ko: + kegg_ko = kegg_ko.replace('ko:', '') + res['kegg_ko'].update(kegg_ko.split(',')) + if kegg_cog: res['cog'].update(kegg_cog.split(',')) + if kegg_ec: res['enzyme_ec'].update(kegg_ec.split(',')) + if kegg_brite: res['kegg_brite'].update(kegg_brite.split(',')) + if kegg_rclass: res['kegg_rclass'].update(kegg_rclass.split(',')) + if kegg_tc: res['tcdb'].update(kegg_tc.split(',')) + if kegg_cazy: res['cazy'].update(kegg_cazy.split(',')) + if kegg_pathway: res['kegg_pathway'].update(kegg_pathway.split(',')) + if kegg_module: res['kegg_module'].update(kegg_module.split(',')) + if kegg_reaction: res['kegg_reaction'].update(kegg_reaction.split(',')) + if kegg_go: res['go'].update(kegg_go.split(',')) + if eggnog_hmms: res['eggnog'].update(eggnog_hmms) + if cogs: res['cog'].update(cogs) + if arcogs: res['arcog'].update(arcogs) + + return seq_name, taxons, res + + # we generate a metadata.tsv with the functional annotation of each sequence + def generate_metadata_diamond(self, eggnog_db, target_sql_file): + stdout_file = open(self.mantis_out, 'a+') + codes_to_exclude = ['IEA', 'ND'] + res = {} + # this will convert protein names to pfam ids (which is typically what is used with mantis) + pfam_id_to_acc = self.pfam_id_to_acc() + connection = sqlite3.connect(eggnog_db) + sql_command = f'SELECT name,ogs, gos, pfam, kegg_ko,kegg_cog,kegg_ec,kegg_brite,kegg_rclass,kegg_tc,kegg_cazy,kegg_pathway,kegg_module,kegg_reaction,kegg_go FROM prots;' + print(f'Querying SQL:\n{sql_command}', flush=True, file=stdout_file) + cursor = connection.cursor() + cursor.execute(sql_command) + rows = self.yield_sql_rows(cursor) + with open(target_sql_file, 'w+') as file: + for row in rows: + seq_name, taxons, row_info = self.clean_up_sql_results_ids_dmnd(row, pfam_id_to_acc) + if seq_name: + for t in taxons: + if t not in res: res[t] = set() + res[t].add(seq_name) + + link_line = [seq_name, '|'] + for link_type in row_info: + for inner_link in row_info[link_type]: + inner_link = inner_link.strip() + if inner_link: + link_line.append(f'{link_type}:{inner_link}') + if len(link_line) > 2: + link_line = '\t'.join(link_line) + file.write(f'{link_line}\n') + stdout_file.close() + return res + + def prepare_queue_extract_fastas_DMND(self, seqs_taxons): + if self.mantis_paths['NOG'][0:2] != 'NA': + for taxon in self.get_ref_taxon_ids('NOG'): + if taxon in seqs_taxons: + current_seqs = seqs_taxons[taxon] + taxon_folder = self.mantis_paths['NOG'] + taxon + SPLITTER + taxon_fasta = f'{taxon_folder}{taxon}_merged.faa' + taxon_dmnd = f'{taxon_folder}{taxon}_merged.dmnd' + taxon_metadata = f'{taxon_folder}metadata.tsv' + if not file_exists(taxon_fasta) or \ + not file_exists(taxon_metadata) or \ + not file_exists(taxon_dmnd): + self.queue.append([taxon, current_seqs, self.mantis_out]) + + def worker_extract_fastas_DMND(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + folder_path = add_slash(self.mantis_paths['NOG']) + all_proteins_path = f'{folder_path}eggnog_seqs.faa' + all_metadata_path = f'{folder_path}metadata.tsv' + taxon, taxon_seqs, stdout_path = record + self.create_fasta_DMND(taxon, taxon_seqs, all_proteins_path, all_metadata_path, stdout_path) + + def create_fasta_DMND(self, taxon, taxon_seqs, all_proteins_path, all_metadata_path, stdout_path): + + stdout_file = open(stdout_path, 'a+') + sequences_generator = yield_target_seqs(all_proteins_path, taxon_seqs) + metadata_generator = yield_target_metadata(all_metadata_path, taxon_seqs) + taxon_folder = self.mantis_paths['NOG'] + taxon + SPLITTER + taxon_faa = f'{taxon_folder}{taxon}_merged.faa' + taxon_metadata = f'{taxon_folder}metadata.tsv' + + Path(taxon_folder).mkdir(parents=True, exist_ok=True) + + with open(taxon_faa, 'w+') as seq_file: + for seq in sequences_generator: + seq_file.write(seq) + + with open(taxon_metadata, 'w+') as met_file: + for met_seq in metadata_generator: + met_file.write(met_seq) + + stdout_file.close() diff --git a/mantis/exceptions.py b/mantis/exceptions.py new file mode 100644 index 0000000..80220ab --- /dev/null +++ b/mantis/exceptions.py @@ -0,0 +1,28 @@ +NoValidFiles = 'No valid files to annotate' +TargetFileNotFound = 'Target file does not exist!\n' +InvalidTargetFile = 'You did not insert a valid target file!\n' +InvalidFastaFormat = 'Fasta format is not valid!\n' +InstallationCheckNotPassed = 'Installation check not passed! Make sure you\'ve setup the databases and your system meets all the requirements!' +SQLCheckNotPassed = 'SQL check not passed! Make sure you\'ve setup the databases and your system meets all the requirements!' +CythonNotCompiled = 'Cython has not been correctly compiled! Please run:\n' +BadNumberWorkers = 'You should not be seeing this, please contact the developer. Invalid number of workers in ' +ConnectionError = 'Could not connect to url:\n' +InvalidTranslation = 'Invalid residues for translation. Please make sure you provided a CDS of DNA or RNA in the target file:\n' +InvalidGFFVersion = 'No valid GFF version found!' +InvalidNOGType = 'Your config file does not contain a valid database type for NOG (i.e. or )!' + + +class SQLCheckNotPassed(Exception): + def __init__(self): + self.message = 'SQL check not passed!' + + def __str__(self): + return self.message + + +class InstallationCheckNotPassed(Exception): + def __init__(self): + self.message = 'Installation check not passed! Make sure you\'ve setup the databases' + + def __str__(self): + return self.message diff --git a/mantis/homology_processor.py b/mantis/homology_processor.py new file mode 100644 index 0000000..7520cb5 --- /dev/null +++ b/mantis/homology_processor.py @@ -0,0 +1,762 @@ +try: + from mantis.assembler import * +except: + from assembler import * + +try: + from mantis.cython_src.get_non_overlapping_hits import get_non_overlapping_hits +except: + if not cython_compiled(): + compile_cython() + try: + from mantis.cython_src.get_non_overlapping_hits import get_non_overlapping_hits + except: + kill_switch(CythonNotCompiled, f'{MANTIS_FOLDER}mantis{SPLITTER}utils.py') + + +# This class will process the domtblout output and get the best hit for our queries, it is inherited by the MANTIS + +class Homology_processor(): + + def create_chunk_output_dirs(self, chunk_dir): + to_create = ['searchout'] + if self.keep_files: + to_create.append('output_hmmer') + for hmmer_dir in to_create: + if not file_exists(chunk_dir + hmmer_dir): + Path(chunk_dir + hmmer_dir).mkdir(parents=True, exist_ok=True) + + ######SPLITTING TARGET FASTA INTO CHUNKS###### + def chunk_dict_generator(self, protein_seqs, seq_chunks): + res = {} + for seq in seq_chunks: + res[seq] = protein_seqs[seq] + return res + + ######CHECKING HMMER JOBS###### + + def get_taxon_chunks(self, taxon_id, domtblout_path, db): + all_domtblout_with_chunks = os.listdir(domtblout_path) + res = [] + for domtblout in all_domtblout_with_chunks: + if db + str(taxon_id) in domtblout: + res.append(domtblout) + return res + + def taxon_annotation_finished(self, target_ref, output_folder, chunks_n): + c = 0 + domtblout_folder = add_slash(add_slash(output_folder) + 'searchout') + domtblout_files = os.listdir(domtblout_folder) + for domtblout in domtblout_files: + if target_ref in domtblout: + if '_finished' in domtblout: c += 1 + if c == chunks_n: + for domtblout in domtblout_files: + if target_ref in domtblout: + if '_finished' in domtblout: + move_file(domtblout_folder + domtblout, domtblout_folder + domtblout.strip('_finished')) + return True + else: + sleep(5) + return False + + def save_temp_fasta_length(self, chunk_dir, hmm, count_seqs, db): + with open(f'{chunk_dir}missing_annotation.{db}.length', 'a+') as file: + file.write(hmm + '\t' + str(count_seqs) + '\n') + + ########To merge domtblout + + def group_output_chunks(self, searchout_path, all_searchout_with_chunks, chunk_suffix): + # grouping the domtblout into the corresponding original 'hmm' + res = {} + for searchout in all_searchout_with_chunks: + if 'chunk' in searchout: + ref_name = searchout.split('_chunk_')[0] + ref_searchout = ref_name + chunk_suffix + if ref_searchout not in res: res[ref_searchout] = [] + res[ref_searchout].append(searchout_path + searchout) + return res + + def merge_output_chunks(self, searchout_path, all_searchout_with_chunks, chunk_suffix, stdout_file=None): + grouped_searchout = self.group_output_chunks(searchout_path, all_searchout_with_chunks, chunk_suffix) + for searchout in grouped_searchout: + concat_files(output_file=searchout_path + searchout, list_file_paths=grouped_searchout[searchout], + stdout_file=stdout_file) + for to_delete in grouped_searchout[searchout]: + os.remove(to_delete) + + def get_dmndout_line(self, line): + res = line.strip('\n').split() + if res: return res[0] + + def get_domtblout_line(self, line): + if line[0] != '#': + res = line.strip('\n').split()[0:22] + if '---' not in line: + if len(res) == 22: + return res[0] + else: + print('BAD LINE', line) + + def split_hits(self, searchout_path, domtblout, chunks_domtblout, current_chunk_dir, worker_count): + # split the hits into different chunk allows for memory saving during hit processing + split_searchout_path = searchout_path.replace('raw_searchout', 'searchout') + list_of_files = [f'{split_searchout_path}{domtblout}_chunk_{i}' for i in range(worker_count)] + Path(f'{current_chunk_dir}searchout').mkdir(parents=True, exist_ok=True) + hit_to_file = {} + file_yielder = yield_file(list_of_files) + for chunk_searchout in chunks_domtblout: + chunk_searchout_path = searchout_path + chunk_searchout + if '.dmndout' in chunk_searchout: + read_function = self.get_dmndout_line + elif '.domtblout' in chunk_searchout: + read_function = self.get_domtblout_line + with open(chunk_searchout_path) as file: + line = file.readline() + while line: + query_name = read_function(line) + if query_name: + if query_name not in hit_to_file: + file_name = next(file_yielder) + hit_to_file[query_name] = file_name + out_file_path = hit_to_file[query_name] + with open(out_file_path, 'a+') as opened_file: + opened_file.write(line) + line = file.readline() + if not self.keep_files: + os.remove(chunk_searchout_path) + + ########To calculate new evalue + + def get_temp_fasta_length(self, chunk_dir, domtblout, db): + ref = domtblout.split('_')[0] + with open(f'{chunk_dir}missing_annotation.{db}.length') as file: + line = file.readline() + while line: + ref_key, ref_len = line.split('\t') + if ref == ref_key: return int(ref_len) + line = file.readline() + + def remove_temp_fasta_length(self, chunk_dir, db): + if file_exists(f'{chunk_dir}missing_annotation.{db}.length'): + os.remove(f'{chunk_dir}missing_annotation.{db}.length') + + def remove_annotated_queries(self, missing_queries, annotated_queries): + for p in annotated_queries: + if p in missing_queries: + del missing_queries[p] + + def generate_temp_fasta(self, missing_queries, output_folder, db): + temp_path = f'{output_folder}missing_annotations.{db}.tmp' + if file_exists(temp_path): + remove_temp_fasta(temp_path, db) + with open(temp_path, 'w+') as file: + for mq in missing_queries: + chunks = [missing_queries[mq][x:x + 60] for x in range(0, len(missing_queries[mq]), 60)] + chunk_str = '\n'.join(i for i in chunks) + file.write('>' + mq + '\n' + chunk_str + '\n') + return temp_path + + ########Processing HMMER hits + + def is_overlap(self, temp_queries, current_query): + # the coordinates here already take into account the overlap value, so even if the y set is small or empty, it doesnt matter + if not temp_queries or not current_query: return False + y_start, y_end = recalculate_coordinates(current_query['hit_start'], + current_query['hit_end'], + self.overlap_value) + y = set(range(y_start, y_end)) + + for t in temp_queries: + if t['hit_name'] == current_query['hit_name']: return True + x_start, x_end = recalculate_coordinates(t['hit_start'], + t['hit_end'], + self.overlap_value) + x = set(range(x_start, x_end)) + res = x.intersection(y) + if res: return True + return False + + def get_best_hits_approximation(self, query_hits, sorting_class, sorting_type): + ''' + this is just a lazy implementation when the amount of hits is too big and/or the hits are too small + even with multiprocessing and cython we may run into computationally unfeasable calculations + when this happens, we do generate a straightforward "best hit" + Best hit will take the lowest evalue hit and add the next lowest evalue hit (without overlapping) until we reach a point where this cycle cant be repeated. + This doesnt effectively calculate the "best hit", just a biased (since we start with lowest evalue as root) approximation + Still, it is a pretty good approximation anyhow + ''' + if sorting_class == 'consensus': + query_hits = self.sort_scaled_hits(query_hits, sorting_type=sorting_type) + else: + query_hits = self.sort_hits(query_hits, sorting_class, sorting_type=sorting_type) + combo = [] + while query_hits: + next_hit = query_hits.pop(0) + if sorting_class == 'consensus': + if not self.is_overlap_Consensus(combo, next_hit): + combo.append(next_hit) + elif sorting_class == 'processor': + if not self.is_overlap(combo, next_hit): + combo.append(next_hit) + return combo + + def get_lowest_hit(self, query_hits, sorting_class, sorting_type): + ''' + this will take the hit with the lowest evalue, regardless if there are multiple domains + ''' + if not query_hits: return [] + if sorting_class == 'consensus': + query_hits = self.sort_scaled_hits(query_hits, sorting_type=sorting_type) + else: + query_hits = self.sort_hits(query_hits, sorting_class, sorting_type=sorting_type) + lowest_hit = query_hits.pop(0) + combo = [lowest_hit] + return combo + + def cython_to_query_hits(self, cython_hits, conversion_dict): + res = [] + for hit in cython_hits: + res.append(conversion_dict[hit[0]]) + return res + + def get_min_max_dfs(self, cython_possible_combos, conversion_dict, sorting_class): + min_val = None + max_val = None + for len_combo in cython_possible_combos: + for cython_combo in cython_possible_combos[len_combo]: + combo = self.cython_to_query_hits(cython_combo, conversion_dict) + for hit in combo: + if sorting_class == 'processor': + hit_info = hit + elif sorting_class == 'consensus': + hmm_file, hmm_hit, hit_info = hit + # we dont consider 0 for the scaling, 0 will always be scaled to max/1 + if hit_info[self.sorting_type]: + current_val = log10(hit_info[self.sorting_type]) + if min_val is None: min_val = current_val + if max_val is None: max_val = current_val + if current_val > max_val: max_val = current_val + if current_val < min_val: min_val = current_val + # lower is best + if self.sorting_type == 'evalue': + return max_val, min_val + # higher is best + elif self.sorting_type == 'bitscore': + return min_val, max_val + + def get_combo_score(self, combo, query_length, min_val, max_val): + + average_value = 0 + average_hit_coverage = 0 + hit_ranges = [] + for hit in combo: + hit_start, hit_end, ref_start, ref_end = hit['hit_start'], hit['hit_end'], \ + hit['ref_start'], hit['ref_end'] + ref_len = hit['ref_len'] + hit_evalue = hit['evalue'] + hit_bitscore = hit['bitscore'] + hit_coverage = (hit_end - hit_start + 1) / query_length + ref_coverage = (ref_end - ref_start + 1) / ref_len + average_hit_coverage += hit_coverage + # lower is better + if self.sorting_type == 'evalue': + if hit_evalue: + scaled_value = min_max_scale(log10(hit_evalue), min_val, max_val) + 0.01 + else: + scaled_value = 2 + # higher is better + elif self.sorting_type == 'bitscore': + if hit_bitscore: + scaled_value = min_max_scale(log10(hit_bitscore), min_val, max_val) + 0.01 + else: + scaled_value = 0 + average_value += scaled_value + hit_ranges.append([hit_start, hit_end]) + + # sequence hit quality + average_value = average_value / len(combo) + # average coverage of all hits + average_hit_coverage /= len(combo) + # coverage of combination + combination_coverage = get_combination_ranges(hit_ranges) + combination_coverage /= query_length + ''' + average_hit_coverage 0-1 + combination_coverage 0-1 + average_value 0-1 and exceptionally above when e-value is 0 + + the user is not meant to choose a formula, this was written for internal quality testing + for 1e-3 formula 1 with bitscore and dfs algorithm produced the best results + ''' + if self.best_combo_formula == 1: # default + combo_score = average_value + elif self.best_combo_formula == 2: + combo_score = average_value * combination_coverage * average_hit_coverage + elif self.best_combo_formula == 3: + combo_score = (average_value * combination_coverage * average_hit_coverage) ** (1 / 3) + elif self.best_combo_formula == 4: + combo_score = average_value * combination_coverage + elif self.best_combo_formula == 5: + combo_score = (average_value * combination_coverage) ** (1 / 2) + elif self.best_combo_formula == 6: + combo_score = (average_value + average_hit_coverage + combination_coverage) / 3 + elif self.best_combo_formula == 7: + combo_score = (2 * average_value + average_hit_coverage + combination_coverage) / 4 + elif self.best_combo_formula == 8: + combo_score = (2 * average_value + combination_coverage) / 3 + elif self.best_combo_formula == 9: + combo_score = (3 * average_value + average_hit_coverage + combination_coverage) / 5 + elif self.best_combo_formula == 10: + combo_score = (3 * average_value + combination_coverage) / 4 + elif self.best_combo_formula == 11: + combo_score = average_value * (combination_coverage) ** 2 * (average_hit_coverage) ** 2 + elif self.best_combo_formula == 12: + combo_score = average_value * (combination_coverage) ** 2 + + return combo_score + + def get_best_combo_bit_score(self, good_combos): + best_bitscore = None + best_combo = None + for combo in good_combos: + combo_bitscore = 0 + for hit in combo: + combo_bitscore += hit['bitscore'] + combo_bitscore /= len(combo) + if best_bitscore is None: + best_bitscore = combo_bitscore + best_combo = combo + else: + if combo_bitscore > best_bitscore: + best_combo = combo + return best_combo + + def get_best_hits(self, query_hits, query_length, sorting_class): + ''' + we take into consideration the coverage of our hits and their evalue + steps: + 1- get all possible combinations of hits + 2- check which possible combinations don't overlap + 3- get best combination (best evalue and coverage) + ''' + ordered_query_hits = self.sort_hits(query_hits, sorting_class=sorting_class, sorting_type=self.sorting_type) + cython_hits, conversion_dict = self.query_hits_to_cython(ordered_query_hits) + cython_possible_combos = get_non_overlapping_hits(cython_hits, time_limit=self.time_limit) + # sometimes this calculation is not feasible (when we have too many small hits and the query sequence is too long, the calculation of conbinations would take forever - limit of 5mins) + if not cython_possible_combos: return None + best_hit_score = 0 + best_combo = None + min_val, max_val = self.get_min_max_dfs(cython_possible_combos, conversion_dict, sorting_class='processor') + good_combos = [] + for len_combo in cython_possible_combos: + if len_combo: + for cython_combo in cython_possible_combos[len_combo]: + combo = self.cython_to_query_hits(cython_combo, conversion_dict) + combo_score = self.get_combo_score(combo, query_length, min_val, max_val) + if combo_score > 1.5: good_combos.append(combo) + if not best_combo: + best_combo = combo + best_hit_score = combo_score + elif combo_score > best_hit_score: + best_hit_score = combo_score + best_combo = combo + # some sequences will have extremely high scores against multiple profiles (e-value of 0), when this is the case, and the combo score is also high, we use bitscore to select the best + # this can only be done here because we are comparing the same db + if self.sorting_type == 'evalue': + if good_combos: + best_combo = self.get_best_combo_bit_score(good_combos) + return best_combo + + def query_hits_to_cython(self, query_hits): + ''' + keep in mind this is not reproducible since sets are used in cython + one could keep the original order but this would be extra overhead, which is probably not needed since it would only happen with outliers + plus, the dfs is no longer default + ''' + conversion_dict = {} + res = set() + for hit_i in range(len(query_hits)): + hit_start, hit_end = recalculate_coordinates(query_hits[hit_i]['hit_start'], + query_hits[hit_i]['hit_end'], + self.overlap_value) + hit_name = query_hits[hit_i]['hit_name'] + res.add(tuple([ + hit_i, + hit_start, + hit_end, + hit_name + ])) + conversion_dict[hit_i] = query_hits[hit_i] + return res, conversion_dict + + def sort_hits(self, query_hits, sorting_class, sorting_type): + # we do a 2 step sorting to break ties (very frequent in evalue, less so in bitscore) + if not query_hits: return query_hits + # first we sort by the preferred sorting type + if sorting_type == 'bitscore': + to_reverse = True + else: + to_reverse = False + if sorting_class == 'processor': + sorted_hits = sorted(query_hits, key=lambda k: k[sorting_type], reverse=to_reverse) + elif sorting_class == 'consensus': + sorted_hits = sorted(query_hits, key=lambda k: k[2][sorting_type], reverse=to_reverse) + res = [] + # then we separate by sorting value + sorted_hits_groups = [] + c = 0 + for i in sorted_hits: + if sorting_class == 'processor': + hit_value = i[sorting_type] + elif sorting_class == 'consensus': + hit_value = i[2][sorting_type] + if not sorted_hits_groups: + sorted_hits_groups.append([]) + current = hit_value + if hit_value == current: + sorted_hits_groups[c].append(i) + else: + sorted_hits_groups.append([i]) + c += 1 + current = hit_value + # then we sort each subgroup by the secondary sorting type + sec_sorting_type = 'bitscore' if sorting_type == 'evalue' else 'evalue' + if sec_sorting_type == 'bitscore': + to_reverse = True + else: + to_reverse = False + for sg in sorted_hits_groups: + if sorting_class == 'processor': + temp = sorted(sg, key=lambda k: k[sec_sorting_type], reverse=to_reverse) + elif sorting_class == 'consensus': + temp = sorted(sg, key=lambda k: k[2][sec_sorting_type], reverse=to_reverse) + res.extend(temp) + return res + + def calculate_evalue_threshold(self, query_len, diamond=False): + # See : https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-3416-y + # if the user sets evalue threshold, we use that + if self.evalue_threshold and self.evalue_threshold != 'dynamic': + return self.evalue_threshold + # if the user sets the evalue threshold to dynamic OR if the user doesnt set evalue threshold and we are analysing diamond's output + elif self.evalue_threshold == 'dynamic' or diamond: + # this somewhat translates to the findings in the paper + res = float('1e-' + str(ceil(query_len / 10))) + # for very small sequences, we set a hard threshold + if res > self.default_evalue_threshold: res = self.default_evalue_threshold + return res + # if the user doesnt set the evalue threshold and we are analysing hmmer's output + else: + return self.default_evalue_threshold + + def recalculate_evalue(self, original_evalue, to_divide, to_multiply): + ''' + keep in mind these calculations will result in minor rounding errors! + For example in 3 different executions for the same query: + When splitting the fasta into chunks: + -one time the fasta had 20 sequences and an i-evalue of 5.98e-52 + - another time the fasta had 24 sequences and an i-evalue of 5.7e-52 + - when the fasta was not split we got 5.8 e-52 + Ideally we would not split the fasta into chunks; this would avoid rounding errors but would not allow parallelization + ''' + if self.force_evalue: + return original_evalue + else: + evalue = float(original_evalue) + evalue /= to_divide + evalue *= to_multiply + if evalue >= self.minimum_evalue_threshold: + evalue = self.minimum_evalue_threshold + return evalue + + def read_searchout(self, output_path, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, + get_output=True): + if '.dmndout' in output_path: + return self.read_dmndout(output_path=output_path, count_seqs_chunk=count_seqs_chunk, + count_residues_original_file=count_residues_original_file, get_output=get_output) + elif '.domtblout' in output_path: + return self.read_domtblout(output_path=output_path, count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, get_output=get_output) + + def read_dmndout(self, output_path, count_seqs_chunk, count_residues_original_file, get_output=True): + res = {} + hit_counter = set() + # reading the output file + with open(output_path, 'r', encoding='utf8') as file: + line = file.readline().strip('\n') + while line: + # try: + if True: + line = line.split() + query_name = line[0] + query_len = int(line[1]) + ref_name = line[2] + ref_len = int(line[3]) + query_start = int(line[4]) + query_end = int(line[5]) + ref_start = int(line[6]) + ref_end = int(line[7]) + evalue = float(line[8]) + bitscore = float(line[9]) + # diamond's evalue is not scaled, so we need to multiply by residues count + evalue = self.recalculate_evalue(evalue, self.diamond_db_size, count_residues_original_file) + # hmmer's coordinates on the seq are always the same, regardless of direction + direction = '+' + if query_start < query_end: + corrected_query_start = query_start + corrected_query_end = query_end + else: + direction = '-' + corrected_query_start = query_end + corrected_query_end = query_start + + if get_output: + if query_name not in res: res[query_name] = {'query_len': query_len, 'hits': []} + # when processing mg data, the evalue of hits for chunks should be higher than what it really is (because chunk has less seqs = more significant e-value) + if evalue <= self.calculate_evalue_threshold(int(query_len), diamond=True): + hit_dict = {'hit_name': ref_name, + 'hit_accession': '', + 'ref_len': ref_len, + 'evalue': evalue, + 'bitscore': bitscore, + 'direction': direction, + 'ref_start': ref_start if ref_start < ref_end else ref_end, + 'ref_end': ref_end if ref_end > ref_start else ref_start, + 'hit_start': corrected_query_start, + 'hit_end': corrected_query_end, + } + if get_output: + res[query_name]['hits'].append(hit_dict) + hit_counter.add(query_name) + # except: + # print(f'Could not read line: {line} in file', output_path, flush=True) + line = file.readline().strip('\n') + return res, hit_counter + + def read_domtblout(self, output_path, count_seqs_chunk, count_seqs_original_file, get_output=True): + res = {} + hit_counter = set() + # reading the output file + with open(output_path, 'r', encoding='utf8') as file: + line = file.readline().strip('\n') + while line: + if line[0] != '#': + # try: + if True: + # after 22 it's just the free text of the annotation description + line = line.split()[0:22] + if len(line) == 22: + query_name = line[0] + query_len = int(line[2]) + hmm_name = line[3] + hmm_accession = line[4] + hmm_len = int(line[5]) + # for these two we accept multiple hits + if self.domain_algorithm in ['dfs', 'heuristic']: + evalue = float(line[12]) + bitscore = float(line[13]) + # here we want the full sequence best + else: + evalue = float(line[6]) + bitscore = float(line[7]) + evalue = self.recalculate_evalue(evalue, count_seqs_chunk, count_seqs_original_file) + hmm_coord_from = int(line[15]) + hmm_coord_to = int(line[16]) + # ali_coord_from = int(line[17]) + # ali_coord_to = int(line[18]) + # we will use envelop coords as per HMMER's manual recommendation + env_coord_from = int(line[19]) + env_coord_to = int(line[20]) + # hmmer's coordinates on the seq are always the same, regardless of direction + direction = '+' + if env_coord_from < env_coord_to: + corrected_env_coord_from = env_coord_from + corrected_env_coord_to = env_coord_to + else: + direction = '-' + corrected_env_coord_from = env_coord_to + corrected_env_coord_to = env_coord_from + if get_output: + if query_name not in res: res[query_name] = {'query_len': query_len, 'hits': []} + # when processing mg data, the evalue of hits for chunks should be higher than what it really is (because chunk has less seqs = more significant e-value) + if evalue <= self.calculate_evalue_threshold(int(query_len)): + hit_dict = {'hit_name': hmm_name, + 'hit_accession': hmm_accession, + 'ref_len': hmm_len, + 'evalue': evalue, + 'bitscore': bitscore, + 'direction': direction, + 'ref_start': hmm_coord_from if hmm_coord_from < hmm_coord_to else hmm_coord_to, + 'ref_end': hmm_coord_to if hmm_coord_to > hmm_coord_from else hmm_coord_from, + 'hit_start': corrected_env_coord_from, + 'hit_end': corrected_env_coord_to, + } + if get_output: + res[query_name]['hits'].append(hit_dict) + hit_counter.add(query_name) + # except: + # print(f'Could not read line: {line} in file', output_path, flush=True) + line = file.readline().strip('\n') + return res, hit_counter + + def process_searchout(self, output_path, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, + stdout_path=None): + ''' + this will read the domtblout file and export: + query -> all hits with their respective evalue and hit start and end + after we have this , we invoke get_best_hits to get our result + + we use i_evalue as per the hmmer manual + + this function can read raw and concatenated domtblout files + ''' + if isinstance(stdout_path, str): + stdout_file = open(stdout_path, 'a+') + else: + stdout_file = stdout_path + print('------------------------------------------', flush=True, file=stdout_file) + print('Processing hits for:\n' + output_path, flush=True, file=stdout_file) + queries_searchout, hit_counter = self.read_searchout(output_path=output_path, count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + count_residues_original_file=count_residues_original_file) + hit_counter = len(hit_counter) + # processing the hits and getting the best hit/non-overlapping hits + print(f'Found {hit_counter} hits in:\n{output_path}\nWill now get best hits!', flush=True, file=stdout_file) + approximated_hits = [] + ref_db = get_path_level(output_path, remove_extension=True) + res_annotation = {} + for query in queries_searchout: + if query not in res_annotation: res_annotation[query] = {} + list_hits = queries_searchout[query].pop('hits') + if self.domain_algorithm == 'heuristic': + best_hit = self.get_best_hits_approximation(list(list_hits), sorting_class='processor', + sorting_type=self.sorting_type) + elif self.domain_algorithm == 'bpo': + best_hit = self.get_lowest_hit(list(list_hits), sorting_class='processor', + sorting_type=self.sorting_type) + else: + try: + best_hit = self.get_best_hits(list(list_hits), queries_searchout[query]['query_len'], + sorting_class='processor') + except (TimeoutError, RecursionError) as e: + # heuristic with bitscore produces bad results, so we force evalue sorting + best_hit = self.get_best_hits_approximation(list(list_hits), sorting_class='processor', + sorting_type='evalue') + approximated_hits.append(query) + queries_searchout[query]['best_hit'] = best_hit + res_annotation[query][ref_db] = queries_searchout[query] + if approximated_hits: + approximated_hits = ' ; '.join(approximated_hits) + print(f'Some hits in the output file:\n{output_path} were approximated. They were:\n{approximated_hits}', + flush=True, file=stdout_file) + print(f'Finished processing hits for:\n{output_path}', flush=True, file=stdout_file) + print('------------------------------------------', flush=True, file=stdout_file) + if isinstance(stdout_path, str): stdout_file.close() + return res_annotation + + def save_processed_hits(self, annotation_output, output_dir, domtblout): + annotation_output_file = output_dir + domtblout.replace('domtblout', 'pro') + try: + os.remove(annotation_output_file) + except: + pass + with open(annotation_output_file, 'w') as output_file: + first_line = ['Query', + 'Ref_file', + 'Ref_hit', + 'Ref_hit_accession', + 'evalue', + 'bitscore', + 'Direction', + 'Query_length', + 'Query_hit_start', + 'Query_hit_end', + 'Ref_hit_start', + 'Ref_hit_end', + 'Ref_length', + ] + first_line = '\t'.join(first_line) + '\n' + output_file.write(first_line) + for query in annotation_output: + for hmm_file in annotation_output[query]: + # some hmm hits won't pass the i-evalue threshold, so we dont consider them + if '_chunk_' in hmm_file: + str_hmm_file = hmm_file.split('_chunk_')[0] + else: + str_hmm_file = hmm_file + if annotation_output[query][hmm_file]['best_hit']: + for hit in annotation_output[query][hmm_file]['best_hit']: + line = [str(query), + str(str_hmm_file), + str(hit['hit_name']), + str(hit['hit_accession']), + str(hit['evalue']), + str(hit['bitscore']), + str(hit['direction']), + str(annotation_output[query][hmm_file]['query_len']), + str(hit['hit_start']), + str(hit['hit_end']), + str(hit['ref_start']), + str(hit['ref_end']), + str(hit['ref_len']), + ] + line = '\t'.join(line) + output_file.write(line + '\n') + + def merge_gff_output(self, output_folder, output_file, chunks_path): + all_seq_regions = [] + gff_version = None + for chunk_output in chunks_path: + target_chunk_output = chunk_output + output_file + target_chunk_output = target_chunk_output.replace('.tsv', '.gff') + with open(target_chunk_output, 'r') as chunk_file: + chunk_line = chunk_file.readline() + while chunk_line: + if '##gff-version' in chunk_line: + gff_version = chunk_line + if '##sequence-region' in chunk_line: + all_seq_regions.append(chunk_line) + chunk_line = chunk_file.readline() + if not gff_version: + kill_switch(InvalidGFFVersion, flush=True, file=self.redirect_verbose) + output_gff_file = f'{output_folder}{output_file}'.replace('.tsv', '.gff') + with open(output_gff_file, 'w+') as file: + file.write(gff_version) + for seq_reg in all_seq_regions: + file.write(seq_reg) + for chunk_output in chunks_path: + target_chunk_output = chunk_output + output_file + target_chunk_output = target_chunk_output.replace('.tsv', '.gff') + with open(target_chunk_output, 'r') as chunk_file: + chunk_line = chunk_file.readline() + while chunk_line: + if not chunk_line.startswith('##'): + file.write(chunk_line) + chunk_line = chunk_file.readline() + + def merge_target_output(self, output_file, output_folder, chunks_path, stdout_file, same_output=True): + print(f'Merging chunks to {output_folder}{output_file}', flush=True, file=stdout_file) + if self.output_gff and ('consensus' in output_file or 'integrated' in output_file): + self.merge_gff_output(output_folder, output_file, chunks_path) + header = False + with open(output_folder + output_file, 'w+') as file: + for chunk_output in chunks_path: + if same_output: + target_chunk_output = chunk_output + output_file + else: + target_chunk_output = chunk_output + with open(target_chunk_output, 'r') as chunk_file: + chunk_line = chunk_file.readline() + while chunk_line: + if 'Query' in chunk_line: + if not header: + file.write(chunk_line) + header = True + else: + file.write(chunk_line) + chunk_line = chunk_file.readline() + + +if __name__ == '__main__': + hmm_pro = Homology_processor() diff --git a/mantis/mantis.py b/mantis/mantis.py new file mode 100644 index 0000000..2dd3f27 --- /dev/null +++ b/mantis/mantis.py @@ -0,0 +1,550 @@ +try: + from mantis.multiprocessing import * +except: + from multiprocessing import * + + +def print_citation_mantis(): + paper_doi = 'https://doi.org/10.1093/gigascience/giab042' + separator = '##########################################################################################################################' + res = f'{separator}\n# Thank you for using Mantis, please make sure you cite the respective paper {paper_doi} #\n{separator}' + print(res) + + +def print_version(user, project): + import requests + response = requests.get(f"https://api.github.com/repos/{user}/{project}/releases/latest") + json = response.json() + if 'name' in json: + print(f'{project}\'s latest release is:', json['name']) + else: + print('No release available') + + +def run_mantis(input_path, + output_folder, + mantis_config=None, + evalue_threshold=None, + overlap_value=None, + minimum_consensus_overlap=None, + organism_details=None, + genetic_code=None, + domain_algorithm=None, + best_combo_formula=None, + sorting_type=None, + keep_files=False, + skip_consensus=False, + skip_managed_memory=False, + force_evalue=False, + no_consensus_expansion=False, + no_taxonomy=False, + no_unifunc=False, + kegg_matrix=False, + verbose_kegg_matrix=False, + output_gff=False, + verbose=True, + default_workers=None, + chunk_size=None, + time_limit=None, + hmmer_threads=None, + cores=None, + memory=None, + ): + if evalue_threshold: + if evalue_threshold != 'dynamic': evalue_threshold = float(evalue_threshold) + if overlap_value: overlap_value = float(overlap_value) + if minimum_consensus_overlap: minimum_consensus_overlap = float(minimum_consensus_overlap) + if best_combo_formula: best_combo_formula = int(best_combo_formula) + if default_workers: default_workers = int(default_workers) + if chunk_size: chunk_size = int(chunk_size) + if time_limit: time_limit = int(time_limit) + if hmmer_threads: hmmer_threads = int(hmmer_threads) + if cores: cores = int(cores) + if memory: memory = int(memory) + if genetic_code: genetic_code = int(genetic_code) + mantis = MANTIS( + input_path=input_path, + output_folder=output_folder, + mantis_config=mantis_config, + evalue_threshold=evalue_threshold, + overlap_value=overlap_value, + minimum_consensus_overlap=minimum_consensus_overlap, + organism_details=organism_details, + genetic_code=genetic_code, + domain_algorithm=domain_algorithm, + best_combo_formula=best_combo_formula, + sorting_type=sorting_type, + keep_files=keep_files, + skip_consensus=skip_consensus, + skip_managed_memory=skip_managed_memory, + force_evalue=force_evalue, + no_consensus_expansion=no_consensus_expansion, + no_taxonomy=no_taxonomy, + no_unifunc=no_unifunc, + kegg_matrix=kegg_matrix, + verbose_kegg_matrix=verbose_kegg_matrix, + output_gff=output_gff, + verbose=verbose, + default_workers=default_workers, + chunk_size=chunk_size, + time_limit=time_limit, + hmmer_threads=hmmer_threads, + user_cores=cores, + user_memory=memory, + ) + mantis.run_mantis() + + +def run_mantis_test(input_path, + output_folder, + mantis_config, + ): + mantis = MANTIS( + input_path=input_path, + output_folder=output_folder, + mantis_config=mantis_config, + keep_files=True) + mantis.run_mantis_test() + + +class MANTIS(Multiprocessing): + def __init__(self, + input_path=None, + output_folder=None, + mantis_config=None, + evalue_threshold=None, + overlap_value=None, + minimum_consensus_overlap=None, + domain_algorithm=None, + sorting_type=None, + best_combo_formula=None, + organism_details={}, + genetic_code=None, + redirect_verbose=None, + keep_files=False, + skip_consensus=False, + skip_managed_memory=False, + force_evalue=False, + no_consensus_expansion=False, + no_taxonomy=False, + no_unifunc=False, + kegg_matrix=False, + verbose_kegg_matrix=False, + output_gff=False, + verbose=True, + default_workers=None, + chunk_size=None, + time_limit=None, + hmmer_threads=None, + user_cores=None, + user_memory=None, + ): + self.output_folder = add_slash(output_folder) + self.redirect_verbose = redirect_verbose + print('------------------------------------------', flush=True, file=self.redirect_verbose) + print_cyan('Setting up Mantis!', flush=True, file=self.redirect_verbose) + print('------------------------------------------', flush=True, file=self.redirect_verbose) + self.input_path = input_path + self.mantis_config = mantis_config + + # Prediction parameters + self.evalue_threshold = evalue_threshold + self.default_evalue_threshold = 1e-3 # 1e-6 might be better a better default + self.minimum_evalue_threshold = 1e-2 + self.force_evalue = force_evalue + + if overlap_value: + self.overlap_value = overlap_value + else: + self.overlap_value = 0.1 + + if minimum_consensus_overlap: + self.minimum_consensus_overlap = minimum_consensus_overlap + else: + self.minimum_consensus_overlap = 0.7 + + if domain_algorithm: + self.domain_algorithm = domain_algorithm + else: + self.domain_algorithm = 'heuristic' + + if best_combo_formula: + self.best_combo_formula = best_combo_formula + else: + self.best_combo_formula = 1 + if hmmer_threads: + self.hmmer_threads = hmmer_threads + # 1 should be ideal if we are already using the maximum amount of cores with Mantis + else: + self.hmmer_threads = 1 + # the user can force the sorting type + if sorting_type: + self.sorting_type = sorting_type + else: + # but we recommend using bitscore for dfs, evalue for bpo or heuristic + if self.domain_algorithm == 'dfs': + self.sorting_type = 'bitscore' + else: + self.sorting_type = 'evalue' + self.organism_details = organism_details + self.genetic_code = genetic_code + # Execution parameters + self.skip_consensus = skip_consensus + self.skip_managed_memory = skip_managed_memory + self.no_consensus_expansion = no_consensus_expansion + self.no_unifunc = no_unifunc + self.kegg_matrix = kegg_matrix + self.verbose_kegg_matrix = verbose_kegg_matrix + self.output_gff = output_gff + if self.verbose_kegg_matrix: self.kegg_matrix = True + self.default_workers = default_workers + self.user_memory = user_memory + # chunk size is highly relevant in the execution time + self.chunk_size = chunk_size + if time_limit: + self.time_limit = time_limit + else: + self.time_limit = 60 + # diamond db size for scaling. we increase the db size to avoid overly good e-values, i.e., 0 where sample scaling by multiplication wouldn't change anything + self.diamond_db_size = 1e6 + print_cyan('Reading config file and setting up paths', flush=True, file=self.redirect_verbose) + Assembler.__init__(self, verbose=verbose, redirect_verbose=redirect_verbose, mantis_config=mantis_config, + keep_files=keep_files, user_cores=user_cores, no_taxonomy=no_taxonomy) + datetime_str = str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + if self.input_path: + print_cyan(f'This MANTIS process started running at {datetime_str}', flush=True, file=self.redirect_verbose) + self.chunks_to_annotate = [] + self.chunks_to_fasta = {} + self.fastas_to_annotate = [] + self.print_available_hardware() + + def print_available_hardware(self): + if self.user_cores: + print(f'Cores allocated: {self.user_cores}') + else: + print(f'Cores allocated: {ENVIRONMENT_CORES}') + if self.user_memory: + print(f'Memory allocated: {self.user_memory}') + else: + print(f'Memory allocated: {round(AVAILABLE_RAM, 2)}') + print(f'Workers per core: {WORKER_PER_CORE}') + + def __str__(self): + if self.kegg_matrix and not self.verbose_kegg_matrix: + kegg_matrix_str = 'Generate KEGG modules matrix:\t' + str(self.kegg_matrix) + '\n' + elif self.kegg_matrix and self.verbose_kegg_matrix: + kegg_matrix_str = 'Generate KEGG modules matrix in verbose mode:\t' + str(self.verbose_kegg_matrix) + '\n' + else: + kegg_matrix_str = '' + + output_list = [ + 'Output folder:\t\t\t' + str(self.output_folder) + '\n' if self.output_folder else '', + 'Mantis config:\t\t\t' + str(self.mantis_config) + '\n' if self.mantis_config else '', + 'Target path:\t\t\t' + str(self.input_path) + '\n' if self.input_path else '', + 'E-value threshold:\t\t' + str(self.evalue_threshold) + '\n' if self.evalue_threshold else '', + 'E-value threshold:\t\t' + str(self.default_evalue_threshold) + '\n' if not self.evalue_threshold else '', + 'Forcing e-value:\t\t' + str(self.force_evalue) + '\n' if self.force_evalue else '', + 'Overlap value:\t\t\t' + str(self.overlap_value) + '\n' if self.overlap_value else '', + 'Default workers:\t\t' + str(self.default_workers) + '\n' if self.default_workers else '', + 'User cores:\t\t\t' + str(self.user_cores) + '\n' if self.user_cores else '', + 'HMMER threads:\t\t\t' + str(self.hmmer_threads) + '\n' if self.hmmer_threads else '', + 'Chunk size:\t\t\t' + str(self.chunk_size) + '\n' if self.chunk_size else '', + 'Algorithm:\t\t\t' + str(self.domain_algorithm) + '\n' if self.domain_algorithm else '', + # 'Formula:\t\t\t' + str(self.best_combo_formula) + '\n' if self.best_combo_formula else '', + # 'Sorting type:\t\t\t' + str(self.sorting_type) + '\n' if self.sorting_type else '', + 'Outputting GFF:\t\t\t' + str(self.output_gff) + '\n' if self.output_gff else '', + 'Skip consensus:\t\t' + str(self.skip_consensus) + '\n' if self.skip_consensus else '', + 'Skip memory management:\t\t' + str(self.skip_managed_memory) + '\n' if self.skip_managed_memory else '', + 'Skip consensus expansion:\t' + str( + self.no_consensus_expansion) + '\n' if self.no_consensus_expansion else '', + 'Use taxonomy:\t' + str(self.use_taxonomy) + '\n' if not self.use_taxonomy else '', + 'Skip text similarity analysis:\t' + str(self.no_unifunc) + '\n' if self.no_unifunc else '', + kegg_matrix_str, + '------------------------------------------'] + return 'User configuration:' + '\n' + '------------------------------------------' + '\n' + ''.join(output_list) + + def generate_fastas_to_annotate(self): + if '.' not in self.input_path: + kill_switch(InvalidTargetFile, + 'Your file does not have an extension, so Mantis can\'t detect the file format.\nPlease provide a valid target file', + flush=True, file=self.redirect_verbose) + else: + if os.path.isdir(self.input_path): + self.annotate_directory() + elif self.input_path.endswith('.tsv'): + self.annotate_multiple_samples() + elif self.input_path.split('.')[-2] in ['tar']: + self.annotate_compressed_sample() + elif self.input_path.endswith('.gz') or self.input_path.endswith('.zip'): + self.annotate_compressed_sample() + elif is_fasta(self.input_path): + self.annotate_one_sample() + else: + kill_switch(InvalidTargetFile, + 'Your file does not appear to be a fasta. If you want to annotate multiple samples, make sure your file has the <.tsv> extension.', + flush=True, file=self.redirect_verbose) + if not self.fastas_to_annotate: kill_switch(NoValidFiles) + for file_path, output_path, organism_details, genetic_code, count_seqs_original_file, count_residues_original_file in self.fastas_to_annotate: + Path(output_path).mkdir(parents=True, exist_ok=True) + + def annotate_multiple_samples(self): + try: + with open(self.input_path) as file: + line = file.readline() + if SPLITTER not in line: + line = file.readline() + while line: + line = line.strip('\n').split('\t') + if len(line) >= 2: + query_name = line[0] + line_path = line[1] + if len(line) == 4: + genetic_code = ' '.join(line[-1]) + organism_details = ' '.join(line[2:-1]) + if organism_details == 'None': organism_details = '' + elif len(line) == 3: + organism_details = ' '.join(line[2:]) + if organism_details == 'None': organism_details = '' + genetic_code = None + else: + organism_details = None + genetic_code = None + count_seqs_original_file = get_seqs_count(line_path) + count_residues_original_file = count_residues(line_path) + if os.path.exists(line_path): + self.fastas_to_annotate.append([line_path, add_slash(self.output_folder + query_name), + organism_details, genetic_code, + count_seqs_original_file, count_residues_original_file]) + else: + kill_switch(TargetFileNotFound, + flush=True, file=self.redirect_verbose) + line = file.readline() + except: + kill_switch(InvalidTargetFile, + 'If you want to annotate multiple samples, make sure your file is correctly formatted. Please see the examples in the folder.', + flush=True, file=self.redirect_verbose) + + def annotate_directory(self): + try: + list_dir = os.listdir(self.input_path) + for file in list_dir: + if 'faa' in file.split('.')[-1]: + query_name = '.'.join(file.split('.')[0:-1]) + query_path = self.input_path + file + count_seqs_original_file = get_seqs_count(query_path) + count_residues_original_file = count_residues(query_path) + self.fastas_to_annotate.append([query_path, add_slash(self.output_folder + query_name), None, None, + count_seqs_original_file, count_residues_original_file]) + except: + kill_switch(InvalidTargetFile, 'Something went wrong when annotating the provided directory!', flush=True, + file=self.redirect_verbose) + + def annotate_compressed_sample(self): + try: + uncompressed_path = self.output_folder + 'uncompressed_samples/' + Path(uncompressed_path).mkdir(parents=True, exist_ok=True) + uncompressing_function = uncompress_archive(source_filepath=self.input_path, + extract_path=uncompressed_path) + list_dir = os.listdir(uncompressed_path) + for file in list_dir: + if os.path.isdir(uncompressed_path + file): + sub_list_dir = os.listdir(uncompressed_path + file) + for sub_file in sub_list_dir: + if 'faa' in sub_file.split('.')[-1]: + query_name = '.'.join(sub_file.split('.')[0:-1]) + query_path = add_slash(uncompressed_path + file) + sub_file + count_seqs_original_file = get_seqs_count(query_path) + count_residues_original_file = count_residues(query_path) + self.fastas_to_annotate.append( + [query_path, add_slash(self.output_folder + query_name), None, None, + count_seqs_original_file, count_residues_original_file]) + if 'faa' in file.split('.')[-1]: + query_name = '.'.join(file.split('.')[0:-1]) + query_path = uncompressed_path + file + count_seqs_original_file = get_seqs_count(query_path) + count_residues_original_file = count_residues(query_path) + self.fastas_to_annotate.append( + [query_path, add_slash(self.output_folder + query_name), None, None, + count_seqs_original_file, count_residues_original_file]) + except: + kill_switch(InvalidTargetFile, 'Something went wrong when annotating the provided compressed file!', + flush=True, file=self.redirect_verbose) + + def annotate_one_sample(self): + count_seqs_original_file = get_seqs_count(self.input_path) + count_residues_original_file = count_residues(self.input_path) + self.fastas_to_annotate.append( + [self.input_path, self.output_folder, self.organism_details, self.genetic_code, + count_seqs_original_file, count_residues_original_file]) + + def setup_organism_lineage(self, organism_details, stdout_file): + if not self.use_taxonomy: return [] + if not organism_details: + print_cyan('No data provided for organism lineage!', flush=True, file=stdout_file) + return [] + if re.match('\d+', organism_details): + print_cyan('Setting up organism lineage from provided NCBI taxon id', flush=True, file=stdout_file) + organism_lineage = self.fetch_ncbi_lineage(organism_details) + else: + print_cyan('Setting up organism lineage from provided taxon synonym or GTDB lineage', flush=True, + file=stdout_file) + organism_details_dict = {'synonyms': organism_details} + ncbi_taxon_id = self.get_taxa_ncbi(organism_details) + organism_lineage = self.fetch_ncbi_lineage(ncbi_taxon_id) + return organism_lineage + + def generate_translated_sample(self): + ncbi_resources = add_slash(self.mantis_paths['resources'] + 'NCBI') + translation_tables = parse_translation_tables(ncbi_resources + 'gc.prt') + for i in range(len(self.fastas_to_annotate)): + file_path, output_path, organism_details, genetic_code, count_seqs_original_file, count_residues_original_file = \ + self.fastas_to_annotate[i] + sample_type = check_sample_type(file_path) + if sample_type == 'dna' or sample_type == 'rna': + if not genetic_code: + genetic_code = 11 + translated_fasta_path = f'{output_path}translated_gc_{genetic_code}.faa' + try: + write_translated_fasta(original_fasta_path=file_path, translated_fasta_path=translated_fasta_path, + translation_table=translation_tables[genetic_code], sample_type=sample_type) + self.fastas_to_annotate[i][0] = translated_fasta_path + self.fastas_to_annotate[i][5] = count_residues(translated_fasta_path) + except Exception as e: + kill_switch(InvalidTranslation, file_path) + + def generate_sample_lineage(self): + self.start_taxonomy_connection() + for i in range(len(self.fastas_to_annotate)): + file_path, output_path, organism_details, genetic_code, count_seqs_original_file, count_residues_original_file = self.fastas_to_annotate[i] + stdout_file = open(output_path + 'Mantis.out', 'a+') + organism_lineage = self.setup_organism_lineage(organism_details, stdout_file) + self.fastas_to_annotate[i][2] = organism_lineage + print('------------------------------------------', flush=True, file=stdout_file) + if organism_lineage: + print_cyan('Target file:\n' + file_path + '\n has the following taxonomy lineage: ' + ' > '.join( + organism_lineage), flush=True, file=stdout_file) + else: + print_cyan('Target file:\n' + file_path + '\n has no organism lineage!', flush=True, file=stdout_file) + print('------------------------------------------', flush=True, file=stdout_file) + stdout_file.close() + + def remove_non_essential_files(self): + for file_path, output_path, organism_details, genetic_code, count_seqs_original_file, count_residues_original_file in self.fastas_to_annotate: + if file_exists(output_path + 'fasta_chunks/'): + shutil.rmtree(output_path + 'fasta_chunks/') + + def remove_uncompressed_files(self): + if file_exists(self.output_folder + 'uncompressed_samples/'): + shutil.rmtree(self.output_folder + 'uncompressed_samples/') + + @timeit_class + def run_mantis_test(self): + self.mantis_paths['custom'] = MANTIS_FOLDER + 'tests/test_hmm/' + self.output_gff = True + Path(self.output_folder).mkdir(parents=True, exist_ok=True) + self.generate_fastas_to_annotate() + self.generate_translated_sample() + self.generate_sample_lineage() + + self.split_sample() + + self.set_chunks_to_annotate() + + worker_count = 1 + for hmm_path in [MANTIS_FOLDER + 'tests/test_hmm/test1/test1.hmm', + MANTIS_FOLDER + 'tests/test_hmm/test2/test2.hmm', + MANTIS_FOLDER + 'tests/test_hmm/test3/test3.hmm', + ]: + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + self.create_chunk_output_dirs(current_chunk_dir) + command, output_file = self.compile_annotation_job(hmm_path, + target_path=chunk_path, + output_folder=current_chunk_dir, + count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file) + self.queue.append(['General', command, output_path + 'Mantis.out']) + + self.processes_handler(self.worker_annotation, worker_count) + + self.prepare_queue_split_hits(worker_count) + self.processes_handler(self.worker_split_hits, worker_count) + + self.prepare_queue_process_output() + self.processes_handler(self.worker_process_output, worker_count) + + self.prepare_queue_merge_output() + self.processes_handler(self.worker_merge_output, worker_count) + + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + output_annotation_tsv = current_chunk_dir + 'output_annotation.tsv' + self.queue.append([output_annotation_tsv, current_chunk_dir]) + self.processes_handler(self.worker_interpret_output, worker_count) + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + interepreted_annotation_tsv = current_chunk_dir + 'integrated_annotation.tsv' + stdout_file_path = output_path + 'Mantis.out' + self.queue.append([interepreted_annotation_tsv, current_chunk_dir, stdout_file_path]) + Consensus.__init__(self) + self.processes_handler(self.worker_consensus_output, worker_count) + + for output_path in self.chunks_to_fasta: + self.queue.append([output_path, self.chunks_to_fasta[output_path]]) + self.processes_handler(self.worker_merge_mantis_output, worker_count) + self.generate_matrix() + + print('Mantis ran sucessfully!', flush=True) + + @timeit_class + def run_mantis(self): + self.check_installation(verbose=True) + if not self.passed_check: + kill_switch(InstallationCheckNotPassed) + + Path(self.output_folder).mkdir(parents=True, exist_ok=True) + print_cyan('Reading input file', flush=True, file=self.redirect_verbose) + self.generate_fastas_to_annotate() + print_cyan('Determining lineage', flush=True, file=self.redirect_verbose) + self.generate_translated_sample() + self.generate_sample_lineage() + if self.use_taxonomy: + try: + self.close_taxonomy_connection() + except: + pass + self.split_sample() + self.set_chunks_to_annotate() + start_time = time() + self.run_homology_search() + print(f'Homology search took {int(time() - start_time)} seconds to run', flush=True, file=self.redirect_verbose) + processing_start_time = time() + start_time = time() + self.process_output() + print(f'Output processing took {int(time() - start_time)} seconds to run', flush=True, + file=self.redirect_verbose) + start_time = time() + self.interpret_output() + print(f'Output interpretation took {int(time() - start_time)} seconds to run', flush=True, + file=self.redirect_verbose) + if not self.skip_consensus: + start_time = time() + self.get_consensus_output() + print(f'Consensus generation took {int(time() - start_time)} seconds to run', flush=True, + file=self.redirect_verbose) + start_time = time() + self.merge_mantis_output() + print(f'Output merge took {int(time() - start_time)} seconds to run', flush=True, file=self.redirect_verbose) + start_time = time() + self.remove_uncompressed_files() + print(f'In total, Mantis took {int(time() - processing_start_time)} seconds to process homology search results', + flush=True, file=self.redirect_verbose) + if not self.keep_files: + print_cyan('Removing temporary files', flush=True, file=self.redirect_verbose) + self.remove_non_essential_files() + if self.kegg_matrix and not self.skip_consensus: + print('Generating KEGG modules matrix!', flush=True, file=self.redirect_verbose) + self.generate_matrix() + print(f'Mantis has finished running! It took {str(int(time() - self.start_time))} seconds to run.', flush=True, + file=self.redirect_verbose) + + +if __name__ == '__main__': + m = MANTIS() diff --git a/mantis/metadata.py b/mantis/metadata.py new file mode 100644 index 0000000..783789e --- /dev/null +++ b/mantis/metadata.py @@ -0,0 +1,519 @@ +try: + from mantis.assembler import * +except: + from assembler import * + + +class Metadata(): + + def get_target_custom_ref_paths(self, target, folder): + for custom_ref in self.get_custom_refs_paths(folder=folder): + if target in custom_ref: return custom_ref + + def is_good_annotation(self, to_add): + if 'unknown' in to_add: + return False + elif 'hypothetical protein' in to_add: + return False + elif 'hypothetical enzyme' in to_add: + return False + elif 'putative protein' in to_add: + return False + elif 'putative enzyme' in to_add: + return False + return True + + def add_to_dict(self, dict_hits, dict_key, to_add): + if not to_add: return + if dict_key not in dict_hits['link']: + dict_hits['link'][dict_key] = set() + if isinstance(to_add, str): + list_to_add = [to_add] + else: + list_to_add = to_add + for i in list_to_add: + if self.is_good_annotation(i.lower()): + if i not in dict_hits['link'][dict_key]: + i = i.strip() + if i: + dict_hits['link'][dict_key].add(i) + + def get_link_compiled_metadata(self, dict_hits, ref_file_path): + cursor = Metadata_SQLITE_Connector(ref_file_path) + for hit in dict_hits: + hit_dict = dict_hits[hit] + hit_info = cursor.fetch_metadata(hit) + for db in hit_info: + if db not in hit_dict['link']: hit_dict['link'][db] = set() + hit_dict['link'][db].update(hit_info[db]) + cursor.close_sql_connection() + + def get_common_links(self, string, res={}): + ec = find_ecs(string) + if ec: + self.add_to_dict(res, 'enzyme_ec', ec) + tc = find_tcdb(string) + if tc: + self.add_to_dict(res, 'tcdb', tc) + tigr = find_tigrfam(string) + if tigr: + self.add_to_dict(res, 'tigrfam', tigr) + ko = find_ko(string) + if ko: + self.add_to_dict(res, 'kegg_ko', ko) + pfam = find_pfam(string) + if pfam: + self.add_to_dict(res, 'pfam', pfam) + cog = find_cog(string) + if cog: + self.add_to_dict(res, 'cog', cog) + arcog = find_arcog(string) + if arcog: + self.add_to_dict(res, 'arcog', arcog) + go = find_go(string) + if go: + self.add_to_dict(res, 'go', cog) + return res + + def get_essential_genes(self): + essential_genes = f'{MANTIS_FOLDER}Resources{SPLITTER}essential_genes/essential_genes.txt' + if file_exists(essential_genes): + with open(essential_genes) as file: lines = file.readlines() + lines = [l.strip('\n') for l in lines] + return set(lines) + + def is_essential(self, dict_hits): + essential_genes = self.get_essential_genes() + if essential_genes: + for hit in dict_hits: + valid_ids = set() + if 'pfam' in dict_hits[hit]['link']: + valid_ids.update(dict_hits[hit]['link']['pfam']) + if 'tigrfam' in dict_hits[hit]['link']: + valid_ids.update(dict_hits[hit]['link']['tigrfam']) + valid_ids.update(find_tigrfam(dict_hits[hit]['link']['hit'])) + valid_ids.update(find_tigrfam(dict_hits[hit]['link']['accession'])) + valid_ids.update(find_pfam(dict_hits[hit]['link']['hit'])) + valid_ids.update(find_pfam(dict_hits[hit]['link']['accession'])) + if valid_ids.intersection(essential_genes): + self.add_to_dict(dict_hits[hit], 'is_essential_gene', 'True') + + def get_hit_links(self, dict_hits, ref_file): + if re.search('NOG[GT]', ref_file): + if 'NOGG' in ref_file: + taxon_id = 'NOGG' + else: + taxon_id = re.search('NOGT\d+', ref_file).group().replace('NOGT', '') + metadata_file = add_slash(self.mantis_paths['NOG'] + taxon_id) + 'metadata.tsv' + self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=metadata_file) + elif re.search('NCBI[GT]', ref_file): + if 'NCBIG' in ref_file: + taxon_id = 'NCBIG' + else: + taxon_id = re.search('NCBIT\d+', ref_file).group().replace('NCBIT', '') + metadata_file = add_slash(self.mantis_paths['NCBI'] + taxon_id) + 'metadata.tsv' + self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=metadata_file) + self.is_essential(dict_hits) + + elif ref_file == 'Pfam-A': + self.get_link_compiled_metadata(dict_hits=dict_hits, + ref_file_path=self.mantis_paths['pfam'] + 'metadata.tsv') + self.is_essential(dict_hits) + elif ref_file == 'kofam_merged': + self.get_link_compiled_metadata(dict_hits=dict_hits, + ref_file_path=self.mantis_paths['kofam'] + 'metadata.tsv') + elif ref_file == 'tcdb': + self.get_link_compiled_metadata(dict_hits=dict_hits, + ref_file_path=self.mantis_paths['tcdb'] + 'metadata.tsv') + else: + custom_ref_path = self.get_target_custom_ref_paths(ref_file, folder=True) + if custom_ref_path: + file_path = add_slash(custom_ref_path) + 'metadata.tsv' + if file_exists(file_path): + self.get_link_compiled_metadata(dict_hits=dict_hits, ref_file_path=file_path) + for hit in dict_hits: + self.get_common_links(hit, dict_hits[hit]) + if 'accession' in dict_hits[hit]['link']: self.get_common_links(dict_hits[hit]['link']['accession'], + dict_hits[hit]) + return dict_hits + + def remove_ids_text(self, sorted_keys, temp_link, target_removal): + for link_key in sorted_keys: + if isinstance(temp_link[link_key], str): temp_link[link_key] = [temp_link[link_key]] + for inner_l in temp_link[link_key]: + if target_removal == 'description': + for i in range(len(temp_link['description'])): + temp_link['description'][i] = temp_link['description'][i].replace(inner_l, '').replace('()', + '').strip() + elif target_removal == 'kegg_map_lineage': + if 'kegg_map' == link_key and 'kegg_map_lineage' in temp_link: + for i in range(len(temp_link['kegg_map_lineage'])): + temp_link['kegg_map_lineage'][i] = temp_link['kegg_map_lineage'][i].replace(inner_l, + '').replace( + '()', '').strip() + + def generate_interpreted_line(self, query, ref_file, link, evalue, bitscore, direction, query_len, query_start, + query_end, + ref_start, ref_end, ref_len): + temp_link = dict(link) + hit = temp_link.pop('hit') + hit_accession = '-' + if 'accession' in temp_link: hit_accession = temp_link.pop('accession') + row_start = [query, ref_file, hit, hit_accession, evalue, bitscore, direction, query_len, query_start, + query_end, + ref_start, ref_end, ref_len, '|'] + res = [] + sorted_keys = sorted(temp_link.keys()) + if 'enzyme_ec' in sorted_keys: + sorted_keys.remove('enzyme_ec') + sorted_keys.insert(0, 'enzyme_ec') + # so that description always comes in the end + if 'kegg_map_lineage' in sorted_keys: + sorted_keys.remove('kegg_map_lineage') + self.remove_ids_text(sorted_keys, temp_link, target_removal='kegg_map_lineage') + sorted_keys.append('kegg_map_lineage') + if 'description' in sorted_keys: + sorted_keys.remove('description') + # self.remove_ids_text(sorted_keys, temp_link, target_removal='description') + sorted_keys.append('description') + for link_key in sorted_keys: + if isinstance(temp_link[link_key], str): temp_link[link_key] = [temp_link[link_key]] + for inner_l in temp_link[link_key]: + res.append(link_key + ':' + inner_l) + res = sorted(res) + res = row_start + res + return res + + def read_and_interpret_output_annotation(self, output_annotation_tsv): + c = 0 + links_to_get = {} + lines_info = {} + with open(output_annotation_tsv) as file: + line = file.readline() + line = file.readline() + while line: + line = line.strip('\n').split('\t') + query, ref_file, ref_hit, ref_hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line + if self.nog_db == 'hmm' and 'NOG' in ref_file: + ref_hit = ref_hit.split('.')[0] + if ref_file not in links_to_get: links_to_get[ref_file] = {} + if ref_hit not in links_to_get[ref_file]: links_to_get[ref_file][ref_hit] = {'link': {'hit': ref_hit}, + 'lines': []} + if ref_hit_accession != '-': links_to_get[ref_file][ref_hit]['link']['accession'] = ref_hit_accession + links_to_get[ref_file][ref_hit]['lines'].append(c) + lines_info[c] = {'query': query, 'evalue': evalue, 'bitscore': bitscore, 'direction': direction, + 'query_len': query_len, + 'query_start': query_start, 'query_end': query_end, 'ref_start': ref_start, + 'ref_end': ref_end, 'ref_len': ref_len} + c += 1 + line = file.readline() + res = {} + for ref_file in links_to_get: + ref_file_links = self.get_hit_links(links_to_get[ref_file], ref_file) + for ref_hit in ref_file_links: + for line in ref_file_links[ref_hit]['lines']: + res[line] = self.generate_interpreted_line(query=lines_info[line]['query'], + ref_file=ref_file, + link=ref_file_links[ref_hit]['link'], + evalue=lines_info[line]['evalue'], + bitscore=lines_info[line]['bitscore'], + direction=lines_info[line]['direction'], + query_len=lines_info[line]['query_len'], + query_start=lines_info[line]['query_start'], + query_end=lines_info[line]['query_end'], + ref_start=lines_info[line]['ref_start'], + ref_end=lines_info[line]['ref_end'], + ref_len=lines_info[line]['ref_len'], + ) + return res + + def generate_gff_line_integrated(self, integrated_line): + # https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + # verified with http://genometools.org/cgi-bin/gff3validator.cgi + line_split = integrated_line.index('|') + line_data, annotation_data = integrated_line[:line_split], integrated_line[line_split + 1:] + query, ref_file, hit, hit_accession, evalue, bitscore, direction, query_len, query_start, query_end, ref_start, ref_end, ref_len = line_data + # attributes of gff line: + + if hit_accession != '-': + attributes = f'Name={hit};Target={hit} {ref_start} {ref_end};Alias={hit_accession}' + else: + attributes = f'Name={hit};Target={hit} {ref_start} {ref_end}' + notes = f'Note=ref_file:{ref_file},ref_len:{ref_len}' + dbxref = [] + ontology_terms = [] + descriptions = [] + + for i in annotation_data: + if not i.startswith('go:'): + dbxref.append(i) + elif i.startswith('description:'): + descriptions.append(i) + else: + ontology_terms.append(i) + if descriptions: + notes += ',' + ','.join(descriptions) + all_annotations = None + if dbxref and ontology_terms: + all_annotations = 'Dbxref=' + ','.join(dbxref) + ';' + 'Ontology_term=' + ','.join(ontology_terms) + elif dbxref and not ontology_terms: + all_annotations = 'Dbxref=' + ','.join(dbxref) + elif not dbxref and ontology_terms: + all_annotations = 'Ontology_term=' + ','.join(ontology_terms) + gff_line = '\t'.join([query, 'Mantis', 'CDS', query_start, query_end, evalue, direction, '0']) + '\t' + if all_annotations: + gff_line += ';'.join([attributes, notes, all_annotations]) + else: + gff_line += ';'.join([attributes, notes]) + sequence_region_line = f'##sequence-region {query} 1 {query_len}' + return query, sequence_region_line, gff_line + + def generate_integrated_output(self, output_annotation_tsv, interpreted_annotation_tsv): + first_line = ['Query', + 'Ref_file', + 'Ref_hit', + 'Ref_hit_accession', + 'evalue', + 'bitscore', + 'Direction', + 'Query_length', + 'Query_hit_start', + 'Query_hit_end', + 'Ref_hit_start', + 'Ref_hit_end', + 'Ref_length', + '|', + 'Links'] + first_line = '\t'.join(first_line) + output_file = open(interpreted_annotation_tsv, 'w+') + output_file.write(first_line + '\n') + + gff_file = None + gff_file_path = interpreted_annotation_tsv.replace('.tsv', '.gff') + gff_dict = {} + if self.output_gff: + gff_file = open(gff_file_path, 'w+') + gff_file.write('##gff-version 3' + '\n') + + # generating output + output_annotation = self.read_and_interpret_output_annotation(output_annotation_tsv) + for line in range(len(output_annotation)): + current_output_line = output_annotation[line] + if current_output_line: + out_line = '\t'.join(current_output_line) + output_file.write(out_line + '\n') + + if gff_file: + seq_id, sequence_region_line, gff_line = self.generate_gff_line_integrated(current_output_line) + if seq_id not in gff_dict: + gff_dict[seq_id] = {'seq_region': sequence_region_line, 'lines': []} + gff_dict[seq_id]['lines'].append(gff_line) + # writing gff + for seq_id in gff_dict: + gff_file.write(gff_dict[seq_id]['seq_region'] + '\n') + for seq_id in gff_dict: + for annot_line in gff_dict[seq_id]['lines']: + gff_file.write(annot_line + '\n') + + if gff_file: + gff_file.close() + output_file.close() + + ######for KEGG module matrix##### + def generate_module_col(self, tree_modules): + sorted_keys = [ + 'Carbohydrate metabolism', + 'Energy metabolism', + 'Lipid metabolism', + 'Nucleotide metabolism', + 'Amino acid metabolism', + 'Glycan metabolism', + 'Metabolism of cofactors and vitamins', + 'Biosynthesis of terpenoids and polyketides', + 'Biosynthesis of other secondary metabolites', + 'Xenobiotics biodegradation', + ] + res = [] + for sk in sorted_keys: + sorted_sub_paths = sorted(tree_modules[sk].keys()) + for ssp in sorted_sub_paths: + sorted_modules = sorted(tree_modules[sk][ssp]) + for sm in sorted_modules: + module_name, module_paths = tree_modules[sk][ssp][sm] + res.append([sm, module_name]) + return res + + def get_best_sample_module_path(self, sample_kos, all_paths): + best_score = 0 + best_path = set() + + for current_path in all_paths: + available_kos = current_path.intersection(sample_kos) + current_score = len(available_kos) / len(current_path) + + if current_score > best_score: + best_score = current_score + best_path = current_path + + if best_score: + available_kos = best_path.intersection(sample_kos) + missing_kos = best_path.difference(available_kos) + available = ','.join(available_kos) + missing = ','.join(missing_kos) + else: + available = 'NA' + missing = 'NA' + return best_score, available, missing + + def generate_sample_col_verbose(self, sample_kos, tree_modules): + sorted_keys = [ + 'Carbohydrate metabolism', + 'Energy metabolism', + 'Lipid metabolism', + 'Nucleotide metabolism', + 'Amino acid metabolism', + 'Glycan metabolism', + 'Metabolism of cofactors and vitamins', + 'Biosynthesis of terpenoids and polyketides', + 'Biosynthesis of other secondary metabolites', + 'Xenobiotics biodegradation', + ] + res = {} + for sk in sorted_keys: + sorted_sub_paths = sorted(tree_modules[sk].keys()) + for ssp in sorted_sub_paths: + sorted_modules = sorted(tree_modules[sk][ssp]) + for sm in sorted_modules: + module_name, module_paths = tree_modules[sk][ssp][sm] + module_perc, available, missing = self.get_best_sample_module_path(sample_kos, module_paths) + module_perc = str(round(module_perc, 3)) + res[sm] = [module_perc, available, missing] + return res + + def generate_sample_col_non_verbose(self, sample_kos, tree_modules): + sorted_keys = [ + 'Carbohydrate metabolism', + 'Energy metabolism', + 'Lipid metabolism', + 'Nucleotide metabolism', + 'Amino acid metabolism', + 'Glycan metabolism', + 'Metabolism of cofactors and vitamins', + 'Biosynthesis of terpenoids and polyketides', + 'Biosynthesis of other secondary metabolites', + 'Xenobiotics biodegradation', + ] + res = {} + for sk in sorted_keys: + sorted_sub_paths = sorted(tree_modules[sk].keys()) + for ssp in sorted_sub_paths: + sorted_modules = sorted(tree_modules[sk][ssp]) + for sm in sorted_modules: + module_name, module_paths = tree_modules[sk][ssp][sm] + module_perc, available, missing = self.get_best_sample_module_path(sample_kos, module_paths) + module_perc = str(round(module_perc, 3)) + res[sm] = [module_perc] + + return res + + def generate_sample_col(self, sample_kos, tree_modules): + if self.verbose_kegg_matrix: + return self.generate_sample_col_verbose(sample_kos, tree_modules) + else: + return self.generate_sample_col_non_verbose(sample_kos, tree_modules) + + def get_sample_kos(self, annotation_file): + res = set() + sample_name = annotation_file.split(SPLITTER)[-2] + with open(annotation_file) as file: + line = file.readline() + while line: + line = line.strip('\n') + line = line.split('\t') + kegg_kos = [i.replace('kegg_ko:', '') for i in line if 'kegg_ko:' in i] + res.update(kegg_kos) + line = file.readline() + return {'sample': sample_name, 'kegg_ko': res} + + def get_ids_consensus(self, input_file, wanted_db): + res = {} + with open(input_file) as file: + file.readline() + for line in file: + line = line.strip('\n') + line = line.split('\t') + seq_id = line[0] + if seq_id not in res: res[seq_id] = set() + separator = line.index('|') + annotations = line[separator + 1:] + for db_annot in annotations: + db = db_annot.split(':')[0] + if wanted_db == db: + # to avoid bad splitting when dealing with descriptions + annot = db_annot[len(db) + 1:] + res[seq_id].add(annot) + return res + + def export_sample_kos(self, samples_info): + out_file = self.output_folder + 'sample_kos.tsv' + with open(out_file, 'w+') as file: + firstline = 'Sample\tKOs\n' + file.write(firstline) + for sample_path in samples_info: + sample_name = sample_path.split(SPLITTER)[-2] + seq_kos = self.get_ids_consensus(sample_path, wanted_db='kegg_ko') + for seq in seq_kos: + seq_name = f'{sample_name}@{seq}' + for ko in seq_kos[seq]: + line = f'{seq_name}\t{ko}\n' + file.write(line) + + def generate_matrix(self): + sample_paths = [] + figure_info = {} + out_file = self.output_folder + 'kegg_modules.tsv' + for i in self.fastas_to_annotate: + file_path, sample_output_path, organism_details, genetic_code, count_seqs_original_file, count_residues_original_file = i + sample_paths.append(sample_output_path + 'consensus_annotation.tsv') + + samples_info = [self.get_sample_kos(i) for i in sample_paths] + tree = load_metrics(add_slash(self.mantis_paths['resources'] + 'KEGG') + 'modules.pickle') + self.export_sample_kos(sample_paths) + if tree: + module_col = self.generate_module_col(tree) + with open(out_file, 'w+') as file: + if self.verbose_kegg_matrix: + top_line = [f'Completeness_score_{s["sample"]}\tKOs_{s["sample"]}\tMissing_KOs_{s["sample"]}' for s + in samples_info] + top_line.insert(0, 'Module_ID\tModule_name') + + else: + top_line = [s['sample'] for s in samples_info] + top_line.insert(0, 'Module_ID') + top_line = '\t'.join(top_line) + '\n' + file.write(top_line) + samples_perc = [[s['sample'], self.generate_sample_col(s['kegg_ko'], tree)] for s in samples_info] + for i in range(len(module_col)): + module_id, module_name = module_col[i] + if self.verbose_kegg_matrix: + module_key = f'{module_id}\t{module_name}' + else: + module_key = module_id + module_line = [module_key] + for sample_names, s in samples_perc: + module_line.extend(s[module_id]) + if sample_names not in figure_info: figure_info[sample_names] = {} + figure_info[sample_names][module_id] = s[module_id][0] + + if i == len(module_col) - 1: + module_line = '\t'.join(module_line) + else: + module_line = '\t'.join(module_line) + '\n' + file.write(module_line) + else: + print('KEGG modules pickle is not present, so Mantis cannot create the KEGG matrix', flush=True, + file=self.redirect_verbose) + + +if __name__ == '__main__': + m = Metadata() diff --git a/mantis/metadata_sqlite_connector.py b/mantis/metadata_sqlite_connector.py new file mode 100644 index 0000000..9e728b9 --- /dev/null +++ b/mantis/metadata_sqlite_connector.py @@ -0,0 +1,197 @@ +try: + from mantis.utils import * +except: + from utils import * + + +class Metadata_SQLITE_Connector(): + def __init__(self, metadata_file): + self.metadata_file_tsv = metadata_file + self.db_file = metadata_file.replace('.tsv', '') + '.db' + if not file_exists(self.metadata_file_tsv): + print(f'Metadata file missing: {metadata_file}') + return + self.insert_step = 5000 + self.info_splitter = '##' + if not file_exists(self.db_file): + print('Creating SQL database', self.db_file) + self.create_sql_table() + self.start_sqlite_cursor() + + def start_sqlite_cursor(self): + self.sqlite_connection = sqlite3.connect(self.db_file) + self.cursor = self.sqlite_connection.cursor() + self.get_db_headers() + + def commit_and_close_sqlite_cursor(self): + self.sqlite_connection.commit() + self.sqlite_connection.close() + + def close_sql_connection(self): + try: + self.sqlite_connection.close() + except: + return + + def check_all_tables(self): + self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") + all_tables = self.cursor.fetchall() + print(all_tables) + + def convert_row_to_sql(self, ref, row_info): + res = [ref] + for db in self.db_headers: + if db in row_info: + res.append(self.info_splitter.join(row_info[db])) + else: + res.append(None) + return res + + def generate_insert_command(self): + headers_str = ', '.join(self.db_headers) + headers_str = f'(REF, {headers_str})'.upper() + n_values = ['?' for i in range(len(self.db_headers) + 1)] + n_values_str = ', '.join(n_values) + n_values_str = f'({n_values_str})' + insert_command = f'INSERT INTO METADATA {headers_str} values {n_values_str}' + return insert_command + + def generate_fetch_command(self, ref_id): + headers_str = ', '.join(self.db_headers) + headers_str = f'REF, {headers_str}'.upper() + fetch_command = f'SELECT {headers_str} FROM METADATA WHERE REF="{ref_id}"' + return fetch_command + + def yield_metadata(self): + with open(self.metadata_file_tsv, 'r') as file: + for line in file: + row_info = {} + line = line.strip('\n') + if line: + line = line.split('\t') + current_ref = line[0] + if '|' in line: line.remove('|') + annotations = line[1:] + for link in annotations: + if link: + temp_link = link.split(':') + link_type = temp_link[0] + link_text = ':'.join(temp_link[1:]) + link_text = link_text.strip() + if link_type not in row_info: row_info[link_type] = set() + row_info[link_type].add(link_text) + if link_type == 'description' and link_text == 'NA': + link_text = None + if link_text and link_type == 'description': + get_common_links_metadata(link_text, res=row_info) + yield self.convert_row_to_sql(current_ref, row_info) + + def get_db_headers(self): + res = set() + try: + schema_command = f'PRAGMA table_info(METADATA);' + res_fetch = self.cursor.execute(schema_command).fetchall() + res_fetch.pop(0) + for line in res_fetch: + link_type = line[1] + res.add(link_type) + except: + with open(self.metadata_file_tsv, 'r') as file: + for line in file: + line = line.strip('\n') + line = line.split('\t') + annotations = line[2:] + for link in annotations: + if link: + temp_link = link.split(':') + link_type = temp_link[0] + res.add(link_type) + self.db_headers = sorted(list(res)) + + def create_sql_table(self): + self.get_db_headers() + if os.path.exists(self.db_file): + os.remove(self.db_file) + self.start_sqlite_cursor() + + create_table_command = f'CREATE TABLE METADATA (REF TEXT, ' + for header in self.db_headers: + create_table_command += f'{header.upper()} TEXT, ' + create_table_command = create_table_command.rstrip(', ') + create_table_command += ')' + self.cursor.execute(create_table_command) + self.sqlite_connection.commit() + create_index_command = f'CREATE INDEX REF_IDX ON METADATA (REF)' + self.cursor.execute(create_index_command) + + self.store_metadata() + + self.commit_and_close_sqlite_cursor() + + def generate_inserts(self, metadata_yielder): + step = self.insert_step + temp = [] + for i in metadata_yielder: + temp.append(i) + if len(temp) >= step: + yield temp + temp = [] + yield temp + + def store_metadata(self): + insert_command = self.generate_insert_command() + metadata_yielder = self.yield_metadata() + if metadata_yielder: + generator_insert = self.generate_inserts(metadata_yielder) + for table_chunk in generator_insert: + if table_chunk: + self.cursor.executemany(insert_command, table_chunk) + self.sqlite_connection.commit() + + def convert_sql_to_dict(self, sql_result): + sql_result = sql_result[1:] + res = {} + for i in range(len(self.db_headers)): + db = self.db_headers[i].lower() + db_res = sql_result[i] + if db_res: + db_res = db_res.split(self.info_splitter) + if db not in res: res[db] = set() + res[db].update(db_res) + return res + + def fetch_metadata(self, ref_id): + if not file_exists(self.db_file): + return {} + fetch_command = self.generate_fetch_command(ref_id) + try: + res_fetch = self.cursor.execute(fetch_command).fetchone() + res = self.convert_sql_to_dict(res_fetch) + return res + except: + print(f'Failed retrieving {ref_id} in {self.db_file}') + return {} + + def test_database(self): + res = set() + if not file_exists(self.metadata_file_tsv): return res + with open(self.metadata_file_tsv) as file: + for line in file: + ref = line.split('\t')[0] + try: + ref_info = self.fetch_metadata(ref) + except: + print(f'Failed retrieving {ref} in {self.db_file}') + res.add(ref) + return res + + +if __name__ == '__main__': + import time + + metadata_connector = Metadata_SQLITE_Connector('/media/HDD/data/mantis_references/tcdb/metadata.tsv') + metadata_connector.test_database() + start = time.time() + for i in range(10000): + res = metadata_connector.fetch_metadata('P0A2U6') + print(time.time() - start) diff --git a/mantis/multiprocessing.py b/mantis/multiprocessing.py new file mode 100644 index 0000000..63c34ee --- /dev/null +++ b/mantis/multiprocessing.py @@ -0,0 +1,661 @@ +try: + from mantis.assembler import * + from mantis.homology_processor import Homology_processor + from mantis.metadata import Metadata + from mantis.consensus import Consensus + +except: + from assembler import * + from homology_processor import Homology_processor + from metadata import Metadata + from consensus import Consensus + + +class Multiprocessing(Assembler, Homology_processor, Metadata, Consensus): + + def prepare_queue_split_sample(self, protein_seqs, seq_chunks, chunk_dir): + c = 0 + for chunk in seq_chunks: + self.queue.append([self.chunk_dict_generator(protein_seqs, chunk), c, chunk_dir]) + c += 1 + return c + + def worker_split_sample(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + chunk_seqs, chunk_number, chunk_dir = record + self.generate_fasta_chunks(chunk_seqs, chunk_number, chunk_dir) + + def generate_fasta_chunks(self, protein_seqs, chunk_number, chunk_dir): + process_number = str(current_process()._name).split('-')[-1] + chunk_name = f'p{process_number}_c{chunk_number}' + current_chunk_dir = add_slash(chunk_dir + chunk_name) + chunk_path = f'{current_chunk_dir}{chunk_name}.faa' + Path(current_chunk_dir).mkdir(parents=True, exist_ok=True) + with open(chunk_path, 'w+') as file: + for seq_id in protein_seqs: + chunks = [protein_seqs[seq_id][x:x + 60] for x in range(0, len(protein_seqs[seq_id]), 60)] + chunk_str = '\n'.join(i for i in chunks) + file.write(f'>{seq_id}\n{chunk_str}\n') + + def set_chunks_to_annotate(self): + for file_path, output_path, organism_lineage, genetic_code, count_seqs_original_file, count_residues_original_file in self.fastas_to_annotate: + chunk_dir = add_slash(output_path + 'fasta_chunks') + all_chunks = os.listdir(chunk_dir) + for chunk in all_chunks: + current_chunk_dir = add_slash(chunk_dir + chunk) + chunk_name = chunk + chunk_path = current_chunk_dir + chunk_name + '.faa' + count_seqs_chunk = get_seqs_count(chunk_path) + self.chunks_to_annotate.append( + [chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, + count_seqs_original_file, count_residues_original_file, output_path]) + if output_path not in self.chunks_to_fasta: self.chunks_to_fasta[output_path] = [] + self.chunks_to_fasta[output_path].append(current_chunk_dir) + + def split_sample(self, minimum_worker_load=20000, time_limit=300, load_balancing_limit=200000): + ''' + minimum_worker_load we assume a minumum amount of workload for the workers, since generating results in process spawning overhead + #if we have a chunk size of 5k, each worker will produce minimum_worker_load/5000 + it will use 5 processes here to split faster + + On generating chunks: + HMMER performance is determined by several factors, one of which is the length of the query sequence. + A naive approach but efficient approach to generating chunks would be to split chunks by a moving window + A better approach would be to load balance the chunks by sequence length . This will effectively create more even chunks. This is slower and more ram consuming though since we pop each sequence at a time and also saving all seq keys in memory + + Just a comparison on the chunk size (bytes) of 1307 chunks between the non balanced and balanced generator (metagenomic sample with 1306215 sequences and 325.1 MB) + non_balanced balanced + sum 209394320 209421151 + average 160332.557427259 160230.413925019 + min 118979 159526 + max 197024 163953 + stdev 11069.918226454 602.515771601041 + There's obviously some variation, even in the balanced generator, but it's way lower (18 times more deviation) + Load balancing affects the file size by very little too (around 0.013%) + + After some testing I've found that load balancing doesn't affect the total speed when dealing with metagenomic samples. + With so many chunks, the theoretical speed gained by balancing the chunks doesn't come into play since we never get idle processes. + This load balancing will only be useful for smaller sample sizes + Ordering around 200k sequences length should take no longer than 10 seconds + + A inconvenient outcome of using the is the fact that it uses more RAM (since it's not a generator) and requires more time. This is hardly perceptible with smaller samples though. + + ''' + print_cyan('Splitting samples into chunks!', flush=True, file=self.redirect_verbose) + worker_count = 1 + n_chunks = 0 + total_n_seqs = 0 + for file_path, output_path, organism_lineage, genetic_code, count_seqs_original_file, count_residues_original_file in self.fastas_to_annotate: + total_n_seqs += get_seqs_count(file_path) + for file_path, output_path, organism_lineage, genetic_code, count_seqs_original_file, count_residues_original_file in self.fastas_to_annotate: + protein_seqs = read_protein_fasta(file_path) + chunk_dir = add_slash(f'{output_path}fasta_chunks') + if not file_exists(chunk_dir): + Path(chunk_dir).mkdir(parents=True, exist_ok=True) + current_worker_count = estimate_number_workers_split_sample(minimum_worker_load, len(protein_seqs)) + # here we use the total amount of sequences to avoid generating too many small files for long runs + chunk_size = estimate_chunk_size(total_n_seqs=total_n_seqs, + annotation_workers=self.estimate_number_workers_annotation( + split_sample=True), + chunk_size=self.chunk_size + ) + if current_worker_count > worker_count: worker_count = current_worker_count + if len(protein_seqs) < load_balancing_limit: + proteins_seqs_keys_len = {i: len(protein_seqs[i]) for i in protein_seqs} + list_ordered = sorted(proteins_seqs_keys_len, key=proteins_seqs_keys_len.__getitem__) + seq_chunks = chunk_generator_load_balanced(list_ordered, chunk_size, time_limit=time_limit) + else: + proteins_seqs_keys = list(protein_seqs.keys()) + seq_chunks = chunk_generator(proteins_seqs_keys, chunk_size) + current_chunks = self.prepare_queue_split_sample(protein_seqs, seq_chunks, chunk_dir) + n_chunks += current_chunks + stdout_file = open(f'{output_path}Mantis.out', 'a+') + print( + f'The current sample: {file_path} will be split into {current_chunks} chunks (up to {chunk_size} sequences each), which will be stored at:\n{chunk_dir}', + flush=True, file=stdout_file) + stdout_file.close() + if worker_count < ENVIRONMENT_CORES * WORKER_PER_CORE: + if len(self.fastas_to_annotate) < ENVIRONMENT_CORES * WORKER_PER_CORE: + worker_count = len(self.fastas_to_annotate) + else: + worker_count = ENVIRONMENT_CORES * WORKER_PER_CORE + print_cyan(f'Samples will be split into {n_chunks} chunks with {worker_count} workers', + flush=True, file=self.redirect_verbose) + self.processes_handler(self.worker_split_sample, worker_count) + + ####To run HMMER + + def compile_annotation_job(self, ref_path, target_path, output_folder, count_seqs_chunk, count_seqs_original_file, + output_initials=''): + if ref_path.endswith('.hmm'): + return self.compile_annotation_job_hmmer(ref_path, target_path, output_folder, count_seqs_chunk, + count_seqs_original_file, output_initials, ) + elif ref_path.endswith('.dmnd'): + return self.compile_annotation_job_diamond(ref_path, target_path, output_folder, output_initials) + + def compile_annotation_job_hmmer(self, hmm_path, target_path, output_folder, count_seqs_chunk, + count_seqs_original_file, output_initials=''): + hmm = get_path_level(hmm_path) + hmm = hmm.split('.')[0] + # what is more efficient? hmmsearch or hmmscan? hmmsearch: https://cryptogenomicon.org/2011/05/27/hmmscan-vs-hmmsearch-speed-the-numerology/ + command = 'hmmsearch ' + console_stdout = f'{output_folder}output_hmmer{SPLITTER}{output_initials}{hmm}.out' + if self.keep_files: + command += f'-o {console_stdout}' + else: + command += '-o /dev/null' + # summarized output + # command += ' --tblout ' + output_folder + 'tblout'+SPLITTER+output_initials +hmm+'.tblout' + # output with coordinates of hits + domtblout_path = f'{output_folder}searchout{SPLITTER}{output_initials}{hmm}.domtblout' + command += f' --domtblout {domtblout_path}' + # hmm requires a master thread, so we deduct it from the number of threads + command += f' --cpu {self.hmmer_threads}' + # since we split the original fasta into chunks, hmmer might remove some hits ( I correct for this further down the line, but not in hmmer) + # even when using the default evalue threshold, there isn't much of a loss + # we use domE because we accept multiple hits with these two algorithms + if self.domain_algorithm in ['dfs', 'heuristic']: + threshold_type = '--domE' + # whereas bpo we only accept one + else: + threshold_type = '-E' + if self.evalue_threshold == 'dynamic': + evalue = self.recalculate_evalue(self.default_evalue_threshold, count_seqs_original_file, count_seqs_chunk) + elif not self.evalue_threshold: + evalue = self.recalculate_evalue(self.default_evalue_threshold, count_seqs_original_file, count_seqs_chunk) + else: + evalue = self.recalculate_evalue(self.evalue_threshold, count_seqs_original_file, count_seqs_chunk) + + evalue *= 10 + command += f' {threshold_type} {evalue}' + command += f' --notextw {hmm_path} {target_path}' + return command, domtblout_path + + def compile_annotation_job_diamond(self, ref_path, target_path, output_folder, output_initials=''): + ref = get_path_level(ref_path) + ref = ref.split('.')[0] + # dbsize is set so we can scale it to sample size + command = f'diamond blastp --quiet --dbsize {self.diamond_db_size}' # --ultra-sensitive '# ' + dmndout_path = f'{output_folder}searchout{SPLITTER}{output_initials}{ref}.dmndout' + command += f' --out {dmndout_path} --outfmt 6 qseqid qlen sseqid slen qstart qend sstart send evalue bitscore' + command += f' --threads {self.hmmer_threads}' + # here we dont multiply by 10 because we will be scalling to sample size anyway, so the real e-value will need to be much lower anyway + if self.evalue_threshold == 'dynamic': + command += f' --evalue {self.default_evalue_threshold}' + elif not self.evalue_threshold: + command += f' --evalue {self.default_evalue_threshold}' + else: + command += f' --evalue {self.evalue_threshold}' + command += f' -d {ref_path} -q {target_path}' + return command, dmndout_path + + #####For the general HMMS + + def add_number_hmms_taxa(self, db): + res = 0 + if self.mantis_paths[db][0:2] != 'NA': + tax_hmms = 0 + for file_path, output_path, organism_lineage, genetic_code, count_seqs_original_file, count_residues_original_file in self.fastas_to_annotate: + organism_lineage_temp = list(organism_lineage) + if organism_lineage_temp: + current_taxon = organism_lineage_temp.pop(-1) + hmm_path = self.get_taxon_ref_path(current_taxon, db=db) + # to skip taxons without an hmm + while not hmm_path and organism_lineage_temp: + current_taxon = organism_lineage_temp.pop(-1) + hmm_path = self.get_taxon_ref_path(current_taxon, db=db) + if hmm_path: + chunks_path = get_chunks_path(hmm_path) + tax_hmms += len(chunks_path) + res = int(tax_hmms / len(self.fastas_to_annotate)) + return res + + def calculate_total_refs_annotation(self): + # some lineage taxons (since we might fully annotate a fasta before the top taxon level) might be unnecessary but this should provide a good estimate + n_refs = 0 + ref_list = self.compile_refs_list() + # this will take into account hmms that are split into chunks + for ref in ref_list: + n_refs += len(get_chunks_path(ref)) + if self.mantis_paths['NCBI'][0:2] != 'NA': + n_refs += len(get_chunks_path(add_slash(self.mantis_paths['NCBI'] + 'NCBIG'))) + if self.mantis_paths['NOG'][0:2] != 'NA': + n_refs += len(get_chunks_path(add_slash(self.mantis_paths['NOG'] + 'NOGG'))) + n_refs += self.add_number_hmms_taxa('NOG') + n_refs += self.add_number_hmms_taxa('NCBI') + self.total_refs_annotation = n_refs + return n_refs + + def estimate_number_workers_annotation(self, split_sample=False): + if not hasattr(self, 'total_refs_annotation'): + n_refs = self.calculate_total_refs_annotation() + else: + n_refs = self.total_refs_annotation + return estimate_number_workers_annotation(n_chunks=len(self.chunks_to_annotate), + n_refs=n_refs, + default_workers=self.default_workers, + user_cores=self.user_cores, + split_sample=split_sample, + ) + + def run_homology_search(self): + worker_count = self.estimate_number_workers_annotation() + self.prepare_queue_homology_search() + print_cyan(f'Running homology search with {worker_count} workers.', flush=True, file=self.redirect_verbose) + if worker_count < 1: + kill_switch(BadNumberWorkers, 'run_hmmer') + self.processes_handler(self.worker_annotation, worker_count) + + def run_annotation(self, hs_command, stdout_path, master_pid, output_file=None): + stdout_file = open(stdout_path, 'a+') + print(f'Running homology search command:\n{hs_command}', flush=True, file=stdout_file) + start_time = time() + # if we dont want to manage memory or for diamond + if self.skip_managed_memory or hs_command.startswith('blastp'): + run_command(hs_command, master_pid=None) + else: + run_command(hs_command, master_pid=master_pid, wanted_child='hmmsearch', user_memory=self.user_memory) + print(f'Finished running homology search ({round(time() - start_time, 3)} seconds):\n{hs_command}', flush=True, + file=stdout_file) + stdout_file.close() + if output_file: + move_file(output_file, f'{output_file}_finished') + + def worker_annotation(self, queue, master_pid): + ''' + this is a bit tricky to understand but it works the following way: + There is a queue which is stacked with jobs (i.e. hmmer commands to execute), each process/worker takes one job and runs it + when the worker finishes the job it just takes another job from the queue + if the worker received a record that is None (which is a sentinel, see processes_handler in Assembler.py) then the while cycle breaks and the worker is done + + + Now, onto the different types of records/jobs + -if a record is None, the worker stops + -if a record is not None the worker has to work (^.^): + -if the record_type is General we just run the usual hmmer command (e.g. annotate against a Pfam chunk) + -if the record_type is NOGT or NCBIT we just run the usual hmmer command (e.g. annotate against a Pfam chunk) + -if the record_type is NOGT/G_checkpoint or NCBIT/G_checkpoint then we dont execute hmmer, we do something else: + you might have noticed that when we run with TSHMMs we dont run the whole sample against the whole TSHMM, instead, + in each iteration we need to check which sequences still need to be annotated, so thats why we have the functions + that reads and generates fastas, as well as the read_domtblout - these are basically processing results during the + queue (unlike the others where results are processed in the end). + Another important thing about these checkpoints is that they also check whether a certain sample has finished annotated + with the taxon, you might be wondering, "is this just a normal execution?", and the answer is no. + Unfortunately some taxons have too many HMMs to run them all at once, so we had them split into chunks, which then forces + us to make sure all the HMM chunks have finished (so we can generate the fasta for the next taxon). And so with these checkpoints, + if the hmmer execution finished running we proceed to the next taxon, otherwise we add another checkpoint to the queue (see add_to_queue_lineage_annotation).... + these checkpoints will keep being added and processed by the worker until all the other workers have finished annotating all the required HMM chunks! + ''' + while True: + record = queue.pop(0) + if record is None: break + record_type = record[0] + if record_type == 'General': + _, hs_command, stdout_path = record + self.run_annotation(hs_command=hs_command, stdout_path=stdout_path, master_pid=master_pid) + # for taxon runs + elif record_type in ['NOGT', 'NCBIT']: + _, hs_command, stdout_path, output_file = record + self.run_annotation(hs_command=hs_command, stdout_path=stdout_path, master_pid=master_pid, + output_file=output_file) + elif record_type in ['NOGG_checkpoint', 'NCBIG_checkpoint']: + _, current_chunk_dir, fasta_path, count_seqs_original_file, chunks_n = record + target_ref = record_type.split('_')[0] + target_db = target_ref[0:-1] + if self.taxon_annotation_finished(target_ref, current_chunk_dir, chunks_n): + remove_temp_fasta(fasta_path, target_db) + else: + self.queue.insert(0, record) + + elif record_type in ['NOGT_checkpoint', 'NCBIT_checkpoint']: + _, current_chunk_dir, fasta_path, taxon_id, organism_lineage, original_lineage, count_seqs_original_file, count_residues_original_file, chunks_n, stdout_path = record + target_ref = record_type.split('_')[0] + target_db = target_ref[0:-1] + + if self.taxon_annotation_finished(target_ref + str(taxon_id), current_chunk_dir, chunks_n): + protein_sequences = read_protein_fasta(fasta_path) + count_seqs_chunk = len(protein_sequences) + searchout_path = add_slash(f'{current_chunk_dir}searchout') + taxon_searchouts = self.get_taxon_chunks(taxon_id, searchout_path, target_ref) + annotated_queries = set() + for dp in taxon_searchouts: + _, hit_counter = self.read_searchout(output_path=searchout_path + dp, + count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + count_residues_original_file=count_residues_original_file, + get_output=False) + annotated_queries.update(hit_counter) + # in each iteration we only annotate the missing sequences + self.remove_annotated_queries(protein_sequences, annotated_queries) + if protein_sequences: + fasta_path = self.generate_temp_fasta(protein_sequences, current_chunk_dir, target_db) + self.add_to_queue_lineage_annotation(fasta_path=fasta_path, current_chunk_dir=current_chunk_dir, + organism_lineage=list(organism_lineage), + original_lineage=list(original_lineage), + count_seqs_original_file=count_seqs_original_file, + count_residues_original_file=count_residues_original_file, + stdout_path=stdout_path, + dbs_to_use=[target_db]) + # if annotations havent finished, we add the checker back into the queue + else: + self.queue.insert(0, record) + + def prepare_queue_homology_search(self): + refs_list = self.compile_refs_list() + chunked_refs_list = [] + for ref_path in refs_list: + chunked_refs_list.extend(get_chunks_path(ref_path)) + chunked_refs_list = self.order_by_size_descending(chunked_refs_list) + # taxonomy processes + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + self.create_chunk_output_dirs(current_chunk_dir) + self.add_to_queue_lineage_annotation(fasta_path=chunk_path, current_chunk_dir=current_chunk_dir, + organism_lineage=list(organism_lineage), + original_lineage=list(organism_lineage), + count_seqs_original_file=count_seqs_original_file, + count_residues_original_file=count_residues_original_file, + stdout_path=f'{output_path}Mantis.out') + # general processes + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + for ref_path in chunked_refs_list: + command, output_file = self.compile_annotation_job(ref_path, target_path=chunk_path, + output_folder=current_chunk_dir, + count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + ) + self.queue.append(['General', command, f'{output_path}Mantis.out']) + + ####For the lineage HMMs + + def add_to_queue_lineage_annotation(self, fasta_path, current_chunk_dir, organism_lineage, original_lineage, + count_seqs_original_file, count_residues_original_file, stdout_path, + dbs_to_use=['NCBI', 'NOG']): + count_seqs_chunk = get_seqs_count(fasta_path) + # taxa specific dbs include nog and ncbi + for db in dbs_to_use: + db_tax = db + 'T' + db_general = db + 'G' + path_tax, path_general, hmm_db_general = db, db, db_general + current_lineage = list(organism_lineage) + if count_seqs_chunk: + # we use this one with NCBI and NOG if there's a lineage + if current_lineage and original_lineage: + if self.mantis_paths[path_tax][0:2] != 'NA': + current_taxon = current_lineage.pop(-1) + ref_path = self.get_taxon_ref_path(current_taxon, db=path_tax) + # to skip taxons without an hmm + while not ref_path and current_lineage: + current_taxon = current_lineage.pop(-1) + ref_path = self.get_taxon_ref_path(current_taxon, db=path_tax) + if ref_path: + chunks_path = get_chunks_path(ref_path) + for chunk_hmm in chunks_path: + command, output_file = self.compile_annotation_job(chunk_hmm, + target_path=fasta_path, + output_initials=db_tax, + output_folder=current_chunk_dir, + count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + ) + self.queue.insert(0, [db_tax, command, stdout_path, output_file]) + # will be used for checking whether chunks have been annotated + self.queue.insert(len(chunks_path), + [f'{db_tax}_checkpoint', current_chunk_dir, fasta_path, current_taxon, + current_lineage, original_lineage, count_seqs_original_file, + count_residues_original_file, + len(chunks_path), stdout_path]) + self.save_temp_fasta_length(current_chunk_dir, db_tax + str(current_taxon), + count_seqs_chunk, db) + # we only use the general NCBI, NOGG is too big and would slow down annotation considerably + elif not ref_path and db == 'NCBI': + # if there are still missing annotations from the lineage annotation or there's no taxonomic classification we query against the whole TSHMM database + if self.mantis_paths[path_general][0:2] != 'NA': + ref_path = get_ref_in_folder(self.mantis_paths[path_general] + hmm_db_general) + + chunks_path = get_chunks_path(ref_path) + for chunk_hmm in chunks_path: + command, output_file = self.compile_annotation_job(chunk_hmm, + target_path=fasta_path, + output_folder=current_chunk_dir, + count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + ) + self.queue.insert(0, [db_tax, command, stdout_path, output_file]) + self.queue.insert(len(chunks_path), + [f'{db_general}_checkpoint', current_chunk_dir, fasta_path, + count_seqs_original_file, len(chunks_path)]) + self.save_temp_fasta_length(current_chunk_dir, db_general, count_seqs_chunk, db) + # we use this for any situation for NCBI, for NOG only when there's no original_lineage + else: + if (db == 'NOG' and not original_lineage) or (db == 'NCBI'): + # if there are still missing annotations from the lineage annotation or there's not taxonomic classification we query against the whole TSHMM database + if self.mantis_paths[path_general][0:2] != 'NA': + ref_path = get_ref_in_folder(self.mantis_paths[path_general] + hmm_db_general) + chunks_path = get_chunks_path(ref_path) + for chunk_hmm in chunks_path: + command, output_file = self.compile_annotation_job(chunk_hmm, + target_path=fasta_path, + output_folder=current_chunk_dir, + count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + ) + self.queue.insert(0, [db_tax, command, stdout_path, output_file]) + self.queue.insert(len(chunks_path), + [f'{db_general}_checkpoint', current_chunk_dir, fasta_path, + count_seqs_original_file, len(chunks_path)]) + self.save_temp_fasta_length(current_chunk_dir, db_general, count_seqs_chunk, db) + + ####Merging hmmer output + + def estimate_searchouts_per_chunk(self): + n_hmms = len(self.compile_refs_list()) + if self.mantis_paths['NOG'][0:2] != 'NA': n_hmms += 2 + if self.mantis_paths['NCBI'][0:2] != 'NA': n_hmms += 2 + return n_hmms + + def process_output(self): + searchout_per_chunks = self.estimate_searchouts_per_chunk() + worker_count = estimate_number_workers_process_output(n_chunks=len(self.chunks_to_annotate), + searchout_per_chunks=searchout_per_chunks, + user_cores=self.user_cores) + print_cyan(f'Processing output with {worker_count} workers.', flush=True, file=self.redirect_verbose) + if worker_count < 1: + kill_switch(BadNumberWorkers, 'process_output') + + # when we have a lot of hits in the HMM file the hit processing can be quite memory heavy, so instead we now split hits into chunks + # This process is quite light since it only stores the file the hit should be stored at, all the hit information is read and discarded from memory + # this also allows for parallelization + self.prepare_queue_split_hits(worker_count) + self.processes_handler(self.worker_split_hits, worker_count) + + self.prepare_queue_process_output() + self.processes_handler(self.worker_process_output, worker_count) + + self.prepare_queue_merge_output() + self.processes_handler(self.worker_merge_output, worker_count) + + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + self.remove_temp_fasta_length(current_chunk_dir, 'NOG') + self.remove_temp_fasta_length(current_chunk_dir, 'NCBI') + + def prepare_queue_merge_processed_output(self): + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + chunks_output_path = add_slash(f'{current_chunk_dir}processed_output') + all_output_with_chunks = os.listdir(chunks_output_path) + self.queue.append([chunks_output_path, all_output_with_chunks, f'{output_path}Mantis.out']) + + def worker_merge_processed_output(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + chunks_output_path, all_output_with_chunks, stdout_path = record + stdout_file = open(stdout_path, 'a+') + self.merge_output_chunks(chunks_output_path, all_output_with_chunks, chunk_suffix='.pro', + stdout_file=stdout_file) + stdout_file.close() + + def prepare_queue_split_hits(self, worker_count): + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + searchout_path = add_slash(f'{current_chunk_dir}searchout') + raw_searchout_path = add_slash(f'{current_chunk_dir}raw_searchout') + move_file(searchout_path, raw_searchout_path) + all_searchout = os.listdir(raw_searchout_path) + same_db_chunks = {} + for searchout in all_searchout: + chunk_n = re.search('_chunk_\d+', searchout) + if chunk_n: + temp = searchout.replace(chunk_n.group(), '') + else: + temp = searchout + if temp not in same_db_chunks: same_db_chunks[temp] = set() + same_db_chunks[temp].add(searchout) + for searchout in same_db_chunks: + self.queue.append( + [raw_searchout_path, searchout, same_db_chunks[searchout], current_chunk_dir, worker_count, + f'{output_path}Mantis.out']) + + def worker_split_hits(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + searchout_path, searchout, chunks_searchout, current_chunk_dir, worker_count, stdout_path = record + self.split_hits(searchout_path, searchout, chunks_searchout, current_chunk_dir, worker_count) + + def prepare_queue_process_output(self): + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + searchout_path = add_slash(f'{current_chunk_dir}searchout') + all_searchout = os.listdir(searchout_path) + if not file_exists(add_slash(add_slash(current_chunk_dir) + 'processed_output')): + Path(add_slash(add_slash(current_chunk_dir) + 'processed_output')).mkdir(parents=True, exist_ok=True) + for searchout in all_searchout: + if 'NOG' in searchout: + count_seqs_chunk_searchout = self.get_temp_fasta_length(current_chunk_dir, searchout, 'NOG') + elif 'NCBIT' in searchout or 'NCBIG' in searchout: + count_seqs_chunk_searchout = self.get_temp_fasta_length(current_chunk_dir, searchout, 'NCBI') + else: + count_seqs_chunk_searchout = int(count_seqs_chunk) + self.queue.append([searchout_path + searchout, current_chunk_dir, count_seqs_chunk_searchout, + count_seqs_original_file, count_residues_original_file, f'{output_path}Mantis.out']) + + def worker_process_output(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + searchout_path, current_chunk_dir, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, stdout_path = record + processed_hits = self.process_searchout(output_path=searchout_path, count_seqs_chunk=count_seqs_chunk, + count_seqs_original_file=count_seqs_original_file, + count_residues_original_file=count_residues_original_file, + stdout_path=stdout_path) + self.save_processed_hits(processed_hits, add_slash(add_slash(current_chunk_dir) + 'processed_output'), + domtblout=get_path_level(searchout_path)) + if not self.keep_files: + os.remove(searchout_path) + + def prepare_queue_merge_output(self): + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + self.queue.append([current_chunk_dir, output_path]) + + def worker_merge_output(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + current_chunk_dir, output_path = record + chunks_path = add_slash(f'{current_chunk_dir}processed_output') + chunks_to_merge = [chunks_path + i for i in os.listdir(chunks_path)] + stdout_file = open(f'{output_path}Mantis.out', 'a+') + self.merge_target_output('output_annotation.tsv', current_chunk_dir, chunks_to_merge, stdout_file, + same_output=False) + stdout_file.close() + + ###Interpreting output + + def interpret_output(self): + worker_count = estimate_number_workers_process_output(n_chunks=len(self.chunks_to_annotate), + user_cores=self.user_cores) + self.prepare_queue_interpret_output() + print_cyan(f'Interpreting output with {worker_count} workers.', flush=True, + file=self.redirect_verbose) + if worker_count < 1: + kill_switch(BadNumberWorkers, 'interpret_output') + + self.processes_handler(self.worker_interpret_output, worker_count) + + def prepare_queue_interpret_output(self): + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + output_annotation_tsv = f'{current_chunk_dir}output_annotation.tsv' + self.queue.append([output_annotation_tsv, current_chunk_dir]) + + def worker_interpret_output(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + output_annotation_tsv, current_chunk_dir = record + interpreted_annotation_tsv = f'{current_chunk_dir}integrated_annotation.tsv' + self.generate_integrated_output(output_annotation_tsv, interpreted_annotation_tsv) + + ###Generate consensus output + + def get_consensus_output(self): + Consensus.__init__(self) + worker_count = estimate_number_workers_process_output(n_chunks=len(self.chunks_to_annotate), + user_cores=self.user_cores) + self.prepare_queue_generate_consensus() + print_cyan(f'Generating consensus output with {worker_count} workers.', flush=True, + file=self.redirect_verbose) + if worker_count < 1: + kill_switch(BadNumberWorkers, 'get_consensus_output') + + self.processes_handler(self.worker_consensus_output, worker_count) + + def prepare_queue_generate_consensus(self): + for chunk_name, chunk_path, current_chunk_dir, organism_lineage, count_seqs_chunk, count_seqs_original_file, count_residues_original_file, output_path in self.chunks_to_annotate: + interepreted_annotation_tsv = f'{current_chunk_dir}integrated_annotation.tsv' + stdout_file_path = f'{output_path}Mantis.out' + self.queue.append([interepreted_annotation_tsv, current_chunk_dir, stdout_file_path]) + + def worker_consensus_output(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + interpreted_annotation_tsv, current_chunk_dir, stdout_file_path = record + consensus_annotation_tsv = f'{current_chunk_dir}consensus_annotation.tsv' + self.generate_consensus_output(interpreted_annotation_tsv, consensus_annotation_tsv, stdout_file_path) + + # Merging Mantis output + + def merge_mantis_output(self): + worker_count = estimate_number_workers_process_output(n_chunks=len(self.chunks_to_annotate), + user_cores=self.user_cores) + self.prepare_queue_merge_mantis_output() + print_cyan(f'Merging output with {worker_count} workers.', flush=True, file=self.redirect_verbose) + if worker_count < 1: + kill_switch(BadNumberWorkers, 'merge_mantis_output') + + self.processes_handler(self.worker_merge_mantis_output, worker_count) + + def prepare_queue_merge_mantis_output(self): + for output_path in self.chunks_to_fasta: + self.queue.append([output_path, self.chunks_to_fasta[output_path]]) + + def worker_merge_mantis_output(self, queue, master_pid): + while True: + record = queue.pop(0) + if record is None: break + output_path, chunks_path = record + chunks_output = [f'{i}output_annotation.tsv' for i in chunks_path] + self.merge_chunks_outputs(output_path, chunks_path) + + def merge_chunks_outputs(self, output_path, chunks_path): + stdout_file = open(f'{output_path}Mantis.out', 'a+') + self.merge_target_output(output_file='output_annotation.tsv', output_folder=output_path, + chunks_path=chunks_path, stdout_file=stdout_file) + self.merge_target_output(output_file='integrated_annotation.tsv', output_folder=output_path, + chunks_path=chunks_path, stdout_file=stdout_file) + if not self.skip_consensus: + self.merge_target_output(output_file='consensus_annotation.tsv', output_folder=output_path, + chunks_path=chunks_path, stdout_file=stdout_file) + print('------------------------------------------', flush=True, file=stdout_file) + print_cyan('This sample has been sucessfully annotated!', flush=True, file=stdout_file) + stdout_file.close() diff --git a/mantis/taxonomy_sqlite_connector.py b/mantis/taxonomy_sqlite_connector.py new file mode 100644 index 0000000..dc5d0ea --- /dev/null +++ b/mantis/taxonomy_sqlite_connector.py @@ -0,0 +1,344 @@ +try: + from mantis.utils import * +except: + from utils import * + + +class Taxonomy_SQLITE_Connector(): + + def __init__(self, resources_folder): + self.info_splitter = '##' + self.insert_step = 50000 + self.resources_folder = resources_folder + self.taxonomy_db_file = f'{self.resources_folder}Taxonomy.db' + Path(self.resources_folder).mkdir(parents=True, exist_ok=True) + + ''' + this just creates an sql database from two gtdb files to convert gtdb to ncbi. first we download them and create the db + then anytime we need to fetch info we just open the db, fetch the info, and close the connection + ''' + + def start_taxonomy_connection(self): + self.sqlite_connection = sqlite3.connect(self.taxonomy_db_file) + self.cursor = self.sqlite_connection.cursor() + + def commit_and_close_sqlite_cursor(self): + self.sqlite_connection.commit() + self.sqlite_connection.close() + + def close_taxonomy_connection(self): + self.sqlite_connection.close() + + def check_table(self): + self.cursor.execute("SELECT * FROM GTDB2NCBI limit 10") + res_fetch = self.cursor.fetchall() + print(res_fetch) + + def process_gtdb_taxonomy(self, gtdb_lineage): + res = None + temp = gtdb_lineage.split(';') + if len(temp) > 1: + temp = ['_'.join(i.split('_')[2:]) for i in temp] + for i in range(len(temp)): + current_str = temp[i].split('_') + current_str = [i for i in current_str if len(i) > 1] + temp[i] = '_'.join(current_str) + + lineage_str = self.info_splitter.join(temp) + while temp and not res: + res = temp.pop(-1).strip() + return res, lineage_str + + def read_gtdb_tsv(self, gtdb_tsv): + # to avoid duplicate entries in sql + already_added = set() + with open(gtdb_tsv) as file: + line = file.readline() + line = line.strip('\n') + line = line.split('\t') + ncbi_id_index = line.index('ncbi_taxid') + gtdb_taxonomy_index = line.index('gtdb_taxonomy') + for line in file: + line = line.strip('\n') + line = line.split('\t') + gtdb_taxonomy = line[gtdb_taxonomy_index] + ncbi_id = line[ncbi_id_index] + + most_resolved, lineage_str = self.process_gtdb_taxonomy(gtdb_taxonomy) + yield_str = self.info_splitter.join([most_resolved, ncbi_id]) + if yield_str not in already_added: + yield most_resolved, lineage_str, ncbi_id + already_added.add(yield_str) + + def get_metadata_files_gtdb(self): + url = 'https://data.gtdb.ecogenomic.org/releases/latest/' + req = requests.get(url) + webpage = req.text + bac_file = re.compile('bac\d+_metadata.tar.gz') + bac_metadata_string = bac_file.search(webpage) + if bac_metadata_string: + bac_metadata_string = bac_metadata_string.group() + else: + bac_metadata_string = None + ar_file = re.compile('ar\d+_metadata.tar.gz') + ar_metadata_string = ar_file.search(webpage) + if ar_metadata_string: + ar_metadata_string = ar_metadata_string.group() + else: + ar_metadata_string = None + if not ar_metadata_string: + ar_url = 'https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_metadata_r207.tar.gz' + else: + ar_url = f'{url}{ar_metadata_string}' + if not bac_metadata_string: + bac_url = 'https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_metadata_r207.tar.gz' + else: + bac_url = f'{url}{bac_metadata_string}' + + return bac_metadata_string, bac_url, ar_metadata_string, ar_url + + def download_data(self): + bac_metadata_string, bac_url, ar_metadata_string, ar_url = self.get_metadata_files_gtdb() + + if bac_metadata_string: + bac_file = f'{self.temp_folder}{bac_metadata_string}' + try: + bac_file_unc = [i for i in os.listdir(self.temp_folder) if i.startswith('bac') and i.endswith('.tsv')][ + 0] + bac_file_unc = f'{self.temp_folder}{bac_file_unc}' + except: + bac_file_unc = None + if file_exists(bac_file) or file_exists(bac_file_unc): + pass + else: + download_file(bac_url, output_folder=self.temp_folder) + + if file_exists(bac_file): uncompress_archive(bac_file, remove_source=True) + self.bac_file = [i for i in os.listdir(self.temp_folder) if i.startswith('bac') and i.endswith('.tsv')][0] + self.bac_file = f'{self.temp_folder}{self.bac_file}' + else: + self.bac_file = None + + if ar_metadata_string: + ar_file = f'{self.temp_folder}{ar_metadata_string}' + try: + ar_file_unc = [i for i in os.listdir(self.temp_folder) if i.startswith('ar') and i.endswith('.tsv')][0] + ar_file_unc = f'{self.temp_folder}{ar_file_unc}' + except: + ar_file_unc = None + if file_exists(ar_file) or file_exists(ar_file_unc): + pass + else: + download_file(ar_url, output_folder=self.temp_folder) + if file_exists(ar_file): uncompress_archive(ar_file, remove_source=True) + self.ar_file = [i for i in os.listdir(self.temp_folder) if i.startswith('ar') and i.endswith('.tsv')][0] + self.ar_file = f'{self.temp_folder}{self.ar_file}' + else: + self.ar_file = None + + taxonomy_file = f'{self.temp_folder}new_taxdump.tar.gz' + taxonomy_url = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz' + download_file(taxonomy_url, output_folder=self.temp_folder) + if file_exists(taxonomy_file): uncompress_archive(taxonomy_file, remove_source=True) + + def create_taxonomy_db(self): + if not os.path.exists(self.taxonomy_db_file): + self.temp_folder = f'{self.resources_folder}Taxonomy_temp{SPLITTER}' + + Path(self.temp_folder).mkdir(parents=True, exist_ok=True) + self.download_data() + # this will probably need to be changed to an output_folder provided by the user + if os.path.exists(self.taxonomy_db_file): + os.remove(self.taxonomy_db_file) + self.sqlite_connection = sqlite3.connect(self.taxonomy_db_file) + self.cursor = self.sqlite_connection.cursor() + + create_table_command = f'CREATE TABLE GTDB2NCBI (' \ + f'GTDB TEXT,' \ + f'GTDBLINEAGE TEXT,' \ + f'NCBI INTEGER )' + self.cursor.execute(create_table_command) + create_index_command = f'CREATE INDEX GTDB2NCBI_IDX ON GTDB2NCBI (GTDB,GTDBLINEAGE,NCBI)' + self.cursor.execute(create_index_command) + + create_table_command = f'CREATE TABLE NCBILINEAGE (' \ + f'NCBI INTEGER,' \ + f'LINEAGE TEXT )' + self.cursor.execute(create_table_command) + create_index_command = f'CREATE INDEX NCBILINEAGEI_IDX ON NCBILINEAGE (NCBI)' + self.cursor.execute(create_index_command) + self.sqlite_connection.commit() + self.store_gtdb2ncbi() + self.store_ncbi_lineage() + shutil.rmtree(self.temp_folder) + + def generate_inserts(self, input_generator): + step = self.insert_step + temp = [] + for i in input_generator: + temp.append(i) + if len(temp) >= step: + yield temp + temp = [] + yield temp + + def chain_generators(self): + if self.bac_file: + for i in self.read_gtdb_tsv(self.bac_file): + yield i + if self.ar_file: + for i in self.read_gtdb_tsv(self.ar_file): + yield i + + def store_gtdb2ncbi(self): + gtdb2ncbi = self.chain_generators() + generator_insert = self.generate_inserts(gtdb2ncbi) + for table_chunk in generator_insert: + insert_command = f'INSERT INTO GTDB2NCBI (GTDB,GTDBLINEAGE, NCBI) values (?,?,?)' + self.cursor.executemany(insert_command, table_chunk) + self.sqlite_connection.commit() + + def read_ncbi_lineage(self): + with open(f'{self.temp_folder}taxidlineage.dmp') as file: + for line in file: + line = line.strip('\n').replace('|', '') + line = line.split() + taxon_id = line[0] + lineage = line[1:] + yield taxon_id, ','.join(lineage) + + def store_ncbi_lineage(self): + ncbi_lineage = self.read_ncbi_lineage() + generator_insert = self.generate_inserts(ncbi_lineage) + for table_chunk in generator_insert: + insert_command = f'INSERT INTO NCBILINEAGE (NCBI, LINEAGE) values (?,?)' + self.cursor.executemany(insert_command, table_chunk) + self.sqlite_connection.commit() + + def fetch_ncbi_id(self, gtdb_lineage): + res = set() + gtdb_id, _ = self.process_gtdb_taxonomy(gtdb_lineage) + fetch_command = f'SELECT GTDB,NCBI FROM GTDB2NCBI WHERE GTDB = "{gtdb_id}"' + res_fetch = self.cursor.execute(fetch_command).fetchall() + for i in res_fetch: + res.add(i[1]) + if len(res) == 1: + res = list(res)[0] + elif len(res) == 0: + res = None + elif len(res) > 1: + res = self.get_lowest_common_ancestor_ncbi(res) + return res + + def fetch_gtdb_id(self, ncbi_id): + res = set() + fetch_command = f'SELECT GTDB,GTDBLINEAGE,NCBI FROM GTDB2NCBI WHERE NCBI = "{ncbi_id}"' + res_fetch = self.cursor.execute(fetch_command).fetchall() + for i in res_fetch: + res.add(i) + if len(res) == 1: + res = list(res)[0][1] + elif len(res) == 0: + res = None + # if the ncbi matches with more than one gtdb id + elif len(res) > 1: + all_lineages = [] + for gtdb_id, lineage, ncbi_id in res: + lineage = lineage.split(self.info_splitter) + all_lineages.append(lineage) + lca = self.get_lowest_common_ancestor_gtdb(all_lineages) + res = lca + return res + + def fetch_ncbi_lineage(self, ncbi_id): + fetch_command = f'SELECT NCBI,LINEAGE FROM NCBILINEAGE WHERE NCBI = "{ncbi_id}"' + res_fetch = self.cursor.execute(fetch_command).fetchone() + if res_fetch: + taxon_id, lineage = res_fetch + lineage = lineage.split(',') + lineage.append(str(taxon_id)) + return lineage + return [] + + def launch_taxonomy_connector(self): + if os.path.exists(self.taxonomy_db_file): + self.start_taxonomy_connection() + return True + else: + print(f'Database file {self.taxonomy_db_file} does not exist') + return False + + def get_lowest_common_ancestor_ncbi(self, list_taxons): + print('Retrieved multiple NCBI entries, retrieving lowest common ancestry NCBI ID') + all_lineages = [] + min_size = None + for taxon_id in list_taxons: + taxon_lineage = self.fetch_ncbi_lineage(taxon_id) + if taxon_lineage: + all_lineages.append(taxon_lineage) + if not min_size: min_size = len(taxon_lineage) + if len(taxon_lineage) < min_size: min_size = len(taxon_lineage) + i = 0 + for i in range(min_size): + temp = set() + for taxon_lineage in all_lineages: + temp.add(taxon_lineage[i]) + if len(temp) > 1: + break + lca = all_lineages[0][0:i + 1] + return lca[-1] + + def get_lowest_common_ancestor_gtdb(self, list_taxon_lineages): + print('Retrieved multiple GTDB entries, retrieving lowest common ancestry GTDB ID') + min_size = None + for taxon_lineage in list_taxon_lineages: + if not min_size: min_size = len(taxon_lineage) + if len(taxon_lineage) < min_size: min_size = len(taxon_lineage) + i = 0 + for i in range(min_size): + temp = set() + for taxon_lineage in list_taxon_lineages: + temp.add(taxon_lineage[i]) + if len(temp) > 1: + break + lca = list_taxon_lineages[0][0:i + 1] + return lca[-1] + + def get_taxa_ncbi_url(self, url): + webpage = None + c = 0 + while not webpage and c <= 10: + req = requests.get(url) + try: + webpage = req.text + taxa_id = re.search('\d+', webpage) + return re.search('\d+', taxa_id.group()).group() + except: + print('Could not get a response from NCBI, trying again') + c += 1 + + def get_taxa_ncbi(self, organism_name): + taxa_id = self.fetch_ncbi_id(organism_name) + if taxa_id: return str(taxa_id) + url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term={organism_name}' + taxa_id = self.get_taxa_ncbi_url(url) + if not taxa_id: + url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=candidatus+{organism_name}' + taxa_id = self.get_taxa_ncbi_url(url) + if not taxa_id: print(f'Could not find taxa ID for {organism_name}') + return str(taxa_id) + + +if __name__ == '__main__': + gtdb_connector = Taxonomy_SQLITE_Connector(resources_folder='/home/pedroq/Desktop/test_cr/') + gtdb_connector.launch_taxonomy_connector() + # gtdb_connector.create_taxonomy_db() + a = gtdb_connector.fetch_ncbi_id( + 'd__Archaea;p__Halobacteriota;c__Methanosarcinia;o__Methanosarcinales;f__Methanosarcinaceae;g__Methanolobus;s__Methanolobus psychrophilus') + print(a) + a = gtdb_connector.fetch_ncbi_id('Clostridium_P perfringens') + print(a) + a = gtdb_connector.fetch_ncbi_id('s__Bacillus subtilis') + print(a) + a = gtdb_connector.fetch_gtdb_id('1423') + print(a) diff --git a/mantis/unifunc_wrapper.py b/mantis/unifunc_wrapper.py new file mode 100644 index 0000000..f7f17bf --- /dev/null +++ b/mantis/unifunc_wrapper.py @@ -0,0 +1,30 @@ +try: + from mantis.utils import unifunc_downloaded, download_unifunc +except: + from utils import unifunc_downloaded, download_unifunc + +try: + if not unifunc_downloaded(): + download_unifunc() + from Resources.UniFunc.unifunc.source import UniFunc +except: + from UniFunc.unifunc.source import UniFunc + + +def test_nlp(): + nlp = UniFunc_wrapper() + str1 = 'Responsible for trypanothione reduction' + str2 = 'Protein associated with trypanothione reductase activity' + res = nlp.get_similarity_score(str1, str2, verbose=True) + print(res) + + +# this is basically a wrapper for UniFunc +class UniFunc_wrapper(UniFunc): + def __init__(self): + UniFunc.__init__(self) + + +if __name__ == '__main__': + nlp = UniFunc_wrapper() + test_nlp()