diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index b17dbd6..2b13725 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -36,3 +36,26 @@ jobs: python setup.py install cd pypgatk python tests/pypgatk_tests.py + - name: Update changelog + uses: heinrichreimer/github-changelog-generator-action@v2.2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Update repo with new version info + uses: EndBug/add-and-commit@v7 + with: + add: CHANGELOG.md + author_name: Release Auto-Commit + author_email: ypriverol@gmail.com + branch: dev + message: Updates version info for ${{github.event.release.name}} + + - name: Generate coverage report + run: | + pip install pytest + pip install pytest-cov + python setup.py install + cd pypgatk + pytest --cov=./ --cov-report=xml tests/* + + diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..456efee --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ +# Changelog + +## [v0.0.8](https://github.com/bigbio/py-pgatk/tree/v0.0.8) (2021-01-19) + +**Implemented enhancements:** + +- Cosmic mutations for celllines [\#28](https://github.com/bigbio/py-pgatk/issues/28) + +## [v0.0.7](https://github.com/bigbio/py-pgatk/tree/v0.0.7) (2021-01-11) + +## [v0.0.6](https://github.com/bigbio/py-pgatk/tree/v0.0.6) (2020-05-02) + +**Implemented enhancements:** + +- Add the decoy Sanger tool to the library [\#19](https://github.com/bigbio/py-pgatk/issues/19) +- Major refactoring of the library to be more package oriented design [\#13](https://github.com/bigbio/py-pgatk/issues/13) +- Create / Download Canonical protein fasta file from ENSEMBL [\#1](https://github.com/bigbio/py-pgatk/issues/1) + + + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/README.md b/README.md index fa46da8..ba22c6b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ ![Python application](https://github.com/bigbio/py-pgatk/workflows/Python%20application/badge.svg) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pypgatk/README.html) - +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/f6d030fd7d69413987f7265a01193324)](https://www.codacy.com/gh/bigbio/py-pgatk/dashboard?utm_source=github.com&utm_medium=referral&utm_content=bigbio/py-pgatk&utm_campaign=Badge_Grade) +[![PyPI version](https://badge.fury.io/py/pypgatk.svg)](https://badge.fury.io/py/pypgatk) +![PyPI - Downloads](https://img.shields.io/pypi/dm/pypgatk) **pypgatk** is a Python library part of the [ProteoGenomics Analysis Toolkit](https://pgatk.readthedocs.io/en/latest). It provides different bioinformatics tools for proteogenomics data analysis. @@ -99,4 +101,6 @@ Commands: The library provides multiple commands to download, translate and generate protein sequence databases from reference and mutation genome databases. 
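As a quick orientation before the code changes below: each pypgatk CLI command is a thin wrapper around a service class, so the pipelines can also be driven from Python. A minimal sketch (the input file names and config path are illustrative assumptions, not part of this change):

```python
# Hypothetical usage sketch: drive the COSMIC-to-proteindb pipeline directly.
# The input file names below are placeholders; CancerGenomesService and its
# CONFIG_* keys are the ones used by the cosmic-to-proteindb command in this diff.
from pypgatk.cgenomes.cgenomes_proteindb import CancerGenomesService

pipeline_arguments = {
    CancerGenomesService.CONFIG_CANCER_GENOMES_MUTATION_FILE: "CosmicMutantExport.tsv",
    CancerGenomesService.CONFIG_COMPLETE_GENES_FILE: "All_COSMIC_Genes.fasta",
    CancerGenomesService.CONFIG_OUTPUT_FILE: "cosmic-proteindb.fa",
}

# Defaults come from the YAML config; the pipeline arguments above override them.
service = CancerGenomesService("pypgatk/config/cosmic_config.yaml", pipeline_arguments)
service.cosmic_to_proteindb()
```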
-### Please read full docs here: +# Full Documentation + +[https://pgatk.readthedocs.io/en/latest/pypgatk.html](https://pgatk.readthedocs.io/en/latest/pypgatk.html) diff --git a/pypgatk/cgenomes/cgenomes_proteindb.py b/pypgatk/cgenomes/cgenomes_proteindb.py index 1e35cf3..14e548d 100644 --- a/pypgatk/cgenomes/cgenomes_proteindb.py +++ b/pypgatk/cgenomes/cgenomes_proteindb.py @@ -17,7 +17,7 @@ class CancerGenomesService(ParameterConfiguration): ACCEPTED_VALUES = "accepted_values" SPLIT_BY_FILTER_COLUMN = "split_by_filter_column" CLINICAL_SAMPLE_FILE = 'clinical_sample_file' - + def __init__(self, config_file, pipeline_arguments): """ Init the class with the specific parameters. @@ -34,7 +34,7 @@ def __init__(self, config_file, pipeline_arguments): if self.CONFIG_OUTPUT_FILE in self.get_pipeline_parameters(): self._local_output_file = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_FILE] - + self._filter_column = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_FILTER_INFO][self.FILTER_COLUMN] if self.FILTER_COLUMN in self.get_pipeline_parameters(): self._filter_column = self.get_pipeline_parameters()[self.FILTER_COLUMN] @@ -47,11 +47,11 @@ def __init__(self, config_file, pipeline_arguments): self._split_by_filter_column = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_FILTER_INFO][self.SPLIT_BY_FILTER_COLUMN] if self.SPLIT_BY_FILTER_COLUMN in self.get_pipeline_parameters(): self._split_by_filter_column = self.get_pipeline_parameters()[self.SPLIT_BY_FILTER_COLUMN] - + self._local_clinical_sample_file = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_FILTER_INFO][self.CLINICAL_SAMPLE_FILE] if self.CLINICAL_SAMPLE_FILE in self.get_pipeline_parameters(): self._local_clinical_sample_file = self.get_pipeline_parameters()[self.CLINICAL_SAMPLE_FILE] - + @staticmethod def get_multiple_options(options_str: str): """ @@ -129,7 +129,7 @@ def get_mut_pro_seq(snp, seq): del_index1 = int(positions[0]) - 1 del_index2 = int(positions[1]) mut_pro_seq = protein_seq[:del_index1] + mut_aa + protein_seq[del_index2:] - + elif "insertion" in snp.type: ins_index1 = int(positions[0]) - 1 mut_pro_seq = protein_seq[:ins_index1] + mut_aa + protein_seq[ins_index1 + 1:] @@ -141,7 +141,7 @@ def get_mut_pro_seq(snp, seq): else: del_index1 = int(positions[0]) - 1 mut_pro_seq = protein_seq[:del_index1] + mut_aa.replace("*", "") - + return mut_pro_seq @@ -158,7 +158,7 @@ def cosmic_to_proteindb(self): COSMIC_CDS_DB[record.id].append(record) except KeyError: COSMIC_CDS_DB[record.id] = [record] - + cosmic_input = open(self._local_mutation_file, encoding="latin-1") # CosmicMutantExport.tsv header = cosmic_input.readline().split("\t") @@ -171,7 +171,7 @@ def cosmic_to_proteindb(self): filter_col = None if self._filter_column: filter_col = header.index(self._filter_column) - + output = open(self._local_output_file, 'w') mutation_dic = {} @@ -188,7 +188,7 @@ def cosmic_to_proteindb(self): if filter_col: if row[filter_col] not in self._accepted_values and self._accepted_values!=['all']: continue - + if "coding silent" in row[muttype_col]: continue @@ -200,10 +200,10 @@ def cosmic_to_proteindb(self): seqs = [] for record in this_gene_records: seqs.append(record.seq) - + except KeyError: # geneID is not in All_COSMIC_Genes.fasta continue - + for seq in seqs: try: mut_pro_seq = self.get_mut_pro_seq(snp, seq) @@ -211,28 +211,28 @@ def cosmic_to_proteindb(self): continue if mut_pro_seq: break - + if mut_pro_seq: entry = ">%s\n%s\n" % (header, mut_pro_seq) if header not in mutation_dic: 
                    output.write(entry)
                    mutation_dic[header] = 1
-
+
                if self._split_by_filter_column and filter_col:
                    try:
                        groups_mutations_dict[row[filter_col]][header] = entry
                    except KeyError:
                        groups_mutations_dict[row[filter_col]] = {header: entry}
-
+
        for group_name in groups_mutations_dict.keys():
            with open(self._local_output_file.replace('.fa', '')+ '_' + regex.sub('', group_name) +'.fa', 'w') as fn:
                for header in groups_mutations_dict[group_name].keys():
                    fn.write(groups_mutations_dict[group_name][header])
-
+
        self.get_logger().debug("COSMIC contains in total {} non-redundant mutations".format(len(mutation_dic)))
        cosmic_input.close()
        output.close()
-
+
    @staticmethod
    def get_sample_headers(header_line):
        try:
@@ -246,7 +246,7 @@
            print('SAMPLE_ID was not found in the header row:', header_line)
            return None, None
        return tissue_type_col, sample_id_col
-
+
    def get_tissue_type_per_sample(self, local_clinical_sample_file):
        sample_tissue_type = {}
        if local_clinical_sample_file:
@@ -262,12 +262,12 @@
                if tissue_type_col and sample_id_col:
                    sample_tissue_type[sl[sample_id_col]] = sl[tissue_type_col].strip().replace(' ','_')
        return sample_tissue_type
-
+
    @staticmethod
    def get_mut_header_cols(header_cols, row, filter_column, accepted_values, split_by_filter_column):
        for col in header_cols.keys():
            header_cols[col] = row.index(col)
-
+
        # check if (filter_column) tissue type should be considered
        if accepted_values!=['all'] or split_by_filter_column:
            try:
@@ -275,11 +275,11 @@
            except ValueError:
                print("{} was not found in the header {} of mutations file".format(filter_column, row))
                header_cols[filter_column] = None
-
+
        return header_cols
-
+
    def cbioportal_to_proteindb(self):
-        """cBioportal studies have a data_clinical_sample.txt file
+        """cBioportal studies have a data_clinical_sample.txt file
        that shows the Primary Tumor Site per Sample Identifier;
        it matches the (filter_column) Tumor_Sample_Barcode column in the mutations file.
""" @@ -289,19 +289,19 @@ def cbioportal_to_proteindb(self): output = open(self._local_output_file, "w") sample_groups_dict = {} group_mutations_dict = {} - + seq_dic = {} for record in fafile: newacc = record.id.split(".")[0] if newacc not in seq_dic: seq_dic[newacc] = record.seq - - header_cols = {"HGVSc": None, "Transcript_ID": None, "Variant_Classification": None, + + header_cols = {"HGVSc": None, "Transcript_ID": None, "Variant_Classification": None, "Variant_Type": None, "HGVSp_Short": None} nucleotide = ["A", "T", "C", "G"] mutclass = ["Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation"] - + # check if sample id and clinical files are given, if not and tissue type is required then exit if self._accepted_values!=['all'] or self._split_by_filter_column: if self._local_clinical_sample_file: @@ -311,7 +311,7 @@ def cbioportal_to_proteindb(self): else: print('No clinical sample file is given therefore no tissue type can be detected.') return - + for i,line in enumerate(mutfile): row = line.strip().split("\t") if row[0]=='#': @@ -337,7 +337,7 @@ def cbioportal_to_proteindb(self): print("No sampleID was found in (line {}): {}".format(i, row)) if group not in self._accepted_values and self._accepted_values != ['all']: continue - + gene = row[0] try: pos = row[header_cols["HGVSc"]] @@ -353,13 +353,13 @@ def cbioportal_to_proteindb(self): continue if varclass not in mutclass: continue - + try: seq = seq_dic[enst] except KeyError: print("%s not found:" % enst) continue - + if ":" in pos: cdna_pos = pos.split(":")[1] else: @@ -411,30 +411,30 @@ def cbioportal_to_proteindb(self): enst_pos = int(re.findall(r'\d+', cdna_pos.split("_")[1])[0]) else: print("unexpected insertion format") - continue; + continue seq_mut = seq[:enst_pos] + ins_dna + seq[enst_pos:] if seq_mut == "": - continue; + continue mut_pro_seq = seq_mut.translate(to_stop=True) if len(mut_pro_seq) > 6: header = "cbiomut:%s:%s:%s:%s" % (enst, gene, aa_mut, varclass) output.write(">%s\n%s\n" % (header, mut_pro_seq)) - + if self._split_by_filter_column: try: group_mutations_dict[group][header] = mut_pro_seq except KeyError: group_mutations_dict[group] = {header: mut_pro_seq} - + output.close() mutfile.close() fafile.close() - + for group in group_mutations_dict.keys(): with open(self._local_output_file.replace('.fa', '')+ '_' + regex.sub('', group) +'.fa', 'w') as fn: for header in group_mutations_dict[group].keys(): fn.write(">{}\n{}\n".format(header, group_mutations_dict[group][header])) - + diff --git a/pypgatk/cgenomes/cosmic_downloader.py b/pypgatk/cgenomes/cosmic_downloader.py index 96d3195..e40270f 100644 --- a/pypgatk/cgenomes/cosmic_downloader.py +++ b/pypgatk/cgenomes/cosmic_downloader.py @@ -3,100 +3,110 @@ import requests +from pypgatk.toolbox.exceptions import AppConfigException from pypgatk.toolbox.general import ParameterConfiguration, check_create_folders class CosmicDownloadService(ParameterConfiguration): - CONFIG_KEY_DATA_DOWNLOADER = 'cosmic_data' - CONFIG_OUTPUT_DIRECTORY = 'output_directory' - CONFIG_COSMIC_SERVER = 'cosmic_server' - CONFIG_COSMIC_FTP_URL = 'cosmic_ftp' - CONFIG_COSMIC_FTP_USER = "cosmic_user" - CONFIG_COSMIC_FTP_PASSWORD = "cosmic_password" - CONFIG_COSMIC_MUTATIONS_URL = "mutations_url" - CONFIG_COSMIC_MUTATIONS_FILE = "mutations_file" - CONFIG_COSMIC_CELLLINE_MUTATIONS_URL = "mutations_cellline_url" - CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE = "mutations_cellline_file" - CONFIG_COSMIC_CDS_GENES_FILE = "all_cds_genes_file" - 
CONFIG_COSMIC_CELLLINES_GENES_FILE = "all_celllines_genes_file"
-
-
-    def __init__(self, config_file, pipeline_arguments):
-        """
+    CONFIG_KEY_DATA_DOWNLOADER = 'cosmic_data'
+    CONFIG_OUTPUT_DIRECTORY = 'output_directory'
+    CONFIG_COSMIC_SERVER = 'cosmic_server'
+    CONFIG_COSMIC_FTP_URL = 'cosmic_ftp'
+    CONFIG_COSMIC_FTP_USER = "cosmic_user"
+    CONFIG_COSMIC_FTP_PASSWORD = "cosmic_password"
+    CONFIG_COSMIC_MUTATIONS_URL = "mutations_url"
+    CONFIG_COSMIC_MUTATIONS_FILE = "mutations_file"
+    CONFIG_COSMIC_CELLLINE_MUTATIONS_URL = "mutations_cellline_url"
+    CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE = "mutations_cellline_file"
+    CONFIG_COSMIC_CDS_GENES_FILE = "all_cds_genes_file"
+    CONFIG_COSMIC_CELLLINES_GENES_FILE = "all_celllines_genes_file"
+
+    def __init__(self, config_file, pipeline_arguments):
+        """
        Init the class with the specific parameters.
        :param config_file configuration file
        :param pipeline_arguments pipeline arguments
        """
-        super(CosmicDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments)
+        super(CosmicDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments)

-        if self.CONFIG_OUTPUT_DIRECTORY in self.get_pipeline_parameters():
-            self._local_path_cosmic = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_DIRECTORY]
-        else:
-            self._local_path_cosmic = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
-                self.CONFIG_OUTPUT_DIRECTORY]
+        if self.CONFIG_OUTPUT_DIRECTORY in self.get_pipeline_parameters():
+            self._local_path_cosmic = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_DIRECTORY]
+        else:
+            self._local_path_cosmic = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
+                self.CONFIG_OUTPUT_DIRECTORY]

-        self._cosmic_token = base64.b64encode("{}:{}".format(self.get_pipeline_parameters()[self.CONFIG_COSMIC_FTP_USER], self.get_pipeline_parameters()[self.CONFIG_COSMIC_FTP_PASSWORD])
-                                              .encode()).decode('utf-8')
+        self._cosmic_token = base64.b64encode("{}:{}".format(self.get_pipeline_parameters()[self.CONFIG_COSMIC_FTP_USER],
+                                                             self.get_pipeline_parameters()[
+                                                                 self.CONFIG_COSMIC_FTP_PASSWORD])
+                                              .encode()).decode('utf-8')

-        self.prepare_local_cosmic_repository()
+        self.prepare_local_cosmic_repository()

-    def prepare_local_cosmic_repository(self):
-        self.get_logger().debug("Preparing local cbioportal repository, root folder - '{}'".format(
-            self.get_local_path_root_cosmic_repo()))
-        check_create_folders([self.get_local_path_root_cosmic_repo()])
-        self.get_logger().debug(
-            "Local path for cbioportal Release - '{}'".format(self.get_local_path_root_cosmic_repo()))
+    def prepare_local_cosmic_repository(self):
+        self.get_logger().debug("Preparing local cosmic repository, root folder - '{}'".format(
+            self.get_local_path_root_cosmic_repo()))
+        check_create_folders([self.get_local_path_root_cosmic_repo()])
+        self.get_logger().debug(
+            "Local path for cosmic Release - '{}'".format(self.get_local_path_root_cosmic_repo()))

-    def get_local_path_root_cosmic_repo(self):
-        return self._local_path_cosmic
+    def get_local_path_root_cosmic_repo(self):
+        return self._local_path_cosmic

-    def download_mutation_file(self):
-        """
+    def download_mutation_file(self):
+        """
        This function will download the mutations file from Cosmic Database.
:return: None """ - mutation_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_MUTATIONS_FILE]) - cds_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_CDS_GENES_FILE]) - - mutation_celline_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE]) - cellines_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_CELLLINES_GENES_FILE]) - - server = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_FTP_URL] - - cosmic_version = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_MUTATIONS_URL] - mutation_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_MUTATIONS_FILE] - - cosmic_cellline_version = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_CELLLINE_MUTATIONS_URL] - mutation_celline_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE] - - all_cds_gene_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_CDS_GENES_FILE] - - all_celllines_gene_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][ - self.CONFIG_COSMIC_CELLLINES_GENES_FILE] - - mutation_url = "{}/{}/{}".format(server, cosmic_version, mutation_file) - cds_gene_url = "{}/{}/{}".format(server, cosmic_version, all_cds_gene_file) - - celllines_gene_url = "{}/{}/{}".format(server, cosmic_cellline_version, all_celllines_gene_file) - mutation_cellline_url = "{}/{}/{}".format(server, cosmic_cellline_version, mutation_celline_file) - - token = "Basic {}".format(self._cosmic_token) - self.download_file_cosmic(mutation_url, mutation_output_file, token) - self.download_file_cosmic(cds_gene_url, cds_genes_output_file, token) - - self.download_file_cosmic(celllines_gene_url, cellines_genes_output_file, token) - self.download_file_cosmic(mutation_cellline_url, mutation_celline_output_file, token) - - - def download_file_cosmic(self, url, local_file, token): - """ + mutation_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ + self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_MUTATIONS_FILE]) + cds_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ + self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_CDS_GENES_FILE]) + + mutation_celline_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ + self.CONFIG_COSMIC_SERVER][ + self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE]) + cellines_genes_output_file = "{}/{}".format(self.get_local_path_root_cosmic_repo(), + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ + 
self.CONFIG_COSMIC_SERVER][self.CONFIG_COSMIC_CELLLINES_GENES_FILE])
+
+        server = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_FTP_URL]
+
+        cosmic_version = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_MUTATIONS_URL]
+        mutation_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_MUTATIONS_FILE]
+
+        cosmic_cellline_version = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_CELLLINE_MUTATIONS_URL]
+        mutation_celline_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_CELLLINE_MUTATIONS_FILE]
+
+        all_cds_gene_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_CDS_GENES_FILE]
+
+        all_celllines_gene_file = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_COSMIC_SERVER][
+            self.CONFIG_COSMIC_CELLLINES_GENES_FILE]
+
+        mutation_url = "{}/{}/{}".format(server, cosmic_version, mutation_file)
+        cds_gene_url = "{}/{}/{}".format(server, cosmic_version, all_cds_gene_file)
+
+        celllines_gene_url = "{}/{}/{}".format(server, cosmic_cellline_version, all_celllines_gene_file)
+        mutation_cellline_url = "{}/{}/{}".format(server, cosmic_cellline_version, mutation_celline_file)
+
+        token = "Basic {}".format(self._cosmic_token)
+        self.download_file_cosmic(mutation_url, mutation_output_file, token)
+        self.download_file_cosmic(cds_gene_url, cds_genes_output_file, token)
+
+        self.download_file_cosmic(celllines_gene_url, cellines_genes_output_file, token)
+        self.download_file_cosmic(mutation_cellline_url, mutation_celline_output_file, token)
+
+    def download_file_cosmic(self, url, local_file, token):
+        """
        Download file from cosmic repository using requests
        :param url: url of the file to be downloaded
        :param local_file: local file
@@ -104,15 +114,20 @@
        :return:
        """
-        response = requests.get(url, stream=True, headers={'Authorization': token})
-        if response.status_code == 200:
-            url = json.loads(response.text)['url']
-            msg = "Downloading file from url '{}'".format(url)
-            self.get_logger().debug(msg)
-
-            response = requests.get(url, stream=True)
-            if response.status_code == 200:
-                with open(local_file, 'wb') as f:
-                    f.write(response.content)
-                msg = "Download Finish for file '{}'".format(local_file)
-                self.get_logger().debug(msg)
+        response = requests.get(url, stream=True, headers={'Authorization': token})
+        if response.status_code == 200:
+            url = json.loads(response.text)['url']
+            msg = "Downloading file from url '{}'".format(url)
+            self.get_logger().debug(msg)
+
+            response = requests.get(url, stream=True)
+            if response.status_code == 200:
+                with open(local_file, 'wb') as f:
+                    f.write(response.content)
+                msg = "Download finished for file '{}'".format(local_file)
+                self.get_logger().debug(msg)
+        else:
+            msg = "Error downloading the COSMIC data, error code {}, file '{}'".format(response.status_code,
+                                                                                       local_file)
+            self.get_logger().debug(msg)
+            raise AppConfigException(msg)
diff --git a/pypgatk/commands/cbioportal_downloader.py b/pypgatk/commands/cbioportal_downloader.py
index 36746f3..4e24eea 100644
--- a/pypgatk/commands/cbioportal_downloader.py
+++ b/pypgatk/commands/cbioportal_downloader.py
@@ -8,36 +8,38 @@
 this_dir, this_filename =
os.path.split(__file__)

+
 @click.command('cbioportal-downloader', short_help='Command to download the cbioportal studies')
-@click.option('--config_file', '-c', help='Configuration file for the ensembl data downloader pipeline',
-              default=this_dir + '../config/cbioportal_config.yaml')
-@click.option('--output_directory', '-o', help='Output directory for the peptide databases',
+@click.option('-c', '--config_file', help='Configuration file for the ensembl data downloader pipeline',
+              default=this_dir + '/../config/cbioportal_config.yaml')
+@click.option('-o', '--output_directory', help='Output directory for the peptide databases',
               default="./database_cbioportal/")
-@click.option('--list_studies', '-l',
+@click.option('-l', '--list_studies',
               help='Print the list of all the studies in cBioPortal (https://www.cbioportal.org)', is_flag=True)
-@click.option('--download_study', '-d',
+@click.option('-d', '--download_study',
              help="Download a specific Study from cBioPortal -- (all to download all studies)")
-@click.option('--multithreading', '-th', help=' Enable multithreading to download multiple files ad the same time', is_flag=True)
+@click.option('-th', '--multithreading', help='Enable multithreading to download multiple files at the same time',
+              is_flag=True)
 @click.pass_context
 def cbioportal_downloader(ctx, config_file, output_directory, list_studies, download_study, multithreading):
-    if config_file is None:
-        msg = "The config file for the pipeline is missing, please provide one "
-        logging.error(msg)
-        raise AppConfigException(msg)
-
-    pipeline_arguments = {}
-    if output_directory is not None:
-        pipeline_arguments[CbioPortalDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory
-    if list_studies:
-        pipeline_arguments[CbioPortalDownloadService.CONFIG_LIST_STUDIES] = list_studies
-    if multithreading is not None:
-        pipeline_arguments[CbioPortalDownloadService.CONFIG_MULTITHREADING] = multithreading
-
-    cbioportal_downloader_service = CbioPortalDownloadService(config_file, pipeline_arguments)
-
-    if list_studies:
-        list_of_studies = cbioportal_downloader_service.get_cancer_studies()
-        print(list_of_studies)
-
-    if download_study is not None:
-        cbioportal_downloader_service.download_study(download_study)
+    if config_file is None:
+        msg = "The config file for the pipeline is missing, please provide one "
+        logging.error(msg)
+        raise AppConfigException(msg)
+
+    pipeline_arguments = {}
+    if output_directory is not None:
+        pipeline_arguments[CbioPortalDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory
+    if list_studies:
+        pipeline_arguments[CbioPortalDownloadService.CONFIG_LIST_STUDIES] = list_studies
+    if multithreading is not None:
+        pipeline_arguments[CbioPortalDownloadService.CONFIG_MULTITHREADING] = multithreading
+
+    cbioportal_downloader_service = CbioPortalDownloadService(config_file, pipeline_arguments)
+
+    if list_studies:
+        list_of_studies = cbioportal_downloader_service.get_cancer_studies()
+        print(list_of_studies)
+
+    if download_study is not None:
+        cbioportal_downloader_service.download_study(download_study)
diff --git a/pypgatk/commands/cbioportal_to_proteindb.py b/pypgatk/commands/cbioportal_to_proteindb.py
index 5713a18..92c0f58 100644
--- a/pypgatk/commands/cbioportal_to_proteindb.py
+++ b/pypgatk/commands/cbioportal_to_proteindb.py
@@ -7,35 +7,36 @@
 this_dir, this_filename = os.path.split(__file__)

+
 @click.command('cbioportal-to-proteindb', short_help='Command to translate cbioportal mutation data into proteindb')
-@click.option('--config_file',
-              '-c',
+@click.option('-c', '--config_file', help='Configuration for cbioportal to proteindb tool', - default= this_dir + '../config/cbioportal_config.yaml') + default=this_dir + '/../config/cbioportal_config.yaml') @click.option('-in', '--input_mutation', help='Cbioportal mutation file') @click.option('-fa', '--input_cds', help='CDS genes from ENSEMBL database') @click.option('-out', '--output_db', help='Protein database including all the mutations') -@click.option('-f', '--filter_column', default='Tumor_Sample_Barcode', +@click.option('-f', '--filter_column', default='Tumor_Sample_Barcode', help='Column in the VCF file to be used for filtering or splitting mutations') -@click.option('-a', '--accepted_values', default='all', +@click.option('-a', '--accepted_values', default='all', help='Limit mutations to values (tissue type, sample name, etc) considered for generating proteinDBs, by default mutations from all records are considered') -@click.option('-s', '--split_by_filter_column', - help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False', is_flag=True) -@click.option('-cl', '--clinical_sample_file', help='File to get tissue type from for the samples in input_mutation file') - +@click.option('-s', '--split_by_filter_column', + help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False', + is_flag=True) +@click.option('-cl', '--clinical_sample_file', + help='File to get tissue type from for the samples in input_mutation file') @click.pass_context -def cbioportal_to_proteindb(ctx, config_file, input_mutation, input_cds, output_db, +def cbioportal_to_proteindb(ctx, config_file, input_mutation, input_cds, output_db, clinical_sample_file, filter_column, accepted_values, split_by_filter_column): - if input_mutation is None or input_cds is None or output_db is None: - print_help() + if input_mutation is None or input_cds is None or output_db is None: + print_help() - pipeline_arguments = {CancerGenomesService.CONFIG_CANCER_GENOMES_MUTATION_FILE: input_mutation, - CancerGenomesService.CONFIG_COMPLETE_GENES_FILE: input_cds, - CancerGenomesService.CONFIG_OUTPUT_FILE: output_db, - CancerGenomesService.CLINICAL_SAMPLE_FILE: clinical_sample_file, - CancerGenomesService.FILTER_COLUMN: filter_column, - CancerGenomesService.ACCEPTED_VALUES: accepted_values, - CancerGenomesService.SPLIT_BY_FILTER_COLUMN: split_by_filter_column} + pipeline_arguments = {CancerGenomesService.CONFIG_CANCER_GENOMES_MUTATION_FILE: input_mutation, + CancerGenomesService.CONFIG_COMPLETE_GENES_FILE: input_cds, + CancerGenomesService.CONFIG_OUTPUT_FILE: output_db, + CancerGenomesService.CLINICAL_SAMPLE_FILE: clinical_sample_file, + CancerGenomesService.FILTER_COLUMN: filter_column, + CancerGenomesService.ACCEPTED_VALUES: accepted_values, + CancerGenomesService.SPLIT_BY_FILTER_COLUMN: split_by_filter_column} - cosmic_to_proteindb_service = CancerGenomesService(config_file, pipeline_arguments) - cosmic_to_proteindb_service.cbioportal_to_proteindb() + cosmic_to_proteindb_service = CancerGenomesService(config_file, pipeline_arguments) + cosmic_to_proteindb_service.cbioportal_to_proteindb() diff --git a/pypgatk/commands/cosmic_downloader.py b/pypgatk/commands/cosmic_downloader.py index bd62c8d..80ed7ff 100644 --- a/pypgatk/commands/cosmic_downloader.py +++ b/pypgatk/commands/cosmic_downloader.py @@ -10,34 +10,32 @@ @click.command('cosmic-downloader', short_help='Command to download the cosmic mutation database') -@click.option('--config_file', - 
'-c', +@click.option('-c', '--config_file', help='Configuration file for the ensembl data downloader pipeline', - default=this_dir + '../config/cosmic_config.yaml') -@click.option('--output_directory', - '-o', + default=this_dir + '/../config/cosmic_config.yaml') +@click.option('-o', '--output_directory', help='Output directory for the peptide databases', default="./database_cosmic/") -@click.option('--username', '-u', +@click.option('-u', '--username', help="Username for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") -@click.option('--password', '-p', +@click.option('-p', '--password', help="Password for cosmic database -- please if you don't have one register here (https://cancer.sanger.ac.uk/cosmic/register)") @click.pass_context def cosmic_downloader(ctx, config_file, output_directory, username, password): - if config_file is None: - msg = "The config file for the pipeline is missing, please provide one " - logging.error(msg) - raise AppConfigException(msg) + if config_file is None: + msg = "The config file for the pipeline is missing, please provide one " + logging.error(msg) + raise AppConfigException(msg) - pipeline_arguments = {} - if output_directory is not None: - pipeline_arguments[CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory - if username is not None: - pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_USER] = username + pipeline_arguments = {} + if output_directory is not None: + pipeline_arguments[CosmicDownloadService.CONFIG_OUTPUT_DIRECTORY] = output_directory + if username is not None: + pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_USER] = username - if password is not None: - pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD] = password + if password is not None: + pipeline_arguments[CosmicDownloadService.CONFIG_COSMIC_FTP_PASSWORD] = password - cosmic_downloader_service = CosmicDownloadService(config_file, pipeline_arguments) + cosmic_downloader_service = CosmicDownloadService(config_file, pipeline_arguments) - cosmic_downloader_service.download_mutation_file() + cosmic_downloader_service.download_mutation_file() diff --git a/pypgatk/commands/cosmic_to_proteindb.py b/pypgatk/commands/cosmic_to_proteindb.py index 4426ce9..db8f3f7 100644 --- a/pypgatk/commands/cosmic_to_proteindb.py +++ b/pypgatk/commands/cosmic_to_proteindb.py @@ -11,30 +11,30 @@ @click.command('cosmic-to-proteindb', short_help='Command to translate Cosmic mutation data into proteindb') @click.option('-c', '--config_file', help='Configuration file for the cosmic data pipelines', - default=this_dir + '../config/cosmic_config.yaml') + default=this_dir + '/../config/cosmic_config.yaml') @click.option('-in', '--input_mutation', help='Cosmic Mutation data file') @click.option('-fa', '--input_genes', help='All Cosmic genes') -@click.option('-out', '--output_db', +@click.option('-out', '--output_db', help='Protein database including all the mutations') -@click.option('-f', '--filter_column', default='Primary site', +@click.option('-f', '--filter_column', default='Primary site', help='Column in the VCF file to be used for filtering or splitting mutations') -@click.option('-a', '--accepted_values', default='all', +@click.option('-a', '--accepted_values', default='all', help='Limit mutations to values (tissue type, sample name, etc) considered for generating proteinDBs, by default mutations from all records are considered') -@click.option('-s', '--split_by_filter_column', - help='Use this 
flag to generate a proteinDB per group as specified in the filter_column, default is False', is_flag=True) - +@click.option('-s', '--split_by_filter_column', + help='Use this flag to generate a proteinDB per group as specified in the filter_column, default is False', + is_flag=True) @click.pass_context -def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db, +def cosmic_to_proteindb(ctx, config_file, input_mutation, input_genes, output_db, filter_column, accepted_values, split_by_filter_column): - if input_mutation is None or input_genes is None or output_db is None: - print_help() - - pipeline_arguments = {CancerGenomesService.CONFIG_CANCER_GENOMES_MUTATION_FILE: input_mutation, - CancerGenomesService.CONFIG_COMPLETE_GENES_FILE: input_genes, - CancerGenomesService.CONFIG_OUTPUT_FILE: output_db, - CancerGenomesService.FILTER_COLUMN: filter_column, - CancerGenomesService.ACCEPTED_VALUES: accepted_values, - CancerGenomesService.SPLIT_BY_FILTER_COLUMN: split_by_filter_column} - - cosmic_to_proteindb_service = CancerGenomesService(config_file, pipeline_arguments) - cosmic_to_proteindb_service.cosmic_to_proteindb() + if input_mutation is None or input_genes is None or output_db is None: + print_help() + + pipeline_arguments = {CancerGenomesService.CONFIG_CANCER_GENOMES_MUTATION_FILE: input_mutation, + CancerGenomesService.CONFIG_COMPLETE_GENES_FILE: input_genes, + CancerGenomesService.CONFIG_OUTPUT_FILE: output_db, + CancerGenomesService.FILTER_COLUMN: filter_column, + CancerGenomesService.ACCEPTED_VALUES: accepted_values, + CancerGenomesService.SPLIT_BY_FILTER_COLUMN: split_by_filter_column} + + cosmic_to_proteindb_service = CancerGenomesService(config_file, pipeline_arguments) + cosmic_to_proteindb_service.cosmic_to_proteindb() diff --git a/pypgatk/commands/dnaseq_to_proteindb.py b/pypgatk/commands/dnaseq_to_proteindb.py index c896f6b..8ffbf26 100644 --- a/pypgatk/commands/dnaseq_to_proteindb.py +++ b/pypgatk/commands/dnaseq_to_proteindb.py @@ -9,8 +9,8 @@ @click.command("dnaseq-to-proteindb", short_help="Generate peptides based on DNA sequences") -@click.option('--config_file', '-c', help='Configuration to perform conversion between ENSEMBL Files', - default= this_dir + '../config/ensembl_config.yaml') +@click.option('-c', '--config_file', help='Configuration to perform conversion between ENSEMBL Files', + default= this_dir + '/../config/ensembl_config.yaml') @click.option('--input_fasta', help='Path to sequences fasta') @click.option('--translation_table', default=1, type=int, help='Translation Table (default 1)') @click.option('--num_orfs', default=3, type=int, help='Number of ORFs (default 0)') @@ -38,18 +38,16 @@ def dnaseq_to_proteindb(ctx, config_file, input_fasta, translation_table, num_or if input_fasta is None: print_help() - pipeline_arguments = {} - pipeline_arguments[EnsemblDataService.TRANSLATION_TABLE] = translation_table - pipeline_arguments[EnsemblDataService.PROTEIN_DB_OUTPUT] = output_proteindb - pipeline_arguments[EnsemblDataService.HEADER_VAR_PREFIX] = var_prefix - pipeline_arguments[EnsemblDataService.EXCLUDE_BIOTYPES] = exclude_biotypes - pipeline_arguments[EnsemblDataService.SKIP_INCLUDING_ALL_CDS] = skip_including_all_cds - pipeline_arguments[EnsemblDataService.INCLUDE_BIOTYPES] = include_biotypes - pipeline_arguments[EnsemblDataService.BIOTYPE_STR] = biotype_str - pipeline_arguments[EnsemblDataService.NUM_ORFS] = num_orfs - pipeline_arguments[EnsemblDataService.NUM_ORFS_COMPLEMENT] = num_orfs_complement - 
pipeline_arguments[EnsemblDataService.EXPRESSION_STR] = expression_str
-    pipeline_arguments[EnsemblDataService.EXPRESSION_THRESH] = expression_thresh
+    pipeline_arguments = {EnsemblDataService.TRANSLATION_TABLE: translation_table,
+                          EnsemblDataService.PROTEIN_DB_OUTPUT: output_proteindb,
+                          EnsemblDataService.HEADER_VAR_PREFIX: var_prefix,
+                          EnsemblDataService.EXCLUDE_BIOTYPES: exclude_biotypes,
+                          EnsemblDataService.SKIP_INCLUDING_ALL_CDS: skip_including_all_cds,
+                          EnsemblDataService.INCLUDE_BIOTYPES: include_biotypes,
+                          EnsemblDataService.BIOTYPE_STR: biotype_str, EnsemblDataService.NUM_ORFS: num_orfs,
+                          EnsemblDataService.NUM_ORFS_COMPLEMENT: num_orfs_complement,
+                          EnsemblDataService.EXPRESSION_STR: expression_str,
+                          EnsemblDataService.EXPRESSION_THRESH: expression_thresh}

     ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
     ensembl_data_service.dnaseq_to_proteindb(input_fasta)
diff --git a/pypgatk/commands/ensembl_database.py b/pypgatk/commands/ensembl_database.py
index 97d4f0b..18767e1 100644
--- a/pypgatk/commands/ensembl_database.py
+++ b/pypgatk/commands/ensembl_database.py
@@ -9,18 +9,19 @@
 this_dir, this_filename = os.path.split(__file__)

 @click.command('ensembl-check', short_help="Command to check ensembl database for stop codons, gaps")
-@click.option('--config_file',
-              '-c',
-              help='Configuration to perform Ensembl database cehck',
+@click.option('-c', '--config_file',
+              help='Configuration to perform Ensembl database check',
               default= this_dir + '/../config/ensembl_config.yaml')
-@click.option('-in', '--input_fasta', help='input_fasta file to perform the translation')
-@click.option('-out', '--output', help='Output File', default="peptide-database.fa")
+@click.option('-in', '--input_fasta', help='input_fasta file to perform the translation')
+@click.option('-out', '--output', help='Output File', default="peptide-database.fa")
+@click.option('-adds', '--add_stop_codons', help='If a stop codon is found, add a new protein with suffix (_Codon_{num})', is_flag=True)
+@click.option('-aa', '--num_aa', help='Minimum number of amino acids for a protein to be included in the database', default=6)
 @click.pass_context
-def ensembl_check(ctx, config_file, input_fasta,output):
+def ensembl_check(ctx, config_file, input_fasta, output, add_stop_codons, num_aa):
     if input_fasta is None:
         print_help()

     pipeline_arguments = {EnsemblDataService.PROTEIN_DB_OUTPUT: output}
     ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments)
-    ensembl_data_service.check_proteindb(input_fasta)
+    ensembl_data_service.check_proteindb(input_fasta, add_stop_codons, num_aa)
diff --git a/pypgatk/commands/ensembl_downloader.py b/pypgatk/commands/ensembl_downloader.py
index d2d49a4..dab71c8 100644
--- a/pypgatk/commands/ensembl_downloader.py
+++ b/pypgatk/commands/ensembl_downloader.py
@@ -9,32 +9,30 @@
 this_dir, this_filename = os.path.split(__file__)

 @click.command('ensembl-downloader', short_help='Command to download the ensembl information')
-@click.option('--config_file',
-              '-c',
+@click.option('-c', '--config_file',
               help='Configuration file for the ensembl data downloader pipeline',
-              default=this_dir + '../config/ensembl_downloader_config.yaml')
-@click.option('--output_directory',
-              '-o',
+              default=this_dir + '/../config/ensembl_downloader_config.yaml')
+@click.option('-o', '--output_directory',
               help='Output directory for the peptide databases',
               default="./database_ensembl/")
-@click.option('--folder_prefix_release',
-              '-fp', help='Output folder prefix to download the data',
+@click.option('-fp', '--folder_prefix_release',
+              help='Output folder prefix to download the data',
               default='release-')
-@click.option('--taxonomy',
-              '-t',
+@click.option('-t', '--taxonomy',
               help='Taxonomy identifiers (comma separated list can be given) that will be used to download the data from Ensembl',
               default='')
-@click.option('--list_taxonomies', '-l',
+@click.option('-l', '--list_taxonomies',
               help='Print the list of all the taxonomies in ENSEMBL (https://www.ensembl.org)', is_flag=True,
               default=False)
-@click.option('--skip_gtf', '-sg', help="Skip the gtf file during the download", is_flag=True)
-@click.option('--skip_protein', '-sp', help="Skip the protein fasta file during download", is_flag=True)
-@click.option('--skip_cds', '-sc', help='Skip the CDS file download', is_flag=True)
-@click.option('--skip_cdna', '-sd', help='Skip the cDNA file download', is_flag=True)
-@click.option('--skip_ncrna', '-sn', help='Skip the ncRNA file download', is_flag=True)
-@click.option('--skip_vcf', '-sv', help='Skip the VCF variant file', is_flag=True)
-@click.option('--ensembl_name', '-en', help='Ensembl name code to download, it can be use instead of taxonomy (e.g. homo_sapiens)', default='')
+@click.option('-sg', '--skip_gtf', help="Skip the gtf file during the download", is_flag=True)
+@click.option('-sp', '--skip_protein', help="Skip the protein fasta file during download", is_flag=True)
+@click.option('-sc', '--skip_cds', help='Skip the CDS file download', is_flag=True)
+@click.option('-sd', '--skip_cdna', help='Skip the cDNA file download', is_flag=True)
+@click.option('-sn', '--skip_ncrna', help='Skip the ncRNA file download', is_flag=True)
+@click.option('-sv', '--skip_vcf', help='Skip the VCF variant file', is_flag=True)
+@click.option('-en', '--ensembl_name', help='Ensembl name code to download, it can be used instead of taxonomy (e.g. homo_sapiens)', default='')
+@click.option('--grch37', help='Download the previous version (GRCh37) of the Ensembl genomes', is_flag=True)
 def ensembl_downloader(config_file, output_directory, folder_prefix_release, taxonomy, list_taxonomies,
-                       skip_gtf, skip_protein, skip_cds, skip_cdna, skip_ncrna, skip_vcf, ensembl_name):
+                       skip_gtf, skip_protein, skip_cds, skip_cdna, skip_ncrna, skip_vcf, ensembl_name, grch37=False):
     """ This tool enables downloading the FASTA and GTF files from the Ensembl FTP"""

     if config_file is None:
@@ -89,6 +87,6 @@
         for taxonomy_info in list_of_taxonomies:
             print(taxonomy_info)

-    ensembl_download_service.download_database_by_species()
+    ensembl_download_service.download_database_by_species(grch37)

     logger.info("Pipeline finished !!!")
diff --git a/pypgatk/commands/proteindb_decoy.py b/pypgatk/commands/proteindb_decoy.py
index 54620c5..4fd21fd 100644
--- a/pypgatk/commands/proteindb_decoy.py
+++ b/pypgatk/commands/proteindb_decoy.py
@@ -12,20 +12,20 @@
 @click.command('generate-decoy', short_help='Create decoy protein sequences. Each protein '
                                             'is reversed and the cleavage sites switched with preceding amino acid. Peptides are checked for existence in target sequences; if found '
                                             'the tool will attempt to shuffle them.
James.Wright@sanger.ac.uk 2015') -@click.option('--config_file', '-c', help='Configuration file for the protein database decoy generation', default= this_dir + '../config/protein_decoy.yaml') -@click.option('--output', '-o', help='Output file for decoy database', default="protein-decoy.fa") -@click.option('--input', '-i', help='FASTA file of target proteins sequences for which to create decoys (*.fasta|*.fa)') -@click.option('--cleavage_sites', '-s', default='KR', help='A list of amino acids at which to cleave during digestion. Default = KR') -@click.option('--anti_cleavage_sites', '-a', help='A list of amino acids at which not to cleave if following cleavage site ie. Proline. Default = none') -@click.option('--cleavage_position', '-p', default='c', help='Set cleavage to be c or n terminal of specified cleavage sites. Options [c, n], Default = c') -@click.option('--min_peptide_length', '-l', type=int, help='Set minimum length of peptides to compare between target and decoy. Default = 5') -@click.option('--max_iterations', '-n', type=int, help='Set maximum number of times to shuffle a peptide to make it non-target before failing. Default=100') -@click.option('--do_not_shuffle', '-x', help='Turn OFF shuffling of decoy peptides that are in the target database. Default=false') -@click.option('--do_not_switch', '-w', help='Turn OFF switching of cleavage site with preceding amino acid. Default=false') -@click.option('--decoy_prefix', '-d', help='Set accession prefix for decoy proteins in output. Default=DECOY_') -@click.option('--temp_file', '-t', help='Set temporary file to write decoys prior to shuffling. Default=protein-decoy.fa') -@click.option('--no_isobaric', '-b', help='Do not make decoy peptides isobaric. Default=false') -@click.option('--memory_save', '-m', help='Slower but uses less memory (does not store decoy peptide list). Default=false') +@click.option('-c', '--config_file', help='Configuration file for the protein database decoy generation', default= this_dir + '/../config/protein_decoy.yaml') +@click.option('-o', '--output', help='Output file for decoy database', default="protein-decoy.fa") +@click.option( '-i', '--input', help='FASTA file of target proteins sequences for which to create decoys (*.fasta|*.fa)') +@click.option('-s', '--cleavage_sites', default='KR', help='A list of amino acids at which to cleave during digestion. Default = KR') +@click.option('-a', '--anti_cleavage_sites', help='A list of amino acids at which not to cleave if following cleavage site ie. Proline. Default = none') +@click.option('-p', '--cleavage_position', default='c', help='Set cleavage to be c or n terminal of specified cleavage sites. Options [c, n], Default = c') +@click.option('-l', '--min_peptide_length', type=int, help='Set minimum length of peptides to compare between target and decoy. Default = 5') +@click.option('-n', '--max_iterations', type=int, help='Set maximum number of times to shuffle a peptide to make it non-target before failing. Default=100') +@click.option('-x', '--do_not_shuffle', help='Turn OFF shuffling of decoy peptides that are in the target database. Default=false') +@click.option('-w', '--do_not_switch', help='Turn OFF switching of cleavage site with preceding amino acid. Default=false') +@click.option('-d', '--decoy_prefix', help='Set accession prefix for decoy proteins in output. Default=DECOY_') +@click.option('-t', '--temp_file', help='Set temporary file to write decoys prior to shuffling. 
Default=protein-decoy.fa') +@click.option('-b', '--no_isobaric', help='Do not make decoy peptides isobaric. Default=false') +@click.option( '-m', '--memory_save', help='Slower but uses less memory (does not store decoy peptide list). Default=false') @click.pass_context def generate_database(ctx, config_file, output, input, cleavage_sites, anti_cleavage_sites, cleavage_position, min_peptide_length, max_iterations, do_not_shuffle, do_not_switch, decoy_prefix, temp_file, no_isobaric, memory_save): diff --git a/pypgatk/commands/threeframe_translation.py b/pypgatk/commands/threeframe_translation.py index 36a52b3..44a1780 100644 --- a/pypgatk/commands/threeframe_translation.py +++ b/pypgatk/commands/threeframe_translation.py @@ -8,12 +8,10 @@ this_dir, this_filename = os.path.split(__file__) - @click.command('threeframe-translation', short_help="Command to perform 3'frame translation") -@click.option('--config_file', - '-c', +@click.option('-c', '--config_file', help='Configuration to perform conversion between ENSEMBL Files', - default= this_dir + '../config/ensembl_config.yaml') + default= this_dir + '/../config/ensembl_config.yaml') @click.option('-in', '--input_fasta', help='input_fasta file to perform the translation') @click.option('-t', '--translation_table', help='Translation table default value 1', default='1') @click.option('-out', '--output', help='Output File', default="peptide-database.fa") @@ -21,7 +19,6 @@ def threeframe_translation(ctx, config_file, input_fasta, translation_table, output): if input_fasta is None: print_help() - pipeline_arguments = {EnsemblDataService.TRANSLATION_TABLE: translation_table, EnsemblDataService.PROTEIN_DB_OUTPUT: output} diff --git a/pypgatk/commands/vcf_to_proteindb.py b/pypgatk/commands/vcf_to_proteindb.py index 77b73aa..51830d7 100644 --- a/pypgatk/commands/vcf_to_proteindb.py +++ b/pypgatk/commands/vcf_to_proteindb.py @@ -9,8 +9,8 @@ @click.command('vcf-to-proteindb', short_help="Generate peptides based on DNA variants from ENSEMBL VEP VCF files") -@click.option('--config_file', '-c', help='Configuration to perform conversion between ENSEMBL Files', - default= this_dir + '../config/ensembl_config.yaml') +@click.option('-c', '--config_file', help='Configuration to perform conversion between ENSEMBL Files', + default= this_dir + '/../config/ensembl_config.yaml') @click.option('-f', '--input_fasta', help='Path to the transcript sequence') @click.option('-v', '--vep_annotated_vcf', help='Path to the vep annotated VCF file') @click.option('-g', '--gene_annotations_gtf', help='Path to the gene annotations file') @@ -52,25 +52,23 @@ def vcf_to_proteindb(ctx, config_file, input_fasta, vep_annotated_vcf, gene_anno if input_fasta is None or vep_annotated_vcf is None or gene_annotations_gtf is None: print_help() - pipeline_arguments = {} - pipeline_arguments[EnsemblDataService.MITO_TRANSLATION_TABLE] = mito_translation_table - pipeline_arguments[EnsemblDataService.TRANSLATION_TABLE] = translation_table - pipeline_arguments[EnsemblDataService.HEADER_VAR_PREFIX] = var_prefix - pipeline_arguments[EnsemblDataService.REPORT_REFERENCE_SEQ] = report_ref_seq - pipeline_arguments[EnsemblDataService.PROTEIN_DB_OUTPUT] = output_proteindb - pipeline_arguments[EnsemblDataService.ANNOTATION_FIELD_NAME] = annotation_field_name - pipeline_arguments[EnsemblDataService.AF_FIELD] = af_field - pipeline_arguments[EnsemblDataService.AF_THRESHOLD] = af_threshold - pipeline_arguments[EnsemblDataService.TRANSCRIPT_INDEX] = transcript_index - 
pipeline_arguments[EnsemblDataService.CONSEQUENCE_INDEX] = consequence_index - pipeline_arguments[EnsemblDataService.EXCLUDE_BIOTYPES] = exclude_biotypes - pipeline_arguments[EnsemblDataService.EXCLUDE_CONSEQUENCES] = exclude_consequences - pipeline_arguments[EnsemblDataService.SKIP_INCLUDING_ALL_CDS] = skip_including_all_cds - pipeline_arguments[EnsemblDataService.INCLUDE_BIOTYPES] = include_biotypes - pipeline_arguments[EnsemblDataService.INCLUDE_CONSEQUENCES] = include_consequences - pipeline_arguments[EnsemblDataService.BIOTYPE_STR] = biotype_str - pipeline_arguments[EnsemblDataService.IGNORE_FILTERS] = ignore_filters - pipeline_arguments[EnsemblDataService.ACCEPTED_FILTERS] = accepted_filters + pipeline_arguments = {EnsemblDataService.MITO_TRANSLATION_TABLE: mito_translation_table, + EnsemblDataService.TRANSLATION_TABLE: translation_table, + EnsemblDataService.HEADER_VAR_PREFIX: var_prefix, + EnsemblDataService.REPORT_REFERENCE_SEQ: report_ref_seq, + EnsemblDataService.PROTEIN_DB_OUTPUT: output_proteindb, + EnsemblDataService.ANNOTATION_FIELD_NAME: annotation_field_name, + EnsemblDataService.AF_FIELD: af_field, EnsemblDataService.AF_THRESHOLD: af_threshold, + EnsemblDataService.TRANSCRIPT_INDEX: transcript_index, + EnsemblDataService.CONSEQUENCE_INDEX: consequence_index, + EnsemblDataService.EXCLUDE_BIOTYPES: exclude_biotypes, + EnsemblDataService.EXCLUDE_CONSEQUENCES: exclude_consequences, + EnsemblDataService.SKIP_INCLUDING_ALL_CDS: skip_including_all_cds, + EnsemblDataService.INCLUDE_BIOTYPES: include_biotypes, + EnsemblDataService.INCLUDE_CONSEQUENCES: include_consequences, + EnsemblDataService.BIOTYPE_STR: biotype_str, + EnsemblDataService.IGNORE_FILTERS: ignore_filters, + EnsemblDataService.ACCEPTED_FILTERS: accepted_filters} ensembl_data_service = EnsemblDataService(config_file, pipeline_arguments) ensembl_data_service.vcf_to_proteindb(vep_annotated_vcf, input_fasta, gene_annotations_gtf) diff --git a/pypgatk/db/digest_mutant_protein.py b/pypgatk/db/digest_mutant_protein.py index d36ca28..61e631a 100644 --- a/pypgatk/db/digest_mutant_protein.py +++ b/pypgatk/db/digest_mutant_protein.py @@ -66,7 +66,7 @@ def TRYPSIN(proseq, miss_cleavage): elif opt == '--miss': cleavageMiss = int(arg) else: - print("Warning! Command-line argument: %s not recognized. Exiting..." % opt); + print("Warning! Command-line argument: %s not recognized. Exiting..." 
% opt) sys.exit() handle1 = SeqIO.parse(fa_file, 'fasta') # canonical protein sequences diff --git a/pypgatk/db/map_peptide2genome.py b/pypgatk/db/map_peptide2genome.py index 40c2011..b274726 100644 --- a/pypgatk/db/map_peptide2genome.py +++ b/pypgatk/db/map_peptide2genome.py @@ -68,7 +68,7 @@ def get_pep_cor(exon_object_list, n1, else: pep_chr_end = exon.end - (n1 - exon.trans_start) - if n2 <= exon.trans_end and n2 >= exon.trans_start: + if exon.trans_end >= n2 >= exon.trans_start: pep_chr = exon.chr pep_strand = exon.strand pep_end_exon = i + 1 @@ -175,7 +175,7 @@ def parse_gtf(infile): exons = feature_dic[enst] except KeyError: non_mapped_pep += 1 - continue; + continue aa_seq = str(seq_dic[ensp].seq) pep_index = aa_seq.index(peptide) @@ -193,11 +193,11 @@ def parse_gtf(infile): if pep_chr_start > pep_chr_end: non_mapped_pep += 1 # print peptide,ensp,enst - continue; + continue if pep_chr_start <= 0: non_mapped_pep += 1 # print peptide,ensp,enst,pep_trans_start,pep_trans_end - continue; + continue # print pep_chr_start,pep_chr_end # print pep_start_exon,pep_end_exon diff --git a/pypgatk/ensembl/data_downloader.py b/pypgatk/ensembl/data_downloader.py index 00aebeb..088939a 100644 --- a/pypgatk/ensembl/data_downloader.py +++ b/pypgatk/ensembl/data_downloader.py @@ -16,326 +16,367 @@ class EnsemblDataDownloadService(ParameterConfiguration): - """ + """ This Service is in charge of grabbing data (download) from Ensembl to a local repository """ - CONFIG_KEY_DATA_DOWNLOADER = 'ensembl_data_downloader' - CONFIG_OUTPUT_DIRECTORY = 'output_directory' - CONFIG_KEY_ENSEMBL_FTP = 'ensembl_ftp' - CONFIG_ENSEMBL_API = 'ensembl_api' - CONFIG_ENSEMBL_API_SERVER = 'server' - CONFIG_ENSEMBL_API_SPECIES = 'species' - CONFIG_KEY_BASE_URL = 'base_url' - CONFIG_KEY_FOLDER_PREFIX_RELEASE = 'folder_prefix_release' - CONFIG_KEY_FOLDER_NAME_FASTA = 'folder_name_fasta' - CONFIG_KEY_FOLDER_NAME_PROTEIN_SEQUENCES = 'folder_name_protein_sequences' - CONFIG_KEY_FOLDER_NAME_GTF = 'folder_name_gtf' - CONFIG_KEY_REWRITE_LOCAL_PATH_ENSEMBL_REPO = 'rewrite_local_path_ensembl_repo' - CONFIG_KEY_ENSEMBL_FILE_NAMES = 'ensembl_file_names' - CONFIG_KEY_PROTEIN_SEQUENCE_FILE = 'protein_sequence_file' - CONFIG_KEY_FILE_TYPE = 'file_type' - CONFIG_KEY_FILE_SUFFIXES = 'file_suffixes' - CONFIG_KEY_FILE_EXTENSION = 'file_extension' - CONFIG_KEY_GTF_FILE = 'gtf_file' - CONFIG_REST_API_TAXON_ID = 'taxon_id' - CONFIG_REST_API_NAME = 'name' - CONFIG_TAXONOMY = 'taxonomy' - CONFIG_ENSEMBL_NAME = 'name' - CONFIG_LIST_TAXONOMIES = 'list_taxonomies' - CONFIG_KEY_SKIP_PROTEIN = 'skip_protein' - CONFIG_KEY_SKIP_GTF = 'skip_gtf' - CONFIG_KEY_SKIP_CDS = 'skip_cds' - CONFIG_KEY_SKIP_CDNA = 'skip_cdna' - CONFIG_KEY_SKIP_NCRNA = 'skip_ncrna' - CONFIG_KEY_SKIP_VCF = 'skip_vcf' - - def __init__(self, config_file, pipeline_arguments): - """ + CONFIG_KEY_DATA_DOWNLOADER = 'ensembl_data_downloader' + CONFIG_OUTPUT_DIRECTORY = 'output_directory' + CONFIG_KEY_ENSEMBL_FTP = 'ensembl_ftp' + CONFIG_ENSEMBL_API = 'ensembl_api' + CONFIG_ENSEMBL_API_SERVER = 'server' + CONFIG_ENSEMBL_API_SPECIES = 'species' + CONFIG_KEY_BASE_URL = 'base_url' + CONFIG_KEY_FOLDER_PREFIX_RELEASE = 'folder_prefix_release' + CONFIG_KEY_FOLDER_NAME_FASTA = 'folder_name_fasta' + CONFIG_KEY_FOLDER_NAME_PROTEIN_SEQUENCES = 'folder_name_protein_sequences' + CONFIG_KEY_FOLDER_NAME_GTF = 'folder_name_gtf' + CONFIG_KEY_REWRITE_LOCAL_PATH_ENSEMBL_REPO = 'rewrite_local_path_ensembl_repo' + CONFIG_KEY_ENSEMBL_FILE_NAMES = 'ensembl_file_names' + CONFIG_KEY_PROTEIN_SEQUENCE_FILE = 
'protein_sequence_file' + CONFIG_KEY_FILE_TYPE = 'file_type' + CONFIG_KEY_FILE_SUFFIXES = 'file_suffixes' + CONFIG_KEY_FILE_EXTENSION = 'file_extension' + CONFIG_KEY_GTF_FILE = 'gtf_file' + CONFIG_REST_API_TAXON_ID = 'taxon_id' + CONFIG_REST_API_NAME = 'name' + CONFIG_TAXONOMY = 'taxonomy' + CONFIG_ENSEMBL_NAME = 'name' + CONFIG_LIST_TAXONOMIES = 'list_taxonomies' + CONFIG_KEY_SKIP_PROTEIN = 'skip_protein' + CONFIG_KEY_SKIP_GTF = 'skip_gtf' + CONFIG_KEY_SKIP_CDS = 'skip_cds' + CONFIG_KEY_SKIP_CDNA = 'skip_cdna' + CONFIG_KEY_SKIP_NCRNA = 'skip_ncrna' + CONFIG_KEY_SKIP_VCF = 'skip_vcf' + + def __init__(self, config_file, pipeline_arguments): + """ Init the class with the specific parameters. :param config_file configuration file :param pipeline_arguments pipelines arguments """ - super(EnsemblDataDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, - pipeline_arguments) - - self._ensembl_species = [] - if self.CONFIG_OUTPUT_DIRECTORY in self.get_pipeline_parameters(): - self._local_path_ensembl = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_DIRECTORY] - else: - self._local_path_ensembl = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ - self.CONFIG_OUTPUT_DIRECTORY] - - self._list_taxonomies = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_LIST_TAXONOMIES] - if self.CONFIG_LIST_TAXONOMIES in self.get_pipeline_parameters(): - self._list_taxonomies = self.get_pipeline_parameters()[self.CONFIG_LIST_TAXONOMIES] - - self.prepare_local_ensembl_repository() - - def get_local_path_root_ensembl_repo(self): - return self._local_path_ensembl - - def prepare_local_ensembl_repository(self): - self.get_logger().debug( - "Preparing local Ensembl repository, root folder - '{}'".format(self.get_local_path_root_ensembl_repo())) - check_create_folders([self.get_local_path_root_ensembl_repo()]) - self.get_logger().debug( - "Local path for Ensembl Release - '{}'".format(self.get_local_path_root_ensembl_repo())) + super(EnsemblDataDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, + pipeline_arguments) - def get_species_from_rest(self): - """ + self._ensembl_species = [] + if self.CONFIG_OUTPUT_DIRECTORY in self.get_pipeline_parameters(): + self._local_path_ensembl = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_DIRECTORY] + else: + self._local_path_ensembl = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][ + self.CONFIG_OUTPUT_DIRECTORY] + + self._list_taxonomies = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_LIST_TAXONOMIES] + if self.CONFIG_LIST_TAXONOMIES in self.get_pipeline_parameters(): + self._list_taxonomies = self.get_pipeline_parameters()[self.CONFIG_LIST_TAXONOMIES] + + self.prepare_local_ensembl_repository() + + def get_local_path_root_ensembl_repo(self): + return self._local_path_ensembl + + def prepare_local_ensembl_repository(self): + self.get_logger().debug( + "Preparing local Ensembl repository, root folder - '{}'".format(self.get_local_path_root_ensembl_repo())) + check_create_folders([self.get_local_path_root_ensembl_repo()]) + self.get_logger().debug( + "Local path for Ensembl Release - '{}'".format(self.get_local_path_root_ensembl_repo())) + + def get_species_from_rest(self): + """ Get the list of species from ENSEMBL rest API. 
:return: """ - server = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_ENSEMBL_API][ - self.CONFIG_ENSEMBL_API_SERVER] - endpoint = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_ENSEMBL_API][ - self.CONFIG_ENSEMBL_API_SPECIES] - species_info = loads(call_api(server + endpoint).text) - self._ensembl_species = species_info['species'] - return self._ensembl_species - - def download_database_by_species(self): - """ + server = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_ENSEMBL_API][ + self.CONFIG_ENSEMBL_API_SERVER] + endpoint = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_ENSEMBL_API][ + self.CONFIG_ENSEMBL_API_SPECIES] + species_info = loads(call_api(server + endpoint).text) + self._ensembl_species = species_info['species'] + return self._ensembl_species + + def download_database_by_species(self, grch37=False): + """ This method takes a list of Taxonomies from the commandline parameters and download the Protein fasta files and the gtf files. :return: """ - self.get_species_from_rest() - species_parameters = self.get_pipeline_parameters()[self.CONFIG_TAXONOMY] - species_list = species_parameters.split(",") - - species_name = self.get_pipeline_parameters()[self.CONFIG_ENSEMBL_NAME] - species_name_parameters = () - if species_name is not None: - species_name_parameters = species_name.split(",") - - total_files = [] - files = [] - if species_list is not None and len(species_list) > 0 and len(species_parameters) > 0: - for species_id in species_list: - for species in self._ensembl_species: - if species_id == species[self.CONFIG_REST_API_TAXON_ID]: - self.get_logger().debug( - "Downloading the data for the specie -- " + species[self.CONFIG_REST_API_TAXON_ID]) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_PROTEIN]: - prot_files = self.get_pep_files(species) - files.extend(prot_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_GTF]: - gtf_files = self.get_gtf_files(species) - files.extend(gtf_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDS]: - cds_files = self.get_cds_files(species) - files.extend(cds_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDNA]: - cdna_files = self.get_cdna_files(species) - files.extend(cdna_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_NCRNA]: - ncrna_files = self.get_ncrna_files(species) - files.extend(ncrna_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_VCF]: - vcf_files = self.get_vcf_files(species) - files.extend(vcf_files) - - total_files.extend(files) - self.get_logger().debug("Files downloaded -- " + ",".join(files)) - total_files.extend(files) - elif species_name_parameters is not None and len(species_name_parameters) > 0: - for species_name in species_name_parameters: - for species in self._ensembl_species: - if species_name == species[self.CONFIG_REST_API_NAME]: - self.get_logger().debug( - "Downloading the data for the specie -- " + species[self.CONFIG_REST_API_NAME]) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_PROTEIN]: - prot_files = self.get_pep_files(species) - files.extend(prot_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_GTF]: - gtf_files = self.get_gtf_files(species) - files.extend(gtf_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDS]: - cds_files = self.get_cds_files(species) - files.extend(cds_files) - if not 
self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDNA]: - cdna_files = self.get_cdna_files(species) - files.extend(cdna_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_NCRNA]: - ncrna_files = self.get_ncrna_files(species) - files.extend(ncrna_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_VCF]: - vcf_files = self.get_vcf_files(species) - files.extend(vcf_files) - - total_files.extend(files) - self.get_logger().debug("Files downloaded -- " + ",".join(files)) - total_files.extend(files) - else: - for species in self._ensembl_species: - self.get_logger().debug( - "Downloading the data for the specie -- " + species[self.CONFIG_REST_API_TAXON_ID]) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_PROTEIN]: - prot_files = self.get_pep_files(species) - files.extend(prot_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_GTF]: - gtf_files = self.get_gtf_files(species) - files.extend(gtf_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDS]: - cds_files = self.get_cds_files(species) - files.extend(cds_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDNA]: - cdna_files = self.get_cdna_files(species) - files.extend(cdna_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_NCRNA]: - ncrna_files = self.get_ncrna_files(species) - files.extend(ncrna_files) - if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_VCF]: - vcf_files = self.get_vcf_files(species) - files.extend(vcf_files) - - total_files.extend(files) - self.get_logger().debug("Files downloaded -- " + ",".join(files)) - total_files.extend(files) - - return total_files - - def get_cds_files(self, species: dict) -> list: - """ + self.get_species_from_rest() + species_parameters = self.get_pipeline_parameters()[self.CONFIG_TAXONOMY] + species_list = species_parameters.split(",") + + species_name = self.get_pipeline_parameters()[self.CONFIG_ENSEMBL_NAME] + species_name_parameters = () + if species_name is not None: + species_name_parameters = species_name.split(",") + + total_files = [] + files = [] + if species_list is not None and len(species_list) > 0 and len(species_parameters) > 0: + for species_id in species_list: + for species in self._ensembl_species: + if species_id == species[self.CONFIG_REST_API_TAXON_ID]: + self.get_logger().debug( + "Downloading the data for the specie -- " + species[self.CONFIG_REST_API_TAXON_ID]) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_PROTEIN]: + prot_files = self.get_pep_files(species, grch37) + files.extend(prot_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_GTF]: + gtf_files = self.get_gtf_files(species, grch37) + files.extend(gtf_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDS]: + cds_files = self.get_cds_files(species, grch37) + files.extend(cds_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDNA]: + cdna_files = self.get_cdna_files(species, grch37) + files.extend(cdna_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_NCRNA]: + ncrna_files = self.get_ncrna_files(species, grch37) + files.extend(ncrna_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_VCF]: + vcf_files = self.get_vcf_files(species) + files.extend(vcf_files) + + total_files.extend(files) + self.get_logger().debug("Files downloaded -- " + ",".join(files)) + total_files.extend(files) + elif species_name_parameters is not None and len(species_name_parameters) > 0: + for species_name in 
species_name_parameters: + for species in self._ensembl_species: + if species_name == species[self.CONFIG_REST_API_NAME]: + self.get_logger().debug( + "Downloading the data for the specie -- " + species[self.CONFIG_REST_API_NAME]) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_PROTEIN]: + prot_files = self.get_pep_files(species) + files.extend(prot_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_GTF]: + gtf_files = self.get_gtf_files(species) + files.extend(gtf_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDS]: + cds_files = self.get_cds_files(species) + files.extend(cds_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDNA]: + cdna_files = self.get_cdna_files(species) + files.extend(cdna_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_NCRNA]: + ncrna_files = self.get_ncrna_files(species) + files.extend(ncrna_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_VCF]: + vcf_files = self.get_vcf_files(species) + files.extend(vcf_files) + + total_files.extend(files) + self.get_logger().debug("Files downloaded -- " + ",".join(files)) + total_files.extend(files) + else: + for species in self._ensembl_species: + self.get_logger().debug( + "Downloading the data for the specie -- " + species[self.CONFIG_REST_API_TAXON_ID]) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_PROTEIN]: + prot_files = self.get_pep_files(species, grch37) + files.extend(prot_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_GTF]: + gtf_files = self.get_gtf_files(species, grch37) + files.extend(gtf_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDS]: + cds_files = self.get_cds_files(species, grch37) + files.extend(cds_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_CDNA]: + cdna_files = self.get_cdna_files(species, grch37) + files.extend(cdna_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_NCRNA]: + ncrna_files = self.get_ncrna_files(species, grch37) + files.extend(ncrna_files) + if not self.get_pipeline_parameters()[self.CONFIG_KEY_SKIP_VCF]: + vcf_files = self.get_vcf_files(species) + files.extend(vcf_files) + + total_files.extend(files) + self.get_logger().debug("Files downloaded -- " + ",".join(files)) + total_files.extend(files) + + return total_files + + def get_cds_files(self, species: dict, grch37=False) -> list: + """ Get the cds files for a specific species object. :return: List of files names. 
""" - files = [] - try: - file_name = '{}.{}.cds.all.fa.gz'.format(species['name'][0].upper() + species['name'][1:], - species['assembly']) - file_url = '{}/release-{}/fasta/{}/cds/{}'.format( - self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ - self.CONFIG_KEY_BASE_URL], - species['release'], species['name'], file_name) - files.append(download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) - except KeyError: - print("No valid info is available species: ", species) - - return files - - def get_cdna_files(self, species: dict) -> list: - """ + files = [] + try: + if grch37: + species['assembly'] = 'GRCh37' + file_name = '{}.{}.cds.all.fa.gz'.format(species['name'][0].upper() + species['name'][1:], + species['assembly']) + file_url = '{}/release-{}/fasta/{}/cds/{}'.format( + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ + self.CONFIG_KEY_BASE_URL], species['release'], species['name'], file_name) + if grch37: + file_url = '{}/grch37/release-{}/fasta/{}/cds/{}'.format( + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ + self.CONFIG_KEY_BASE_URL], species['release'], species['name'], file_name) + files.append( + download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) + except KeyError: + print("No valid info is available species: ", species) + + return files + + def get_cdna_files(self, species: dict, grch37=False) -> list: + """ Get the cds files for a specific species object. :return: List of files names. """ - files = [] - try: - file_name = '{}.{}.cdna.all.fa.gz'.format(species['name'][0].upper() + species['name'][1:], - species['assembly']) - file_url = '{}/release-{}/fasta/{}/cdna/{}'.format( - self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ - self.CONFIG_KEY_BASE_URL], - species['release'], species['name'], file_name) - files.append(download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) - except KeyError: - print("No valid info is available species: ", species) - - return files - - def get_ncrna_files(self, species: dict) -> list: - """ + files = [] + try: + if grch37: + species['assembly'] = 'GRCh37' + file_name = '{}.{}.cdna.all.fa.gz'.format(species['name'][0].upper() + species['name'][1:], + species['assembly']) + file_url = '{}/release-{}/fasta/{}/cdna/{}'.format( + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ + self.CONFIG_KEY_BASE_URL], + species['release'], species['name'], file_name) + if grch37: + file_url = '{}/grch37/release-{}/fasta/{}/cdna/{}'.format( + self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ + self.CONFIG_KEY_BASE_URL], + species['release'], species['name'], file_name) + files.append( + download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) + except KeyError: + print("No valid info is available species: ", species) + + return files + + def get_ncrna_files(self, species: dict, grch37=False) -> list: + """ Get the cds files for a specific species object. :return: List of files names. 
""" - files = [] - try: - file_name = '{}.{}.ncrna.fa.gz'.format(species['name'][0].upper() + species['name'][1:], - species['assembly']) - file_url = '{}/release-{}/fasta/{}/ncrna/{}'.format( - self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ - self.CONFIG_KEY_BASE_URL], - species['release'], species['name'], file_name) - files.append(download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) - except KeyError: - print("No valid info is available species: ", species) - - return files - - def get_pep_files(self, species: dict) -> list: - """ - Get the peptide files for a specific species object. - :return: List of files names. - """ - files = [] - try: - file_name = '{}.{}.pep.all.fa.gz'.format(species['name'][0].upper() + species['name'][1:], - species['assembly']) - file_url = '{}/release-{}/fasta/{}/pep/{}'.format( - self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ - self.CONFIG_KEY_BASE_URL], - species['release'], species['name'], file_name) - files.append(download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) - except KeyError: - print("No valid info is available species: ", species) - - return files - - def get_gtf_files(self, species: dict) -> list: - """ - This method retrieve the gtf files for a specific specie object - :param species: - :return: - """ - """ - Generate GTF file name from the species info and download the GTF file - """ - files = [] - try: - file_name = '{}.{}.{}.gtf.gz'.format(species['name'][0].upper() + species['name'][1:], species['assembly'], - species['release'], ) - file_url = '{}/release-{}/gtf/{}/{}'.format( - self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ - self.CONFIG_KEY_BASE_URL], species['release'], species['name'], file_name) - files.append(download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())) - except KeyError: - self.get_logger().debug("No valid info is available species: ", species) - - return files - - def get_vcf_files(self, species: dict) -> list: - """ - This method retrieve the vcf file for a specific specie object - :param species: - :return: - """ - """ - Generate VCF file name from the species info and download it - ftp://ftp.ensembl.org/pub/release-97/variation/vcf/felis_catus/felis_catus_incl_consequences.vcf.gz - """ - files = [] - try: - file_name = '{}_incl_consequences.vcf.gz'.format(species['name']) - file_url = '{}/release-{}/variation/vcf/{}/'.format( - self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][ - self.CONFIG_KEY_BASE_URL], species['release'], species['name']) - - downloaded_file = download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()) - if downloaded_file is not None: - files.append(downloaded_file) - - elif species['name']=='homo_sapiens': - "for humans the variants are stored per chromosome, so we need to download them all and combine them into one file here" - chrN=1 - file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN) - downloaded_file = download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()) - if downloaded_file is not None: - "if chr1 is downloaded then try all others" - files.append(downloaded_file) - for chrN in range(2, 23):#chr2-22 - file_name = 
'{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN)
-                    files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
-                file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], 'X')
-                files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
-                file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], 'Y')
-                files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
-                file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], 'MT')
-                files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
-
-        except KeyError:
-            self.get_logger().debug("No valid info is available species: ", species)
-
-        return files
-
\ No newline at end of file
+    files = []
+    try:
+      if grch37:
+        species['assembly'] = 'GRCh37'
+
+      file_name = '{}.{}.ncrna.fa.gz'.format(species['name'][0].upper() + species['name'][1:],
+                                             species['assembly'])
+      file_url = '{}/release-{}/fasta/{}/ncrna/{}'.format(
+        self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+          self.CONFIG_KEY_BASE_URL],
+        species['release'], species['name'], file_name)
+      if grch37:
+        file_url = '{}/grch37/release-{}/fasta/{}/ncrna/{}'.format(
+          self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+            self.CONFIG_KEY_BASE_URL],
+          species['release'], species['name'], file_name)
+      files.append(
+        download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
+    except KeyError:
+      print("No valid info is available for species: ", species)
+
+    return files
+
+  def get_pep_files(self, species: dict, grch37=False) -> list:
+    """
+    Get the peptide files for a specific species object.
+    :return: List of file names.
+    """
+    files = []
+    try:
+      # TODO: Would be better to check the assembly version via the API
+      if grch37:
+        species['assembly'] = 'GRCh37'
+      file_name = '{}.{}.pep.all.fa.gz'.format(species['name'][0].upper() + species['name'][1:],
+                                               species['assembly'])
+      file_url = '{}/release-{}/fasta/{}/pep/{}'.format(
+        self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+          self.CONFIG_KEY_BASE_URL],
+        species['release'], species['name'], file_name)
+      if grch37:
+        file_url = '{}/grch37/release-{}/fasta/{}/pep/{}'.format(
+          self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+            self.CONFIG_KEY_BASE_URL],
+          species['release'], species['name'], file_name)
+      files.append(
+        download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
+    except KeyError:
+      print("No valid info is available for species: ", species)
+
+    return files
+
+  def get_gtf_files(self, species: dict, grch37=False) -> list:
+    """
+    This method retrieves the GTF file for a specific species object
+    :param grch37: set to True to download the GRCh37 genome assembly
+    :param species: species to download the file for
+    :return:
+    """
+    files = []
+    try:
+      if grch37:
+        species['assembly'] = 'GRCh37'
+        species['release'] = '87'
+      file_name = '{}.{}.{}.gtf.gz'.format(species['name'][0].upper() + species['name'][1:], species['assembly'],
+                                           species['release'], )
+      file_url = '{}/release-{}/gtf/{}/{}'.format(
+        self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+          self.CONFIG_KEY_BASE_URL], species['release'], species['name'], file_name)
+      if grch37:
+        file_url = '{}/grch37/release-{}/gtf/{}/{}'.format(
+          self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+            self.CONFIG_KEY_BASE_URL], species['release'], species['name'], file_name)
+      files.append(
+        download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger()))
+
+    except KeyError:
+      self.get_logger().debug("No valid info is available for species: ", species)
+
+    return files
+
+  def get_vcf_files(self, species: dict) -> list:
+    """
+    This method retrieves the VCF file for a specific species object
+    :param species:
+    :return:
+    """
+    files = []
+    try:
+      file_name = '{}_incl_consequences.vcf.gz'.format(species['name'])
+      file_url = '{}/release-{}/variation/vcf/{}/'.format(
+        self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
+          self.CONFIG_KEY_BASE_URL], species['release'], species['name'])
+
+      downloaded_file = download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name,
+                                      self.get_logger())
+      if downloaded_file is not None:
+        files.append(downloaded_file)
+
+      elif species['name'] == 'homo_sapiens':
+        # for humans the variants are stored per chromosome, so we need to download them all and combine them into one file here
+        chrN = 1
+        file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN)
+        downloaded_file = download_file(file_url + file_name,
+                                        self.get_local_path_root_ensembl_repo() + '/' + file_name, self.get_logger())
+        if downloaded_file is not None:
+          # if chr1 is downloaded then try all others
+          files.append(downloaded_file)
+          for chrN in range(2, 23):  # chr2-22
+            file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], chrN)
+            files.append(
+              download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name,
+                            self.get_logger()))
+          file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], 'X')
+          files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name,
+                                     self.get_logger()))
+          file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], 'Y')
+          files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name,
+                                     self.get_logger()))
+          file_name = '{}_incl_consequences-chr{}.vcf.gz'.format(species['name'], 'MT')
+          files.append(download_file(file_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name,
+                                     self.get_logger()))
+
+    except KeyError:
+      self.get_logger().debug("No valid info is available for species: ", species)
+
+    return files
diff --git a/pypgatk/ensembl/ensembl.py b/pypgatk/ensembl/ensembl.py
index 1f8561b..edba5d6 100644
--- a/pypgatk/ensembl/ensembl.py
+++ b/pypgatk/ensembl/ensembl.py
@@ -2,281 +2,292 @@
 import vcf
 from Bio import SeqIO
 from Bio.Seq import Seq
-import sys
+
 from pypgatk.toolbox.general import ParameterConfiguration


 class EnsemblDataService(ParameterConfiguration):
-    CONFIG_KEY_VCF = "ensembl_translation"
-    INPUT_FASTA = "input_fasta"
-
TRANSLATION_TABLE = "translation_table" - MITO_TRANSLATION_TABLE = "mito_translation_table" - HEADER_VAR_PREFIX = "var_prefix" - REPORT_REFERENCE_SEQ = "report_ref_seq" - PROTEIN_DB_OUTPUT = "proteindb_output_file" - ANNOTATION_FIELD_NAME = "annotation_field_name" - AF_FIELD = "af_field" - AF_THRESHOLD = "af_threshold" - TRANSCRIPT_INDEX = "transcript_index" - CONSEQUENCE_INDEX = "consequence_index" - EXCLUDE_BIOTYPES = "exclude_biotypes" - EXCLUDE_CONSEQUENCES = "exclude_consequences" - SKIP_INCLUDING_ALL_CDS = "skip_including_all_cds" - INCLUDE_BIOTYPES = "include_biotypes" - INCLUDE_CONSEQUENCES = "include_consequences" - BIOTYPE_STR = "biotype_str" - SKIP_INCLUDING_ALL_CDSS = "skip_including_all_CDSs" - CONFIG_KEY_DATA = "ensembl_translation" - NUM_ORFS = "num_orfs" - NUM_ORFS_COMPLEMENT = "num_orfs_complement" - EXPRESSION_STR = "expression_str" - EXPRESSION_THRESH = "expression_thresh" - IGNORE_FILTERS = "ignore_filters" - ACCEPTED_FILTERS = "accepted_filters" - - def __init__(self, config_file, pipeline_arguments): - """ + CONFIG_KEY_VCF = "ensembl_translation" + INPUT_FASTA = "input_fasta" + TRANSLATION_TABLE = "translation_table" + MITO_TRANSLATION_TABLE = "mito_translation_table" + HEADER_VAR_PREFIX = "var_prefix" + REPORT_REFERENCE_SEQ = "report_ref_seq" + PROTEIN_DB_OUTPUT = "proteindb_output_file" + ANNOTATION_FIELD_NAME = "annotation_field_name" + AF_FIELD = "af_field" + AF_THRESHOLD = "af_threshold" + TRANSCRIPT_INDEX = "transcript_index" + CONSEQUENCE_INDEX = "consequence_index" + EXCLUDE_BIOTYPES = "exclude_biotypes" + EXCLUDE_CONSEQUENCES = "exclude_consequences" + SKIP_INCLUDING_ALL_CDS = "skip_including_all_cds" + INCLUDE_BIOTYPES = "include_biotypes" + INCLUDE_CONSEQUENCES = "include_consequences" + BIOTYPE_STR = "biotype_str" + SKIP_INCLUDING_ALL_CDSS = "skip_including_all_CDSs" + CONFIG_KEY_DATA = "ensembl_translation" + NUM_ORFS = "num_orfs" + NUM_ORFS_COMPLEMENT = "num_orfs_complement" + EXPRESSION_STR = "expression_str" + EXPRESSION_THRESH = "expression_thresh" + IGNORE_FILTERS = "ignore_filters" + ACCEPTED_FILTERS = "accepted_filters" + + def __init__(self, config_file, pipeline_arguments): + """ Init the class with the specific parameters. 
:param config_file configuration file :param pipeline_arguments pipelines arguments """ - super(EnsemblDataService, self).__init__(self.CONFIG_KEY_DATA, config_file, - pipeline_arguments) - - self._proteindb_output = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.PROTEIN_DB_OUTPUT] - if self.PROTEIN_DB_OUTPUT in self.get_pipeline_parameters(): - self._proteindb_output = self.get_pipeline_parameters()[self.PROTEIN_DB_OUTPUT] - - self._translation_table = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.TRANSLATION_TABLE] - if self.TRANSLATION_TABLE in self.get_pipeline_parameters(): - self._translation_table = self.get_pipeline_parameters()[self.TRANSLATION_TABLE] - - self._mito_translation_table = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.MITO_TRANSLATION_TABLE] - if self.MITO_TRANSLATION_TABLE in self.get_pipeline_parameters(): - self._mito_translation_table = self.get_pipeline_parameters()[self.MITO_TRANSLATION_TABLE] - - self._header_var_prefix = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.HEADER_VAR_PREFIX] - if self.HEADER_VAR_PREFIX in self.get_pipeline_parameters(): - self._header_var_prefix = self.get_pipeline_parameters()[self.HEADER_VAR_PREFIX] - - self._report_reference_seq = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.REPORT_REFERENCE_SEQ] - if self.REPORT_REFERENCE_SEQ in self.get_pipeline_parameters(): - self._report_reference_seq = self.get_pipeline_parameters()[self.REPORT_REFERENCE_SEQ] - - self._annotation_field_name = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.ANNOTATION_FIELD_NAME] - if self.ANNOTATION_FIELD_NAME in self.get_pipeline_parameters(): - self._annotation_field_name = self.get_pipeline_parameters()[self.ANNOTATION_FIELD_NAME] - - self._af_field = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.AF_FIELD] - if self.AF_FIELD in self.get_pipeline_parameters(): - self._af_field = self.get_pipeline_parameters()[self.AF_FIELD] - - self._af_threshold = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.AF_THRESHOLD] - if self.AF_THRESHOLD in self.get_pipeline_parameters(): - self._af_threshold = self.get_pipeline_parameters()[self.AF_THRESHOLD] - - self._transcript_index = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.TRANSCRIPT_INDEX] - if self.TRANSCRIPT_INDEX in self.get_pipeline_parameters(): - self._transcript_index = self.get_pipeline_parameters()[self.TRANSCRIPT_INDEX] - - self._consequence_index = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.CONSEQUENCE_INDEX] - if self.CONSEQUENCE_INDEX in self.get_pipeline_parameters(): - self._consequence_index = self.get_pipeline_parameters()[self.CONSEQUENCE_INDEX] - - self._exclude_biotypes = self.get_multiple_options( - self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.EXCLUDE_BIOTYPES]) - if self.EXCLUDE_BIOTYPES in self.get_pipeline_parameters(): - self._exclude_biotypes = self.get_multiple_options(self.get_pipeline_parameters()[self.EXCLUDE_BIOTYPES]) - - self._exclude_consequences = self.get_multiple_options( - self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.EXCLUDE_CONSEQUENCES]) - if self.EXCLUDE_CONSEQUENCES in self.get_pipeline_parameters(): - self._exclude_consequences = self.get_multiple_options( - 
self.get_pipeline_parameters()[self.EXCLUDE_CONSEQUENCES]) - - self._skip_including_all_cds = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.SKIP_INCLUDING_ALL_CDS] - if self.SKIP_INCLUDING_ALL_CDS in self.get_pipeline_parameters(): - self._skip_including_all_cds = self.get_pipeline_parameters()[self.SKIP_INCLUDING_ALL_CDS] - - self._include_biotypes = self.get_multiple_options( - self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.INCLUDE_BIOTYPES]) - if self.INCLUDE_BIOTYPES in self.get_pipeline_parameters(): - self._include_biotypes = self.get_multiple_options(self.get_pipeline_parameters()[self.INCLUDE_BIOTYPES]) - - self._include_consequences = self.get_multiple_options( - self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.INCLUDE_CONSEQUENCES]) - if self.INCLUDE_CONSEQUENCES in self.get_pipeline_parameters(): - self._include_consequences = self.get_multiple_options( - self.get_pipeline_parameters()[self.INCLUDE_CONSEQUENCES]) - - self._biotype_str = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.BIOTYPE_STR] - if self.BIOTYPE_STR in self.get_pipeline_parameters(): - self._biotype_str = self.get_pipeline_parameters()[self.BIOTYPE_STR] - - self._num_orfs = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.NUM_ORFS] - if self.NUM_ORFS in self.get_pipeline_parameters(): - self._num_orfs = self.get_pipeline_parameters()[self.NUM_ORFS] - - self._num_orfs_complement = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.NUM_ORFS_COMPLEMENT] - if self.NUM_ORFS_COMPLEMENT in self.get_pipeline_parameters(): - self._num_orfs_complement = self.get_pipeline_parameters()[self.NUM_ORFS_COMPLEMENT] - - self._expression_str = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.EXPRESSION_STR] - if self.EXPRESSION_STR in self.get_pipeline_parameters(): - self._expression_str = self.get_pipeline_parameters()[self.EXPRESSION_STR] - - self._expression_thresh = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.EXPRESSION_THRESH] - if self.EXPRESSION_THRESH in self.get_pipeline_parameters(): - self._expression_thresh = self.get_pipeline_parameters()[self.EXPRESSION_THRESH] - - self._ignore_filters = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ - self.IGNORE_FILTERS] - if self.IGNORE_FILTERS in self.get_pipeline_parameters(): - self._ignore_filters = self.get_pipeline_parameters()[self.IGNORE_FILTERS] - - self._accepted_filters = self.get_multiple_options( - self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.ACCEPTED_FILTERS]) - if self.ACCEPTED_FILTERS in self.get_pipeline_parameters(): - self._accepted_filters = self.get_multiple_options( - self.get_pipeline_parameters()[self.ACCEPTED_FILTERS]) - - def three_frame_translation(self, input_fasta): - """ + super(EnsemblDataService, self).__init__(self.CONFIG_KEY_DATA, config_file, + pipeline_arguments) + + self._proteindb_output = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.PROTEIN_DB_OUTPUT] + if self.PROTEIN_DB_OUTPUT in self.get_pipeline_parameters(): + self._proteindb_output = self.get_pipeline_parameters()[self.PROTEIN_DB_OUTPUT] + + self._translation_table = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.TRANSLATION_TABLE] + if self.TRANSLATION_TABLE in self.get_pipeline_parameters(): + self._translation_table = 
self.get_pipeline_parameters()[self.TRANSLATION_TABLE] + + self._mito_translation_table = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.MITO_TRANSLATION_TABLE] + if self.MITO_TRANSLATION_TABLE in self.get_pipeline_parameters(): + self._mito_translation_table = self.get_pipeline_parameters()[self.MITO_TRANSLATION_TABLE] + + self._header_var_prefix = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.HEADER_VAR_PREFIX] + if self.HEADER_VAR_PREFIX in self.get_pipeline_parameters(): + self._header_var_prefix = self.get_pipeline_parameters()[self.HEADER_VAR_PREFIX] + + self._report_reference_seq = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.REPORT_REFERENCE_SEQ] + if self.REPORT_REFERENCE_SEQ in self.get_pipeline_parameters(): + self._report_reference_seq = self.get_pipeline_parameters()[self.REPORT_REFERENCE_SEQ] + + self._annotation_field_name = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.ANNOTATION_FIELD_NAME] + if self.ANNOTATION_FIELD_NAME in self.get_pipeline_parameters(): + self._annotation_field_name = self.get_pipeline_parameters()[self.ANNOTATION_FIELD_NAME] + + self._af_field = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.AF_FIELD] + if self.AF_FIELD in self.get_pipeline_parameters(): + self._af_field = self.get_pipeline_parameters()[self.AF_FIELD] + + self._af_threshold = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.AF_THRESHOLD] + if self.AF_THRESHOLD in self.get_pipeline_parameters(): + self._af_threshold = self.get_pipeline_parameters()[self.AF_THRESHOLD] + + self._transcript_index = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.TRANSCRIPT_INDEX] + if self.TRANSCRIPT_INDEX in self.get_pipeline_parameters(): + self._transcript_index = self.get_pipeline_parameters()[self.TRANSCRIPT_INDEX] + + self._consequence_index = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.CONSEQUENCE_INDEX] + if self.CONSEQUENCE_INDEX in self.get_pipeline_parameters(): + self._consequence_index = self.get_pipeline_parameters()[self.CONSEQUENCE_INDEX] + + self._exclude_biotypes = self.get_multiple_options( + self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.EXCLUDE_BIOTYPES]) + if self.EXCLUDE_BIOTYPES in self.get_pipeline_parameters(): + self._exclude_biotypes = self.get_multiple_options(self.get_pipeline_parameters()[self.EXCLUDE_BIOTYPES]) + + self._exclude_consequences = self.get_multiple_options( + self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.EXCLUDE_CONSEQUENCES]) + if self.EXCLUDE_CONSEQUENCES in self.get_pipeline_parameters(): + self._exclude_consequences = self.get_multiple_options( + self.get_pipeline_parameters()[self.EXCLUDE_CONSEQUENCES]) + + self._skip_including_all_cds = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.SKIP_INCLUDING_ALL_CDS] + if self.SKIP_INCLUDING_ALL_CDS in self.get_pipeline_parameters(): + self._skip_including_all_cds = self.get_pipeline_parameters()[self.SKIP_INCLUDING_ALL_CDS] + + self._include_biotypes = self.get_multiple_options( + self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.INCLUDE_BIOTYPES]) + if self.INCLUDE_BIOTYPES in self.get_pipeline_parameters(): + self._include_biotypes = self.get_multiple_options(self.get_pipeline_parameters()[self.INCLUDE_BIOTYPES]) + + 
self._include_consequences = self.get_multiple_options( + self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.INCLUDE_CONSEQUENCES]) + if self.INCLUDE_CONSEQUENCES in self.get_pipeline_parameters(): + self._include_consequences = self.get_multiple_options( + self.get_pipeline_parameters()[self.INCLUDE_CONSEQUENCES]) + + self._biotype_str = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.BIOTYPE_STR] + if self.BIOTYPE_STR in self.get_pipeline_parameters(): + self._biotype_str = self.get_pipeline_parameters()[self.BIOTYPE_STR] + + self._num_orfs = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.NUM_ORFS] + if self.NUM_ORFS in self.get_pipeline_parameters(): + self._num_orfs = self.get_pipeline_parameters()[self.NUM_ORFS] + + self._num_orfs_complement = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.NUM_ORFS_COMPLEMENT] + if self.NUM_ORFS_COMPLEMENT in self.get_pipeline_parameters(): + self._num_orfs_complement = self.get_pipeline_parameters()[self.NUM_ORFS_COMPLEMENT] + + self._expression_str = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.EXPRESSION_STR] + if self.EXPRESSION_STR in self.get_pipeline_parameters(): + self._expression_str = self.get_pipeline_parameters()[self.EXPRESSION_STR] + + self._expression_thresh = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.EXPRESSION_THRESH] + if self.EXPRESSION_THRESH in self.get_pipeline_parameters(): + self._expression_thresh = self.get_pipeline_parameters()[self.EXPRESSION_THRESH] + + self._ignore_filters = self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][ + self.IGNORE_FILTERS] + if self.IGNORE_FILTERS in self.get_pipeline_parameters(): + self._ignore_filters = self.get_pipeline_parameters()[self.IGNORE_FILTERS] + + self._accepted_filters = self.get_multiple_options( + self.get_default_parameters()[self.CONFIG_KEY_DATA][self.CONFIG_KEY_VCF][self.ACCEPTED_FILTERS]) + if self.ACCEPTED_FILTERS in self.get_pipeline_parameters(): + self._accepted_filters = self.get_multiple_options( + self.get_pipeline_parameters()[self.ACCEPTED_FILTERS]) + + def three_frame_translation(self, input_fasta): + """ This function translate a transcriptome into a 3'frame translation protein sequence database :param input_fasta: fasta input file :return: """ - input_handle = open(input_fasta, 'r') - output_handle = open(self._proteindb_output, 'rw') - - for record in SeqIO.parse(input_handle, 'fasta'): - seq = record.seq - RF1 = seq.translate(table=self._translation_table) - RF2 = seq[1::].translate(table=self._translation_table) - RF3 = seq[2::].translate(table=self._translation_table) - - if record.id == "": - print("skip entries without id", record.description) - continue - output_handle.write("%s\n%s\n" % ('>' + record.id + '_RF1', RF1)) - output_handle.write("%s\n%s\n" % ('>' + record.id + '_RF2', RF2)) - output_handle.write("%s\n%s\n" % ('>' + record.id + '_RF3', RF3)) - - @staticmethod - def get_multiple_options(options_str: str): - """ + input_handle = open(input_fasta, 'r') + output_handle = open(self._proteindb_output, 'w') + + for record in SeqIO.parse(input_handle, 'fasta'): + seq = record.seq + RF1 = seq.translate(table=self._translation_table) + RF2 = seq[1::].translate(table=self._translation_table) + RF3 = seq[2::].translate(table=self._translation_table) + + if record.id == "": + print("skip entries without id", record.description) + continue 
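As a minimal, self-contained sketch of the three-frame idea used by `three_frame_translation` above (file names `transcripts.fa` and `proteindb.fa` are hypothetical; the service reads both paths from its configuration instead):

```python
# Sketch: translate each transcript in the three forward reading frames.
# Frames are obtained by shifting the sequence start by 0, 1 and 2 bases;
# table 1 is the standard genetic code.
from Bio import SeqIO

with open('proteindb.fa', 'w') as output_handle:
    for record in SeqIO.parse('transcripts.fa', 'fasta'):
        if record.id == "":
            continue  # skip entries without an id, as the service does
        for frame in range(3):
            protein = record.seq[frame:].translate(table=1)
            output_handle.write('>%s_RF%d\n%s\n' % (record.id, frame + 1, protein))
```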
+ output_handle.write("%s\n%s\n" % ('>' + record.id + '_RF1', RF1)) + output_handle.write("%s\n%s\n" % ('>' + record.id + '_RF2', RF2)) + output_handle.write("%s\n%s\n" % ('>' + record.id + '_RF3', RF3)) + + @staticmethod + def get_multiple_options(options_str: str): + """ This method takes an String like option1, option2, ... and produce and array [option1, option2,... ] :param options_str: :return: Array """ - return list(map(lambda x: x.strip(), options_str.split(","))) + return list(map(lambda x: x.strip(), options_str.split(","))) - @staticmethod - def check_overlap(var_start, var_end, features_info=[[0, 1, 'type']]): - """ + @staticmethod + def check_overlap(var_start, var_end, features_info=None): + """ This function returns true when the variant overlaps any of the features :param var_start: Start location :param var_end: End location - :param features_info: Feature information + :param features_info: Feature information (default = [[0, 1, 'type']]) :return: """ - if var_start == -1: - return True - # check if the var overlaps any of the features - for feature_pos in features_info: - pep_start = feature_pos[0] - pep_end = feature_pos[1] - if var_start <= pep_start <= var_end: # fully contained or partial overlap from the end - return True - elif var_start <= pep_end <= var_end: # partial overlap in the begining - return True - elif pep_start <= var_start and pep_end >= var_end: # fully covered - return True - return False - - @staticmethod - def get_altseq(ref_seq, ref_allele, var_allele, var_pos, strand, features_info, cds_info=[]): - """ - the given sequence in the fasta file represents all exons of the transcript combined. - for protein coding genes, the CDS is specified therefore the sequence position has to be - calculated based on the CDS's positions - However, for non-protein coding genes, the whole sequence is used - """ - alt_seq = "" - if len(cds_info) == 2: - start_coding_index = cds_info[0] - 1 # it should be index not pos - stop_coding_index = cds_info[1] # get end position of the last cds - else: - start_coding_index = 0 - total_len = 0 - for x in features_info: - total_len += x[1] - x[0] + 1 - stop_coding_index = total_len # the features are sorted by end therefroe the end pos of the last item is the last coding nc - - if strand == '-': # ge the correct orientation, because exons are oredered based on their position - ref_seq = ref_seq[ - ::-1] # in order to calculate from the first base of the first feature (sorted by genomic coordinates) - ref_allele = ref_allele.complement() # the reverse will be done on return - var_allele = var_allele.complement() # the reverse will be done on return - - ref_seq = ref_seq[ - start_coding_index:stop_coding_index] # just keep the coding regions (mostly effective in case of protein-coding genes) - nc_index = 0 - if len(ref_allele) == len(var_allele) or ref_allele[0] == var_allele[0]: - for feature in features_info: # for every exon, cds or stop codon - if var_pos in range(feature[0], feature[ - 1] + 1): # get index of the var relative to the position of the overlapping feature in the coding region - var_index_in_cds = nc_index + (var_pos - feature[0]) - # modify the coding reference sequence accoding to the var_allele - c = len(ref_allele) - alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[ - var_index_in_cds + c::] # variant and ref strand?? 
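The signature changes in this hunk (`features_info=None`, `cds_info=None`, and later `feature_types=None`) replace mutable default arguments. Python evaluates a default such as `[[0, 1, 'type']]` once, at function definition time, so the same list object is shared across calls; a small sketch of the pitfall and of the `None`-sentinel idiom the refactor adopts:

```python
# A default list is created once per function definition, not per call,
# so state leaks between calls; the None sentinel restores the intent.
def buggy(item, acc=[]):
    acc.append(item)
    return acc

def fixed(item, acc=None):
    if acc is None:
        acc = []  # fresh list on every call
    acc.append(item)
    return acc

print(buggy(1), buggy(2))  # [1, 2] [1, 2] -- the same list object both times
print(fixed(1), fixed(2))  # [1] [2]
```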
-                    if strand == '-':
-                        return ref_seq[::-1], alt_seq[::-1]
-                    else:
-                        return ref_seq, alt_seq
-
-                nc_index += (feature[1] - feature[0] + 1)
-
-        return ref_seq, alt_seq
-
-    @staticmethod
-    def parse_gtf(gene_annotations_gtf, gtf_db_file):
-        """
+    if features_info is None:
+      features_info = [[0, 1, 'type']]
+    if var_start == -1:
+      return True
+    # check if the var overlaps any of the features
+    for feature_pos in features_info:
+      pep_start = feature_pos[0]
+      pep_end = feature_pos[1]
+      if var_start <= pep_start <= var_end:  # fully contained or partial overlap from the end
+        return True
+      elif var_start <= pep_end <= var_end:  # partial overlap in the beginning
+        return True
+      elif pep_start <= var_start and pep_end >= var_end:  # fully covered
+        return True
+    return False
+
+  @staticmethod
+  def get_altseq(ref_seq, ref_allele, var_allele, var_pos, strand, features_info, cds_info=None):
+    """
+    The given sequence in the fasta file represents all exons of the transcript combined.
+    For protein-coding genes, the CDS is specified, therefore the sequence position has to be
+    calculated based on the CDS's positions.
+    However, for non-protein coding genes, the whole sequence is used.
+    :param ref_seq:
+    :param ref_allele:
+    :param var_allele:
+    :param var_pos:
+    :param strand:
+    :param features_info:
+    :param cds_info:
+    :return:
+    """
+    if cds_info is None:
+      cds_info = []
+    alt_seq = ""
+    if len(cds_info) == 2:
+      start_coding_index = cds_info[0] - 1  # it should be an index, not a position
+      stop_coding_index = cds_info[1]  # get end position of the last cds
+    else:
+      start_coding_index = 0
+      total_len = 0
+      for x in features_info:
+        total_len += x[1] - x[0] + 1
+      stop_coding_index = total_len  # the features are sorted by end, therefore the end pos of the last item is the last coding nucleotide
+
+    if strand == '-':  # get the correct orientation, because exons are ordered based on their position
+      ref_seq = ref_seq[
+                ::-1]  # in order to calculate from the first base of the first feature (sorted by genomic coordinates)
+      ref_allele = ref_allele.complement()  # the reverse will be done on return
+      var_allele = var_allele.complement()  # the reverse will be done on return
+
+    ref_seq = ref_seq[
+              start_coding_index:stop_coding_index]  # just keep the coding regions (mostly effective in case of protein-coding genes)
+    nc_index = 0
+    if len(ref_allele) == len(var_allele) or ref_allele[0] == var_allele[0]:
+      for feature in features_info:  # for every exon, cds or stop codon
+        if var_pos in range(feature[0], feature[
+          1] + 1):  # get index of the var relative to the position of the overlapping feature in the coding region
+          var_index_in_cds = nc_index + (var_pos - feature[0])
+          # modify the coding reference sequence according to the var_allele
+          c = len(ref_allele)
+          alt_seq = ref_seq[0:var_index_in_cds] + var_allele + ref_seq[
+                    var_index_in_cds + c::]  # variant and ref strand??
+          if strand == '-':
+            return ref_seq[::-1], alt_seq[::-1]
+          else:
+            return ref_seq, alt_seq
+
+        nc_index += (feature[1] - feature[0] + 1)
+
+    return ref_seq, alt_seq
+
+  @staticmethod
+  def parse_gtf(gene_annotations_gtf, gtf_db_file):
+    """
 Convert GTF file into a FeatureDB
 :param gene_annotations_gtf:
 :param gtf_db_file:
 :return:
 """
-        try:
-            gffutils.create_db(gene_annotations_gtf, gtf_db_file, merge_strategy="create_unique",
-                               keep_order=True, disable_infer_transcripts=True, disable_infer_genes=True,
-                               verbose=True,
-                               force=False)
-        except:  # already exists
-            print("already exists", gtf_db_file)
-            pass
-
-        db = gffutils.FeatureDB(gtf_db_file)
-        return db
-
-    @staticmethod
-    def get_features(db, feature_id, biotype_str, feature_types=['exon']):
-        """
+    try:
+      gffutils.create_db(gene_annotations_gtf, gtf_db_file, merge_strategy="create_unique",
+                         keep_order=True, disable_infer_transcripts=True, disable_infer_genes=True,
+                         verbose=True,
+                         force=False)
+    except Exception as e:  # already exists
+      print("Database already exists: " + str(e), gtf_db_file)
+
+    db = gffutils.FeatureDB(gtf_db_file)
+    return db
+
+  @staticmethod
+  def get_features(db, feature_id, biotype_str, feature_types=None):
+    """
 Get chr, genomic positions, strand and biotype for feature_id
 also genomic positions for all its elements (exons/cds&start_codon)
 :param db:
@@ -285,25 +296,28 @@ def get_features(db, feature_id, biotype_str, feature_types=['exon']):
 :param feature_types:
 :return:
 """
-        try:
-            feature = db[feature_id]
-        except gffutils.exceptions.FeatureNotFoundError:  #remove version number from the ID
-            try:
-                feature = db[feature_id.split('.')[0]]
-            except gffutils.exceptions.FeatureNotFoundError:
-                print("""Feature {} found in fasta file but not in gtf file. Check that the fasta file and the gtf files match.
-                    A common issue is when the fasta file have chromosome patches but not the gtf""".format(feature_id))
-                return None, None, None, None
-        coding_features = []
-        features = db.children(feature, featuretype=feature_types, order_by='end')
-        for f in features:
-            f_type = f.featuretype
-            coding_features.append([f.start, f.end, f_type])
-        return feature.chrom, feature.strand, coding_features, feature.attributes[biotype_str][0]
-
-    @staticmethod
-    def get_orfs_vcf(ref_seq: str, alt_seq: str, translation_table: int, num_orfs=1):
-        """
+    if feature_types is None:
+      feature_types = ['exon']
+    try:
+      feature = db[feature_id]
+    except gffutils.exceptions.FeatureNotFoundError:  # remove version number from the ID
+      try:
+        feature = db[feature_id.split('.')[0]]
+      except gffutils.exceptions.FeatureNotFoundError:
+        print("""Feature {} found in fasta file but not in gtf file. Check that the fasta file and the gtf files match.
+ A common issue is when the fasta file have chromosome patches but not the gtf""".format( + feature_id)) + return None, None, None, None + coding_features = [] + features = db.children(feature, featuretype=feature_types, order_by='end') + for f in features: + f_type = f.featuretype + coding_features.append([f.start, f.end, f_type]) + return feature.chrom, feature.strand, coding_features, feature.attributes[biotype_str][0] + + @staticmethod + def get_orfs_vcf(ref_seq: str, alt_seq: str, translation_table: int, num_orfs=1): + """ Translate the coding_ref and the coding_alt into ORFs :param ref_seq: :param alt_seq: @@ -312,117 +326,120 @@ def get_orfs_vcf(ref_seq: str, alt_seq: str, translation_table: int, num_orfs=1) :return: """ - ref_orfs = [] - alt_orfs = [] - for n in range(0, num_orfs): - ref_orfs.append(ref_seq[n::].translate(translation_table)) - alt_orfs.append(alt_seq[n::].translate(translation_table)) + ref_orfs = [] + alt_orfs = [] + for n in range(0, num_orfs): + ref_orfs.append(ref_seq[n::].translate(translation_table)) + alt_orfs.append(alt_seq[n::].translate(translation_table)) - return ref_orfs, alt_orfs + return ref_orfs, alt_orfs - @staticmethod - def get_orfs_dna(ref_seq: str, translation_table: int, num_orfs: int, num_orfs_complement: int, to_stop: bool): - """translate the coding_ref into ORFs""" + @staticmethod + def get_orfs_dna(ref_seq: str, translation_table: int, num_orfs: int, num_orfs_complement: int, to_stop: bool): + """translate the coding_ref into ORFs""" - ref_orfs = [] - for n in range(0, num_orfs): - ref_orfs.append(ref_seq[n::].translate(translation_table, to_stop=to_stop)) + ref_orfs = [] + for n in range(0, num_orfs): + ref_orfs.append(ref_seq[n::].translate(translation_table, to_stop=to_stop)) - rev_ref_seq = ref_seq.reverse_complement() - for n in range(0, num_orfs_complement): - ref_orfs.append(rev_ref_seq[n::].translate(translation_table, to_stop=to_stop)) + rev_ref_seq = ref_seq.reverse_complement() + for n in range(0, num_orfs_complement): + ref_orfs.append(rev_ref_seq[n::].translate(translation_table, to_stop=to_stop)) - return ref_orfs + return ref_orfs - def dnaseq_to_proteindb(self, input_fasta): - """ + def dnaseq_to_proteindb(self, input_fasta): + """ translates DNA sequences to protein sequences :param input_fasta: :return: """ - seq_dict = SeqIO.index(input_fasta, "fasta") - - with open(self._proteindb_output, 'w') as prots_fn: - for record_id in seq_dict.keys(): - - ref_seq = seq_dict[ - record_id].seq # get the seq and desc for the record from the fasta of the gtf - desc = str(seq_dict[record_id].description) - - key_values = {} # extract key=value in the desc into a dict - sep = ' ' - if '|' in desc: - sep = '|' - for value in desc.split(sep): - if value.split('=')[0]=='cds' or value.split(':')[0]=='cds': - value.replace('cds', 'CDS') - if '=' in value: - key_values[value.split('=')[0]] = value.split('=')[1] - elif ':' in value: - key_values[value.split(':')[0]] = value.split(':')[1] - elif value.split('=')[0]=='CDS': #when only it is specified to be a CDS, it means the whole sequence to be used - key_values[value.split('=')[0]] = '{}-{}'.format(1, len(ref_seq)) - - feature_biotype = "" - if self._biotype_str: - try: - feature_biotype = key_values[self._biotype_str] - except KeyError: - msg = "Biotype info was not found in the header using {} for record {} {}".format(self._biotype_str, record_id, desc) - self.get_logger().debug(msg) - - # only include features that have the specified biotypes or they have CDSs info - if 'CDS' in 
key_values.keys() and ( - not self._skip_including_all_cds or 'altORFs' in self._include_biotypes): - pass - elif self._biotype_str and (feature_biotype == "" or (feature_biotype in self._exclude_biotypes or - ( - feature_biotype not in self._include_biotypes and self._include_biotypes != [ - 'all']))): - continue - - # check wether to filter on expression and if it passes - if self._expression_str: - try: - if float(key_values[self._expression_str]) < self._expression_thresh: - continue - except KeyError: - msg = "Expression information not found in the fasta header with expression_str: {} for record {} {}".format(self._expression_str, record_id, desc) - self.get_logger().debug(msg) - continue - except TypeError: - msg = "Expression value is not of valid type (float) at record: {} {}".format(record_id, key_values[self._expression_str]) - self.get_logger().debug(msg) - continue - - # translate the whole sequences (3 ORFs) for non CDS sequences and not take alt_ORFs for CDSs - if 'CDS' not in key_values.keys() or ('CDS' in key_values.keys() and - ('altORFs' in self._include_biotypes or - self._include_biotypes == ['all'])): - ref_orfs = self.get_orfs_dna(ref_seq, self._translation_table, self._num_orfs, - self._num_orfs_complement, to_stop=False) - print(self._header_var_prefix) - self.write_output(seq_id=self._header_var_prefix+record_id, desc=desc, seqs=ref_orfs, prots_fn=prots_fn) - - # also allow for direct translation of the CDS, when the cds info exists in the fasta header skip_including_all_cds is false - if 'CDS' in key_values.keys() and not self._skip_including_all_cds: - try: - cds_info = [int(x) for x in key_values['CDS'].split('-')] - ref_seq = ref_seq[cds_info[0] - 1:cds_info[1]] - ref_orfs = self.get_orfs_dna(ref_seq, self._translation_table, 1, 0, to_stop=True) - self.write_output(seq_id=record_id, desc=desc, seqs=ref_orfs, prots_fn=prots_fn) - except (ValueError, IndexError, KeyError): - print("Could not extra cds position from fasta header for: ", record_id, desc) - - return self._proteindb_output - - @staticmethod - def get_key(fasta_header): - return fasta_header.split('|')[0].split(' ')[0] - - def vcf_to_proteindb(self, vcf_file, input_fasta, gene_annotations_gtf): - """ + seq_dict = SeqIO.index(input_fasta, "fasta") + + with open(self._proteindb_output, 'w') as prots_fn: + for record_id in seq_dict.keys(): + + ref_seq = seq_dict[record_id].seq # get the seq and desc for the record from the fasta of the gtf + desc = str(seq_dict[record_id].description) + + key_values = {} # extract key=value in the desc into a dict + sep = ' ' + if '|' in desc: + sep = '|' + for value in desc.split(sep): + if value.split('=')[0] == 'cds' or value.split(':')[0] == 'cds': + value.replace('cds', 'CDS') + if '=' in value: + key_values[value.split('=')[0]] = value.split('=')[1] + elif ':' in value: + key_values[value.split(':')[0]] = value.split(':')[1] + elif value.split('=')[ + 0] == 'CDS': # when only it is specified to be a CDS, it means the whole sequence to be used + key_values[value.split('=')[0]] = '{}-{}'.format(1, len(ref_seq)) + + feature_biotype = "" + if self._biotype_str: + try: + feature_biotype = key_values[self._biotype_str] + except KeyError: + msg = "Biotype info was not found in the header using {} for record {} {}".format(self._biotype_str, + record_id, desc) + self.get_logger().debug(msg) + + # only include features that have the specified biotypes or they have CDSs info + if 'CDS' in key_values.keys() and ( + not self._skip_including_all_cds or 'altORFs' in 
self._include_biotypes):
+          pass
+        elif self._biotype_str and (feature_biotype == "" or (feature_biotype in self._exclude_biotypes or
+                                                              (
+                                                                feature_biotype not in self._include_biotypes and self._include_biotypes != [
+                                                                  'all']))):
+          continue
+
+        # check whether to filter on expression and if it passes
+        if self._expression_str:
+          try:
+            if float(key_values[self._expression_str]) < self._expression_thresh:
+              continue
+          except KeyError:
+            msg = "Expression information not found in the fasta header with expression_str: {} for record {} {}".format(
+              self._expression_str, record_id, desc)
+            self.get_logger().debug(msg)
+            continue
+          except TypeError:
+            msg = "Expression value is not of valid type (float) at record: {} {}".format(record_id, key_values[
+              self._expression_str])
+            self.get_logger().debug(msg)
+            continue
+
+        # translate the whole sequence (3 ORFs) for non-CDS sequences and do not take alt_ORFs for CDSs
+        if 'CDS' not in key_values.keys() or ('CDS' in key_values.keys() and
+                                              ('altORFs' in self._include_biotypes or
+                                               self._include_biotypes == ['all'])):
+          ref_orfs = self.get_orfs_dna(ref_seq, self._translation_table, self._num_orfs,
+                                       self._num_orfs_complement, to_stop=False)
+          print(self._header_var_prefix)
+          self.write_output(seq_id=self._header_var_prefix + record_id, desc=desc, seqs=ref_orfs, prots_fn=prots_fn)
+
+        # also allow direct translation of the CDS, when the cds info exists in the fasta header and skip_including_all_cds is false
+        if 'CDS' in key_values.keys() and not self._skip_including_all_cds:
+          try:
+            cds_info = [int(x) for x in key_values['CDS'].split('-')]
+            ref_seq = ref_seq[cds_info[0] - 1:cds_info[1]]
+            ref_orfs = self.get_orfs_dna(ref_seq, self._translation_table, 1, 0, to_stop=True)
+            self.write_output(seq_id=record_id, desc=desc, seqs=ref_orfs, prots_fn=prots_fn)
+          except (ValueError, IndexError, KeyError):
+            print("Could not extract cds position from fasta header for: ", record_id, desc)
+
+    return self._proteindb_output
+
+  @staticmethod
+  def get_key(fasta_header):
+    return fasta_header.split('|')[0].split(' ')[0]
+
+  def vcf_to_proteindb(self, vcf_file, input_fasta, gene_annotations_gtf):
+    """
 Generate peps for variants by modifying sequences of affected transcripts (VCF - VEP annotated).
 It only considers variants within potential coding regions of the transcript
 (CDSs & stop codons for protein-coding genes, exons for non-protein coding genes).
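`vcf_to_proteindb` (continued in the next hunk) unpacks each VEP-style annotation entry by splitting on `|` and indexing with the configured `transcript_index` and `consequence_index`. A sketch with a hypothetical CSQ entry and hypothetical index values; the real indices depend on how VEP was run and come from the config file or CLI:

```python
# Sketch: unpacking one VEP-style CSQ entry the way vcf_to_proteindb does.
# The field order varies per VEP run, so both indices are configurable;
# the entry and index values below are hypothetical.
csq_entry = 'T|missense_variant|MODERATE|GENE1|ENSG00000000001|Transcript|ENST00000000001'
transcript_index = 6
consequence_index = 1

fields = csq_entry.split('|')
try:
    transcript_id = fields[transcript_index]
    consequence = fields[consequence_index]
except IndexError:
    transcript_id, consequence = None, None  # mirrors the skip-and-log path

print(transcript_id, consequence)  # ENST00000000001 missense_variant
```

Exposing the indices as parameters rather than hard-coding them is what lets the same code consume annotation fields from different VEP configurations.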
+    @staticmethod
+    def get_key(fasta_header):
+        return fasta_header.split('|')[0].split(' ')[0]
+
+    def vcf_to_proteindb(self, vcf_file, input_fasta, gene_annotations_gtf):
+        """
         Generate peps for variants by modifying sequences of affected transcripts (VCF - VEP annotated).
         It only considers variants within potential coding regions of the transcript
         (CDSs & stop codons for protein-coding genes, exons for non-protein coding genes).
@@ -432,220 +449,243 @@ def vcf_to_proteindb(vcf_file, input_fasta, gene_annotations_gtf):
         :return:
         """
-        db = self.parse_gtf(gene_annotations_gtf, gene_annotations_gtf.replace('.gtf', '.db'))
-
-        transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key)
-        # handle cases where the transcript has version in the GTF but not in the VCF
-        transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()}
-        with open(self._proteindb_output, 'w') as prots_fn:
-            vcf_reader = vcf.Reader(open(vcf_file, 'r'))
-
-            for record in vcf_reader:
-                if record.ALT==[None] or record.REF==[None]:
-                    msg = "Invalid VCF record, skipping: {}".format(record)
-                    self.get_logger().debug(msg)
-                    continue
-                if not self._ignore_filters:
-                    if record.FILTER:  # if not PASS: None and empty means PASS
-                        if not (set(record.FILTER[0].split(',')) <= set(self._accepted_filters)):
-                            continue
-
-                # only process variants above a given allele frequency threshold if the AF string is not empty
-                if self._af_field:
-                    # get AF from the INFO field
-                    try:
-                        af = float(record.INFO[self._af_field])
-                    except TypeError:
-                        af = float(record.INFO[self._af_field][0])
-                    except KeyError:
-                        continue
-
-                    # check if the AF passed the threshold
-                    if af < self._af_threshold:
-                        continue
-
-                trans_table = self._translation_table
-                consequences = []
-                if str(record.CHROM).lstrip('chr').upper() in ['M', 'MT']:
-                    trans_table = self._mito_translation_table
-
-                processed_transcript_allele = []
-
-                for transcript_record in record.INFO[self._annotation_field_name]:
-                    transcript_info = transcript_record.split('|')
-                    try:
-                        consequence = transcript_info[self._consequence_index]
-                    except IndexError:
-                        msg = "Give a valid index for the consequence in the INFO field for: {}".format(transcript_record)
-                        self.get_logger().debug(msg)
-                        continue
-                    consequences.append(consequence)
-                    try:
-                        transcript_id = transcript_info[self._transcript_index]
-                    except IndexError:
-                        msg = "Give a valid index for the Transcript ID in the INFO field for: {}".format(transcript_record)
-                        self.get_logger().debug(msg)
-                        continue
-                    if transcript_id == "":
-                        continue
-
-                    try:
-                        transcript_id_v = transcript_id_mapping[transcript_id]
-                    except KeyError:
-                        transcript_id_v = transcript_id
-
-                    try:
-                        row = transcripts_dict[transcript_id_v]
-                        ref_seq = row.seq  # get the seq and desc for the transcript from the fasta of the gtf
-                        desc = str(row.description)
-                    except KeyError:
-                        msg = "Transcript {} not found in fasta of the GTF file {}".format(transcript_id_v, record)
-                        self.get_logger().debug(msg)
-                        continue
-
-                    feature_types = ['exon']
-                    # check if cds info exists in the fasta header otherwise translate all exons
-                    cds_info = []
-                    num_orfs = 3
-                    if 'CDS=' in desc:
-                        try:
-                            cds_info = [int(x) for x in desc.split(' ')[1].split('=')[1].split('-')]
-                            feature_types = ['CDS', 'stop_codon']
-                            num_orfs = 1
-                        except (ValueError, IndexError):
-                            msg = "Could not extra cds position from fasta header for: {}".format(desc)
-                            self.get_logger().debug(msg)
-                            pass
-                    chrom, strand, features_info, feature_biotype = self.get_features(db, transcript_id_v,
-                                                                                      self._biotype_str,
-                                                                                      feature_types)
-                    if chrom == None: #the record info was not found
-                        continue
-                    # skip transcripts with unwanted consequences
-                    if (consequence in self._exclude_consequences or
-                            (consequence not in self._include_consequences and
-                             self._include_consequences != ['all'])):
-                        continue
-
-                    # only include features that have the specified biotypes or they have CDSs info
-                    if 'CDS' in feature_types and not self._skip_including_all_cds:
-                        pass
-                    elif (feature_biotype in self._exclude_biotypes or
-                          (feature_biotype not in self._include_biotypes and
-                           self._include_biotypes != ['all'])):
-                        continue
-                    for alt in record.ALT:  # in cases of multiple alternative alleles consider all
-                        if alt is None:
-                            continue
-                        if transcript_id + str(record.REF) + str(
-                                alt) in processed_transcript_allele:  # because VEP reports affected transcripts per alt allele
-                            continue
-
-                        processed_transcript_allele.append(transcript_id + str(record.REF) + str(alt))
-                        "for non-CDSs, only consider the exon that actually overlaps the variant"
-
-                        try:
-                            overlap_flag = self.check_overlap(record.POS, record.POS + len(alt), features_info)
-                        except TypeError:
-                            msg = "Wrong VCF record in {}".format(record)
-                            self.get_logger().debug(msg)
-                            continue
-
-                        if (chrom.lstrip("chr") == str(record.CHROM).lstrip("chr") and
-                                overlap_flag):
-                            coding_ref_seq, coding_alt_seq = self.get_altseq(ref_seq, Seq(str(record.REF)),
-                                                                             Seq(str(alt)), int(record.POS), strand,
-                                                                             features_info, cds_info)
-                            if coding_alt_seq != "":
-                                ref_orfs, alt_orfs = self.get_orfs_vcf(coding_ref_seq, coding_alt_seq, trans_table,
-                                                                       num_orfs)
-                                record_id = ""
-                                if record.ID:
-                                    record_id = '_' + str(record.ID)
-                                self.write_output(seq_id='_'.join([self._header_var_prefix + str(record_id),
-                                                                   '.'.join([str(record.CHROM), str(record.POS),
-                                                                             str(record.REF), str(alt)]),
-                                                                   transcript_id_v]),
-                                                  desc=feature_biotype + ":" + consequence,
-                                                  seqs=alt_orfs,
-                                                  prots_fn=prots_fn)
-
-                                if self._report_reference_seq:
-                                    self.write_output(seq_id=transcript_id_v,
-                                                      desc=feature_biotype,
-                                                      seqs=ref_orfs,
-                                                      prots_fn=prots_fn)
-
-        return self._proteindb_output
-
-    def check_proteindb(self, input_fasta : str = None, num_aa: str = 6):
-
-        input_handle = open(input_fasta, 'r')
-        output_handle = open(self._proteindb_output, 'w')
-        proteins = []
-        pcount = 0
-        stop_count = 0
-        gap_count = 0
-        no_met = 0
-        less = 0
-
-        for record in SeqIO.parse(input_handle, 'fasta'):
-
-            seq = str(record.seq)
-            pcount += 1
-
-
-            # parse the description string into a dictionary
-            new_desc_string = record.description
-            new_desc_string = new_desc_string[new_desc_string.find(' ') + 1:]
-            # test for odd amino acids, stop codons, gaps
-            if not seq.startswith('M'):
-                no_met += 1
-            if seq.endswith('*'):
-                seq = seq[:-1]
-            if '*' in seq:
-                stop_count += 1
+        db = self.parse_gtf(gene_annotations_gtf, gene_annotations_gtf.replace('.gtf', '.db'))
+
+        transcripts_dict = SeqIO.index(input_fasta, "fasta", key_function=self.get_key)
+        # handle cases where the transcript has version in the GTF but not in the VCF
+        transcript_id_mapping = {k.split('.')[0]: k for k in transcripts_dict.keys()}
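+        # e.g. a fasta record keyed "ENST00000311936.8" also becomes reachable via
+        # the unversioned id "ENST00000311936" that the VCF annotation may carry.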
+        with open(self._proteindb_output, 'w') as prots_fn:
+            vcf_reader = vcf.Reader(open(vcf_file, 'r'))
+
+            for record in vcf_reader:
+                if record.ALT == [None] or record.REF == [None]:
+                    msg = "Invalid VCF record, skipping: {}".format(record)
+                    self.get_logger().debug(msg)
+                    continue
+                if not self._ignore_filters:
+                    if record.FILTER:  # a non-empty FILTER means not PASS (None or empty means PASS)
+                        if not (set(record.FILTER[0].split(',')) <= set(self._accepted_filters)):
+                            continue
+
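+                # Illustration (hypothetical values): with af_field='AF', an INFO entry
+                # AF=0.003 parses directly via float(); a multi-valued entry such as
+                # AF=[0.003, 0.01] raises TypeError and the first value is used instead.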
+                # only process variants above a given allele frequency threshold if the AF string is not empty
+                if self._af_field:
+                    # get AF from the INFO field
+                    try:
+                        af = float(record.INFO[self._af_field])
+                    except TypeError:
+                        af = float(record.INFO[self._af_field][0])
+                    except KeyError:
+                        continue
+
+                    # check if the AF passed the threshold
+                    if af < self._af_threshold:
+                        continue
+
+                trans_table = self._translation_table
+                consequences = []
+                if str(record.CHROM).lstrip('chr').upper() in ['M', 'MT']:
+                    trans_table = self._mito_translation_table
+
+                processed_transcript_allele = []
+
+                for transcript_record in record.INFO[self._annotation_field_name]:
+                    transcript_info = transcript_record.split('|')
+                    try:
+                        consequence = transcript_info[self._consequence_index]
+                    except IndexError:
+                        msg = "Give a valid index for the consequence in the INFO field for: {}".format(transcript_record)
+                        self.get_logger().debug(msg)
+                        continue
+                    consequences.append(consequence)
+                    try:
+                        transcript_id = transcript_info[self._transcript_index]
+                    except IndexError:
+                        msg = "Give a valid index for the Transcript ID in the INFO field for: {}".format(transcript_record)
+                        self.get_logger().debug(msg)
+                        continue
+                    if transcript_id == "":
+                        continue
+
+                    try:
+                        transcript_id_v = transcript_id_mapping[transcript_id]
+                    except KeyError:
+                        transcript_id_v = transcript_id
+
+                    try:
+                        row = transcripts_dict[transcript_id_v]
+                        ref_seq = row.seq  # get the seq and desc for the transcript from the fasta of the gtf
+                        desc = str(row.description)
+                    except KeyError:
+                        msg = "Transcript {} not found in fasta of the GTF file {}".format(transcript_id_v, record)
+                        self.get_logger().debug(msg)
+                        continue
+
+                    feature_types = ['exon']
+                    # check if cds info exists in the fasta header, otherwise translate all exons
+                    cds_info = []
+                    num_orfs = 3
+                    if 'CDS=' in desc:
+                        try:
+                            cds_info = [int(x) for x in desc.split(' ')[1].split('=')[1].split('-')]
+                            feature_types = ['CDS', 'stop_codon']
+                            num_orfs = 1
+                        except (ValueError, IndexError):
+                            msg = "Could not extract cds position from fasta header for: {}".format(desc)
+                            self.get_logger().debug(msg)
+
+                    chrom, strand, features_info, feature_biotype = self.get_features(db, transcript_id_v,
+                                                                                      self._biotype_str,
+                                                                                      feature_types)
+                    if chrom is None:  # the record info was not found
+                        continue
+                    # skip transcripts with unwanted consequences
+                    if (consequence in self._exclude_consequences or
+                            (consequence not in self._include_consequences and
+                             self._include_consequences != ['all'])):
+                        continue
+
+                    # only include features that have the specified biotypes or that have CDSs info
+                    if 'CDS' in feature_types and not self._skip_including_all_cds:
+                        pass
+                    elif (feature_biotype in self._exclude_biotypes or
+                          (feature_biotype not in self._include_biotypes and
+                           self._include_biotypes != ['all'])):
+                        continue
+                    for alt in record.ALT:  # in cases of multiple alternative alleles consider all
+                        if alt is None:
+                            continue
+                        if transcript_id + str(record.REF) + str(alt) in processed_transcript_allele:
+                            # because VEP reports affected transcripts per alt allele
+                            continue
+
+                        processed_transcript_allele.append(transcript_id + str(record.REF) + str(alt))
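+                        # e.g. (hypothetical) for REF=A, ALT=[G, T] the same transcript is
+                        # annotated once per alt allele; the keys "ENST...AG" and "ENST...AT"
+                        # keep both alleles while exact duplicates are skipped above.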
+                        # for non-CDSs, only consider the exon that actually overlaps the variant
+
+                        try:
+                            overlap_flag = self.check_overlap(record.POS, record.POS + len(alt), features_info)
+                        except TypeError:
+                            msg = "Wrong VCF record in {}".format(record)
+                            self.get_logger().debug(msg)
+                            continue
+
+                        if (chrom.lstrip("chr") == str(record.CHROM).lstrip("chr") and
+                                overlap_flag):
+                            coding_ref_seq, coding_alt_seq = self.get_altseq(ref_seq, Seq(str(record.REF)),
+                                                                             Seq(str(alt)), int(record.POS), strand,
+                                                                             features_info, cds_info)
+                            if coding_alt_seq != "":
+                                ref_orfs, alt_orfs = self.get_orfs_vcf(coding_ref_seq, coding_alt_seq, trans_table,
+                                                                       num_orfs)
+                                record_id = ""
+                                if record.ID:
+                                    record_id = '_' + str(record.ID)
+                                self.write_output(seq_id='_'.join([self._header_var_prefix + str(record_id),
+                                                                   '.'.join([str(record.CHROM), str(record.POS),
+                                                                             str(record.REF), str(alt)]),
+                                                                   transcript_id_v]),
+                                                  desc=feature_biotype + ":" + consequence,
+                                                  seqs=alt_orfs,
+                                                  prots_fn=prots_fn)
+
+                                if self._report_reference_seq:
+                                    self.write_output(seq_id=transcript_id_v,
+                                                      desc=feature_biotype,
+                                                      seqs=ref_orfs,
+                                                      prots_fn=prots_fn)
+
+        return self._proteindb_output
+
+    @staticmethod
+    def add_protein_to_map(seq: str, new_desc_string: str, protein_id: str, proteins, output_handle):
+        protein = {'description': new_desc_string, 'sequence': seq, 'accession': protein_id}
+        proteins.append(protein)
+        output_handle.write(">{}\t{}\n{}\n".format(protein_id, new_desc_string, seq))
+        return proteins
+
+    def check_proteindb(self, input_fasta: str = None, add_stop_codon: bool = False, num_aa: int = 6):
+
+        input_handle = open(input_fasta, 'r')
+        output_handle = open(self._proteindb_output, 'w')
+        proteins = []
+        pcount = 0
+        stop_count = 0
+        gap_count = 0
+        no_met = 0
+        less = 0
+
+        for record in SeqIO.parse(input_handle, 'fasta'):
+
+            seq = str(record.seq)
+            pcount += 1
+
+            # parse the description string into a dictionary
+            new_desc_string = record.description
+            new_desc_string = new_desc_string[new_desc_string.find(' ') + 1:]
+            # test for odd amino acids, stop codons, gaps
+            if not seq.startswith('M'):
+                no_met += 1
+            if seq.endswith('*'):
+                seq = seq[:-1]
+            if '-' in seq:
+                gap_count += 1
+                new_desc_string = new_desc_string + ' (Contains gaps)'
+            if '*' in seq:
+                stop_count += 1
+                if add_stop_codon:
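+                    # Split on internal stop codons and keep each segment as its own entry,
+                    # e.g. (hypothetical) "MKTALLQW*AG*MDEQSTR" gives the segments
+                    # "MKTALLQW", "AG" and "MDEQSTR"; with num_aa=6 the two segments longer
+                    # than 6 aa are written out and the short "AG" is silently dropped.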
+                    seq_list = seq.split("*")
+                    codon_index = 1
+                    for codon in seq_list:
+                        segment_desc = new_desc_string + ' codon ' + str(codon_index)
+                        protein_id = record.id + '_codon_' + str(codon_index)
+                        seq = codon
+                        if len(seq) > num_aa:
+                            proteins = self.add_protein_to_map(seq, segment_desc, protein_id, proteins,
+                                                               output_handle)
+                        codon_index = codon_index + 1
+                else:
                     cut = seq.index('*')
                     string = ' (Premature stop %s/%s)' % (cut, len(seq))
                     new_desc_string = new_desc_string + string
                     seq = seq[:cut]
-                if '-' in seq:
-                    gap_count += 1
-                    new_desc_string = new_desc_string + ' (Contains gaps)'
-
-                # save the protein in list
-
+                    # save the protein in list
+                    if len(seq) > num_aa:
+                        protein_id = record.id
+                        proteins = self.add_protein_to_map(seq, new_desc_string, protein_id, proteins, output_handle)
+                    else:
+                        less += 1
+            else:
                 if len(seq) > num_aa:
-                    protein = {}
-                    protein['description'] = new_desc_string
-                    protein['sequence'] = seq
-                    protein['accession'] = record.id
-                    proteins.append(protein)
-
-                    output_handle.write(">{}\t{}\n{}\n".format(record.id, new_desc_string, seq))
+                    protein_id = record.id
+                    proteins = self.add_protein_to_map(seq, new_desc_string, protein_id, proteins, output_handle)
                 else:
                     less += 1
-        print(" translations that do not start with Met:", no_met)
-        print(" translations that have premature stop codons:", stop_count)
-        print(" translations that contain gaps:", gap_count)
-        print(" total number of input sequences was:", pcount)
-        print(" total number of sequences written was:", len(proteins))
-        print(" total number of proteins less than {} aminoacids: {}".format(num_aa, less))
-
-
-    def write_output(self, seq_id, desc, seqs, prots_fn):
-        """write the orfs to the output file"""
-        write_i = False
-        if len(seqs) > 1:  # only add _num when multiple ORFs are generated (e.g in 3 ORF)
-            write_i = True
-
-        for i, orf in enumerate(seqs):
-            if write_i:  # only add _num when multiple ORFs are generated (e.g in 3 ORF)
-                prots_fn.write('>{} {}\n{}\n'.format(seq_id + "_" + str(i + 1), desc, orf))
-            else:
-                prots_fn.write('>{} {}\n{}\n'.format(seq_id, desc, orf))
+        print(" translations that do not start with Met:", no_met)
+        print(" translations that have premature stop codons:", stop_count)
+        print(" translations that contain gaps:", gap_count)
+        print(" total number of input sequences was:", pcount)
+        print(" total number of sequences written was:", len(proteins))
+        print(" total number of proteins less than {} amino acids: {}".format(num_aa, less))
+
+    @staticmethod
+    def write_output(seq_id, desc, seqs, prots_fn):
+        """
+        write the orfs to the output file
+        :param seq_id: Sequence Accession
+        :param desc: Sequence Description
+        :param seqs: Sequences, one per ORF
+        :param prots_fn: output file handle
+        :return:
+        """
+        write_i = False
+        if len(seqs) > 1:  # only add _num when multiple ORFs are generated (e.g in 3 ORF)
+            write_i = True
+
+        for i, orf in enumerate(seqs):
+            if write_i:  # only add _num when multiple ORFs are generated (e.g in 3 ORF)
+                prots_fn.write('>{} {}\n{}\n'.format(seq_id + "_" + str(i + 1), desc, orf))
+            else:
+                prots_fn.write('>{} {}\n{}\n'.format(seq_id, desc, orf))
 
 if __name__ == '__main__':
-    print("ERROR: This script is part of a pipeline collection and it is not meant to be run in stand alone mode")
+    print("ERROR: This script is part of a pipeline collection and it is not meant to be run in standalone mode")
diff --git a/pypgatk/proteomics/db/decoy_pyrat.py b/pypgatk/proteomics/db/decoy_pyrat.py
index db59a7a..2a975bc 100644
--- a/pypgatk/proteomics/db/decoy_pyrat.py
+++ b/pypgatk/proteomics/db/decoy_pyrat.py
@@ -207,9 +207,9 @@ def decoy_database(self):
         # loop each seq in the file
         for record in fasta:
             seq = str(record.seq)
-            dcount+=1
+            dcount += 1
 
             #make sequence isobaric (check args for switch off)
-            if self._isobaric == False:
+            if not self._isobaric:
                 seq = seq.replace('I', 'L')
 
             #digest sequence add peptides to set
@@ -220,7 +220,7 @@ def decoy_database(self):
             decoyseq = ProteinDBService.revswitch(seq, self._no_switch, self._cleavage_sites)
 
             #do not store decoy peptide set in reduced memory mode
-            if self._memory_save == False:
+            if not self._memory_save:
                 #update decoy peptide set
                 dpeps.update(
                     ProteinDBService.digest(decoyseq, self._cleavage_sites, self._cleavage_position,
                                             self._anti_cleavage_sites, self._peptide_length))
diff --git a/pypgatk/tests/ensembl_tests.py b/pypgatk/tests/ensembl_tests.py
new file mode 100644
index 0000000..171a4ec
--- /dev/null
+++ b/pypgatk/tests/ensembl_tests.py
@@ -0,0 +1,10 @@
+import unittest
+
+
+class MyTestCase(unittest.TestCase):
+    def test_something(self):
+        self.assertEqual(True, True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pypgatk/tests/pypgatk_tests.py b/pypgatk/tests/pypgatk_tests.py
index 800da9b..188c794 100644
--- a/pypgatk/tests/pypgatk_tests.py
+++ b/pypgatk/tests/pypgatk_tests.py
@@ -188,6 +188,18 @@ def download_ensembl_data():
                            '--taxonomy', '9103', '--output_directory', 'testdata'])
     assert result.exit_code == 0
 
+
+def download_ensembl_data_37():
+    """
+    Test downloading ensembl GRCh37 data for the species with taxonomy identifier 9606 (human)
+    :return:
+    """
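+    # '--grch37' is taken to request Ensembl's GRCh37 (hg19) archive, as the flag
+    # name suggests; the other download tests use the current default assembly.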
+    runner = CliRunner()
+    result = runner.invoke(cli,
+                           ['ensembl-downloader', '--config_file', 'config/ensembl_downloader_config.yaml',
+                            '--taxonomy', '9606', '--output_directory', 'testdata', '--grch37'])
+    assert result.exit_code == 0
+
 
 def download_cbioportal_data():
     """
     Test downloading cbioportal data for study id: paac_jhu_2014
@@ -199,6 +211,16 @@ def download_cbioportal_data():
                            '--download_study', 'paac_jhu_2014', '--output_directory', 'testdata'])
     assert result.exit_code == 0
+
+
+def check_ensembl_database():
+    """
+    Test cleaning a protein database generated from an ENSEMBL VCF
+    :return:
+    """
+    runner = CliRunner()
+    result = runner.invoke(cli,
+                           ['ensembl-check', '--config_file', 'config/ensembl_config.yaml',
+                            '--input_fasta', 'testdata/proteindb_from_ENSEMBL_VCF.fa',
+                            '--output', 'testdata/proteindb_from_ENSEMBL_VCF-clean.fa',
+                            '--add_stop_codons', '--num_aa', '6'])
+    assert result.exit_code == 0
+
+
 if __name__ == '__main__':
     vcf_to_proteindb()
     vcf_gnomad_to_proteindb()
@@ -212,4 +234,6 @@ def download_cbioportal_data():
     generate_decoy_database()
     cosmic_to_proteindb()
     download_ensembl_data()
+    download_ensembl_data_37()
     download_cbioportal_data()
+    check_ensembl_database()
diff --git a/setup.py b/setup.py
index 9039042..503f92c 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ def readme():
         return f.read()
 
 setup(name='pypgatk',
-      version='0.0.8',
+      version='0.0.9',
      description='Python tools for proteogenomics',
      url='http://github.com/bigbio/py-pgatk',
      long_description=readme(),