From bb71e897959e39f0905e070f621dad51b7369543 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Tue, 12 Jul 2022 17:29:17 +0200 Subject: [PATCH 1/9] ENH: add an action to search for run IDs using a text query --- edirect_requests.http | 64 ++++++++++++++++++ http-client.env.json | 5 ++ q2_fondue/entrezpy_clients/_esearch.py | 11 +-- q2_fondue/entrezpy_clients/_pipelines.py | 85 +++++++++++++++++------- q2_fondue/metadata.py | 3 +- q2_fondue/plugin_setup.py | 27 +++++++- q2_fondue/query.py | 39 +++++++++++ q2_fondue/sequences.py | 2 +- q2_fondue/tests/test_metadata.py | 63 +++++++++++++++--- q2_fondue/tests/test_sequences.py | 2 +- 10 files changed, 257 insertions(+), 44 deletions(-) create mode 100644 edirect_requests.http create mode 100644 http-client.env.json create mode 100644 q2_fondue/query.py diff --git a/edirect_requests.http b/edirect_requests.http new file mode 100644 index 0000000..f8ba1d1 --- /dev/null +++ b/edirect_requests.http @@ -0,0 +1,64 @@ +// ESearch request +GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi + ?email=mziemski%40ethz.ch + &db=biosample + &term=txid410656%5BOrganism%5D+AND+biosample+sra%5BFilter%5D+AND+%22public%22%5BFilter%5D+AND+%28mouse+OR+rat+OR+pig+OR+dog+OR+human%29 + &retmode=json + &retmax={{ retmax }} + &retstart=0 +# &usehistory=y + +> {% + var webenv = response.body.esearchresult.webenv + var qkey = response.body.esearchresult.querykey + var ids = response.body.esearchresult.idlist.join(",") + client.global.set("esearch_retmax", response.body.esearchresult.idlist.length) + client.log(`${webenv}, ${qkey}, ${ids}`) + client.global.set("esearch_webenv", webenv) + client.global.set("esearch_qkey", qkey) + client.global.set("esearch_ids", ids) + %} +<> 2022-07-07T080418.200.json + +### +// ELink request +GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi + ?email=mziemski%40ethz.ch + &dbfrom=biosample + &db=sra + &cmd=neighbor_history + &retmode=json + &id={{ esearch_ids }} +# &WebEnv={{ esearch_webenv }} +# &query_key={{ esearch_qkey }} + +> {% + var webenv = response.body.linksets[0].webenv + var qkey = response.body.linksets[0].linksetdbhistories[0].querykey + var ids_len = response.body.linksets[0].ids.length + var first_id = response.body.linksets[0].ids[0] + client.log(`${webenv}, ${qkey}, ${ids_len} IDs, first ID was: ${first_id}`) + client.global.set("elink_webenv", webenv) + client.global.set("elink_qkey", qkey) + %} + +### +// EFetch request +GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi + ?email=mziemski%40ethz.ch + &db=sra + &retmode=xml + &rettype=docsum + &retstart=0 + &retmax=10000 + &WebEnv={{ elink_webenv }} + &query_key={{ elink_qkey }} + +> {% + var id_count = (response.body.match(//g) || []).length; + // client.log(`There were ${id_count} IDs in the response.`) + client.log(parseInt(client.global.get("esearch_retmax"))) + client.test("Request executed successfully", function() { + client.assert(id_count <= parseInt(client.global.get("esearch_retmax")), `Only ${id_count} IDs were found`); + }); + %} diff --git a/http-client.env.json b/http-client.env.json new file mode 100644 index 0000000..3475a3a --- /dev/null +++ b/http-client.env.json @@ -0,0 +1,5 @@ +{ + "dev": { + "retmax": "0" + } +} \ No newline at end of file diff --git a/q2_fondue/entrezpy_clients/_esearch.py b/q2_fondue/entrezpy_clients/_esearch.py index 3832b55..10cdfb5 100644 --- a/q2_fondue/entrezpy_clients/_esearch.py +++ b/q2_fondue/entrezpy_clients/_esearch.py @@ -6,7 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from typing import List +from typing import List, Union import pandas as pd from entrezpy.esearch.esearch_analyzer import EsearchAnalyzer @@ -53,7 +53,7 @@ def validate_result(self) -> dict: **{_id: 'ID is invalid.' for _id in invalid_ids.index} } - def parse_search_results(self, response, uids: List[str]): + def parse_search_results(self, response, uids: Union[List[str], None]): """Parses response received from Esearch as a pandas Series object. Hit counts obtained in the response will be extracted and assigned to @@ -79,9 +79,10 @@ def parse_search_results(self, response, uids: List[str]): } # find ids that are missing - missing_ids = [x for x in uids if x not in found_terms.keys()] - missing_ids = {x: 0 for x in missing_ids} - found_terms.update(missing_ids) + if uids: + missing_ids = [x for x in uids if x not in found_terms.keys()] + missing_ids = {x: 0 for x in missing_ids} + found_terms.update(missing_ids) self.result = pd.Series(found_terms, name='count') diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py index 614c66c..4b03880 100644 --- a/q2_fondue/entrezpy_clients/_pipelines.py +++ b/q2_fondue/entrezpy_clients/_pipelines.py @@ -5,6 +5,7 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from typing import Union from entrezpy import conduit as ec @@ -14,10 +15,17 @@ from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer from q2_fondue.entrezpy_clients._utils import set_up_entrezpy_logging +import entrezpy.esearch.esearcher as searcher + +from q2_fondue.utils import _chunker + +BATCH_SIZE = 500 + def _get_run_ids( - email: str, n_jobs: int, ids: list, - source: str, log_level: str) -> list: + email: str, n_jobs: int, ids: Union[list, None], + query: Union[str, None], source: str, log_level: str +) -> list: """Pipeline to retrieve metadata of run IDs associated with studies (`source`='study'), bioprojects (`source`='bioproject'), samples (`source`='sample') or experiments (`source`='experiment') @@ -27,6 +35,7 @@ def _get_run_ids( email (str): User email. n_jobs (int): Number of jobs. ids (list): List of study, bioproject, sample or experiment IDs. + query (str): Search query to find IDs by. source (str): Type of IDs provided ('study', 'bioproject', 'sample' or 'experiment'). log_level (str): The log level to set. @@ -34,38 +43,64 @@ def _get_run_ids( Returns: list: Run IDs associated with provided ids. """ + term = " OR ".join(ids) if ids else query + # create pipeline to fetch all run IDs + elink = True if source == 'bioproject': db = 'bioproject' - elink = True + elif source == 'biosample': + db = 'biosample' else: db = 'sra' elink = False - econduit = ec.Conduit(email=email, threads=n_jobs) + # find UIDS based on a query + esearcher = searcher.Esearcher( + 'esearcher', email, apikey=None, + apikey_var=None, threads=n_jobs, qid=None) + esearch_response = esearcher.inquire( + { + 'db': db, 'term': term, + 'usehistory': False, 'rettype': 'json' + }, + analyzer=ESearchAnalyzer(ids)) + + # use the UIDS to link to other DBs and fetch related records + # we won't be using multi-threading here as this shouldn't take + # long (we're only fetching IDs) and we don't want those dead + # threads afterwards + econduit = ec.Conduit(email=email, threads=0) set_up_entrezpy_logging(econduit, log_level) - samp_ids_pipeline = econduit.new_pipeline() - - # search for IDs - es = samp_ids_pipeline.add_search( - {'db': db, 'term': " OR ".join(ids)}, - analyzer=ESearchAnalyzer(ids) - ) - if elink: - # given bioproject, find linked SRA runs - el = samp_ids_pipeline.add_link( - {'db': 'sra'}, - analyzer=ElinkAnalyzer(), dependency=es + run_ids_pipeline = econduit.new_pipeline() + + for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE): + if elink: + el = run_ids_pipeline.add_link( + { + 'db': 'sra', 'dbfrom': db, + 'id': _ids, 'link': False + }, + analyzer=ElinkAnalyzer(), + ) + else: + el = None + + # given SRA run IDs, fetch all metadata + run_ids_pipeline.add_fetch( + { + 'rettype': 'docsum', 'retmode': 'xml', + 'reqsize': BATCH_SIZE, 'retmax': len(_ids) + }, + analyzer=EFetchAnalyzer(log_level), dependency=el ) - else: - el = es - # given SRA run IDs, fetch all metadata - samp_ids_pipeline.add_fetch( - {'rettype': 'docsum', 'retmode': 'xml'}, - analyzer=EFetchAnalyzer(log_level), dependency=el - ) + econduit.run(run_ids_pipeline) - a = econduit.run(samp_ids_pipeline) + # recover metadata from all instances of EFetchAnalyzer + all_meta = [] + for x in econduit.analyzers.values(): + if isinstance(x, EFetchAnalyzer): + all_meta.extend(x.result.metadata) - return sorted(a.result.metadata) + return sorted(all_meta) diff --git a/q2_fondue/metadata.py b/q2_fondue/metadata.py index 912b7a8..4c4eefc 100644 --- a/q2_fondue/metadata.py +++ b/q2_fondue/metadata.py @@ -112,7 +112,8 @@ def _get_other_meta( email, n_jobs, project_ids, id_type, log_level, logger ) -> (pd.DataFrame, dict): run_ids = _get_run_ids( - email, n_jobs, project_ids, id_type, log_level) + email, n_jobs, project_ids, None, id_type, log_level + ) return _get_run_meta(email, n_jobs, run_ids, True, log_level, logger) diff --git a/q2_fondue/plugin_setup.py b/q2_fondue/plugin_setup.py index 2851bcb..a68fb38 100644 --- a/q2_fondue/plugin_setup.py +++ b/q2_fondue/plugin_setup.py @@ -18,6 +18,7 @@ from q2_fondue import __version__ from q2_fondue.get_all import get_all +from q2_fondue.query import get_ids_from_query from q2_fondue.metadata import get_metadata, merge_metadata from q2_fondue.sequences import get_sequences, combine_seqs from q2_fondue.scraper import scrape_collection @@ -65,7 +66,8 @@ 'paired_reads': 'Artifact containing paired-end fastq.gz files ' 'for all the requested IDs.', 'failed_runs': 'List of all run IDs for which fetching {} failed, ' - 'with their corresponding error messages.' + 'with their corresponding error messages.', + 'ids': 'Artifact containing retrieved SRA accession IDs.' } output_scraper_txt = 'Artifact containing all {} IDs scraped from ' \ @@ -257,6 +259,29 @@ citations=[citations['stephan_hugel_2019_2917290']] ) +plugin.methods.register_function( + function=get_ids_from_query, + inputs={}, + parameters={ + 'query': Str, + **common_params + }, + outputs=[('ids', NCBIAccessionIDs)], + input_descriptions={}, + parameter_descriptions={ + 'query': 'Search query to find SRA IDs by.', + **common_param_descr + }, + output_descriptions={ + 'ids': output_descriptions['metadata'], + }, + name='Find SRA accession IDs based on a search query.', + description=( + 'Find SRA accession IDs based on a search query.' + ), + citations=[] +) + plugin.register_formats( SRAMetadataFormat, SRAMetadataDirFmt, SRAFailedIDsFormat, SRAFailedIDsDirFmt, diff --git a/q2_fondue/query.py b/q2_fondue/query.py new file mode 100644 index 0000000..e3a7d33 --- /dev/null +++ b/q2_fondue/query.py @@ -0,0 +1,39 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022, Bokulich Laboratories. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import threading +import pandas as pd + +from q2_fondue.utils import handle_threaded_exception +from q2_fondue.entrezpy_clients._pipelines import _get_run_ids + +threading.excepthook = handle_threaded_exception + + +def get_ids_from_query( + query: str, email: str, + n_jobs: int = 1, log_level: str = 'INFO' +) -> pd.Series: + """Retrieves SRA run IDs based on a search query performed + on the BioSample database. + + Args: + query (str): Search query to be executed on + the BioSample database. + email (str): A valid e-mail address (required by NCBI). + n_jobs (int, default=1): Number of threads to be used in parallel. + log_level (str, default='INFO'): Logging level. + + Returns: + ids (pd.Series): Retrieved SRA run IDs. + """ + run_ids = _get_run_ids( + email, n_jobs, None, query, 'biosample', log_level + ) + + return pd.Series(run_ids, name='ID') diff --git a/q2_fondue/sequences.py b/q2_fondue/sequences.py index cfe2eef..5d451d1 100644 --- a/q2_fondue/sequences.py +++ b/q2_fondue/sequences.py @@ -380,7 +380,7 @@ def get_sequences( id_type = _determine_id_type(accession_ids) if id_type != 'run': accession_ids = _get_run_ids( - email, n_jobs, accession_ids, id_type, log_level + email, n_jobs, accession_ids, None, id_type, log_level ) fetched_q = Queue() diff --git a/q2_fondue/tests/test_metadata.py b/q2_fondue/tests/test_metadata.py index d426f07..98ed85f 100644 --- a/q2_fondue/tests/test_metadata.py +++ b/q2_fondue/tests/test_metadata.py @@ -11,6 +11,8 @@ import pandas as pd import numpy as np import unittest + +from entrezpy.elink.elink_analyzer import ElinkAnalyzer from parameterized import parameterized from entrezpy import conduit from entrezpy.esearch import esearcher @@ -19,7 +21,7 @@ from pandas._testing import assert_frame_equal, assert_series_equal from numpy.testing import assert_array_equal from qiime2.metadata import Metadata -from unittest.mock import patch, MagicMock, ANY +from unittest.mock import patch, MagicMock, ANY, call from q2_fondue.entrezpy_clients import _esearch from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer @@ -41,6 +43,7 @@ def __init__(self, fake_efetch_result, fake_efetch_response): self.fake_efetch_result = fake_efetch_result self.fake_efetch_response = fake_efetch_response self.pipeline = MagicMock() + self.analyzers = {} def new_pipeline(self): self.pipeline.add_search = MagicMock(return_value='fake_search') @@ -391,31 +394,71 @@ def test_get_run_meta_one_invalid_id(self, patch_ef, patch_val, patch_es): ("sample", "sra") ]) @patch('q2_fondue.metadata._get_run_meta') + @patch('entrezpy.esearch.esearcher.Esearcher') + @patch.object(_esearch, 'ESearchAnalyzer') + @patch('q2_fondue.entrezpy_clients._pipelines.BATCH_SIZE', 6) def test_get_other_meta_different_ids( - self, id_type, db2search, mock_get): + self, id_type, db2search, + mock_analyzer, mock_search, mock_get + ): exp_ids = [ 'SRR000007', 'SRR000018', 'SRR000020', 'SRR000038', 'SRR000043', 'SRR000046', 'SRR000048', 'SRR000050', 'SRR000057', 'SRR000058', 'SRR13961759', 'SRR13961771'] + mock_search.return_value = self.fake_esearcher + fake_uids = [str(i) for i in range(len(exp_ids))] + mock_analyzer.return_value = MagicMock( + result=MagicMock(uids=fake_uids) + ) + mock_search.return_value.inquire = mock_analyzer with patch.object(conduit, 'Conduit') as mock_conduit: + fake_analyzer1 = EFetchAnalyzer('INFO') + fake_analyzer1.result = MagicMock(metadata=exp_ids[:6]) + fake_analyzer2 = EFetchAnalyzer('INFO') + fake_analyzer2.result = MagicMock(metadata=exp_ids[6:]) + self.fake_econduit.analyzers = { + '1': ElinkAnalyzer, + '2': fake_analyzer1, + '3': fake_analyzer2 + } mock_conduit.return_value = self.fake_econduit mock_get.return_value = exp_ids - _ = _get_other_meta( + obs_ids = _get_other_meta( 'someone@somewhere.com', 1, ['AB', 'cd'], id_type, 'INFO', MagicMock() ) - self.fake_econduit.pipeline.add_search.assert_called_once_with( - {'db': db2search, 'term': "AB OR cd"}, analyzer=ANY - ) - self.fake_econduit.pipeline.add_fetch.assert_called_once_with( - {'rettype': 'docsum', 'retmode': 'xml'}, - analyzer=ANY, dependency=ANY - ) + if db2search != 'sra': + self.fake_econduit.pipeline.add_link.assert_has_calls([ + call( + { + 'db': 'sra', 'dbfrom': db2search, + 'id': fake_uids[:6], 'link': False + }, + analyzer=ANY + ), + call( + { + 'db': 'sra', 'dbfrom': db2search, + 'id': fake_uids[6:], 'link': False + }, + analyzer=ANY + ) + ]) + self.fake_econduit.pipeline.add_fetch.has_calls([ + call( + { + 'rettype': 'docsum', 'retmode': 'xml', + 'reqsize': 6, 'retmax': 2 + }, + analyzer=ANY, dependency=ANY) + + ] * 2) mock_get.assert_called_once_with( 'someone@somewhere.com', 1, exp_ids, True, 'INFO', ANY ) + self.assertListEqual(sorted(exp_ids), obs_ids) @patch('q2_fondue.metadata._get_run_meta') @patch('q2_fondue.metadata._get_other_meta') diff --git a/q2_fondue/tests/test_sequences.py b/q2_fondue/tests/test_sequences.py index a78ca5b..fa7dfdb 100644 --- a/q2_fondue/tests/test_sequences.py +++ b/q2_fondue/tests/test_sequences.py @@ -724,7 +724,7 @@ def test_get_sequences_other( test_temp_md, email='some@where.com', retries=0) mock_get.assert_called_with( - 'some@where.com', 1, [acc_id], id_type, 'INFO' + 'some@where.com', 1, [acc_id], None, id_type, 'INFO' ) mock_proc.assert_has_calls([ call(target=_run_fasterq_dump_for_all, args=( From 487f1c2cd7a9885fb79e1f5a7d51241a257149f6 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Aug 2022 12:01:10 +0200 Subject: [PATCH 2/9] Add query test --- q2_fondue/tests/test_query.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 q2_fondue/tests/test_query.py diff --git a/q2_fondue/tests/test_query.py b/q2_fondue/tests/test_query.py new file mode 100644 index 0000000..836dca8 --- /dev/null +++ b/q2_fondue/tests/test_query.py @@ -0,0 +1,40 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022, Bokulich Laboratories. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import pandas as pd +import unittest + +from pandas.testing import assert_frame_equal +from qiime2.plugins import fondue +from unittest.mock import patch + +from q2_fondue.tests.test_sequences import SequenceTests + + +class TestQuery(SequenceTests): + package = 'q2_fondue.tests' + + @patch( + 'q2_fondue.query._get_run_ids', + return_value=['SRR123', 'SRR234'] + ) + def test_query(self, mock_ids): + query = 'some magical query text' + + obs_ids, = fondue.actions.get_ids_from_query( + query, 'fake@email.com', 1, 'DEBUG' + ) + exp_ids = pd.DataFrame(index=pd.Index(['SRR123', 'SRR234'], name='ID')) + + mock_ids.assert_called_once_with( + 'fake@email.com', 1, None, query, 'biosample', 'DEBUG' + ) + assert_frame_equal(obs_ids.view(pd.DataFrame), exp_ids) + + +if __name__ == "__main__": + unittest.main() From d40af43a5ed3e87683c42045d823c4dec1ec75b2 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Aug 2022 12:11:10 +0200 Subject: [PATCH 3/9] Remove the http request files --- edirect_requests.http | 64 ------------------------------------------- http-client.env.json | 5 ---- 2 files changed, 69 deletions(-) delete mode 100644 edirect_requests.http delete mode 100644 http-client.env.json diff --git a/edirect_requests.http b/edirect_requests.http deleted file mode 100644 index f8ba1d1..0000000 --- a/edirect_requests.http +++ /dev/null @@ -1,64 +0,0 @@ -// ESearch request -GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi - ?email=mziemski%40ethz.ch - &db=biosample - &term=txid410656%5BOrganism%5D+AND+biosample+sra%5BFilter%5D+AND+%22public%22%5BFilter%5D+AND+%28mouse+OR+rat+OR+pig+OR+dog+OR+human%29 - &retmode=json - &retmax={{ retmax }} - &retstart=0 -# &usehistory=y - -> {% - var webenv = response.body.esearchresult.webenv - var qkey = response.body.esearchresult.querykey - var ids = response.body.esearchresult.idlist.join(",") - client.global.set("esearch_retmax", response.body.esearchresult.idlist.length) - client.log(`${webenv}, ${qkey}, ${ids}`) - client.global.set("esearch_webenv", webenv) - client.global.set("esearch_qkey", qkey) - client.global.set("esearch_ids", ids) - %} -<> 2022-07-07T080418.200.json - -### -// ELink request -GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi - ?email=mziemski%40ethz.ch - &dbfrom=biosample - &db=sra - &cmd=neighbor_history - &retmode=json - &id={{ esearch_ids }} -# &WebEnv={{ esearch_webenv }} -# &query_key={{ esearch_qkey }} - -> {% - var webenv = response.body.linksets[0].webenv - var qkey = response.body.linksets[0].linksetdbhistories[0].querykey - var ids_len = response.body.linksets[0].ids.length - var first_id = response.body.linksets[0].ids[0] - client.log(`${webenv}, ${qkey}, ${ids_len} IDs, first ID was: ${first_id}`) - client.global.set("elink_webenv", webenv) - client.global.set("elink_qkey", qkey) - %} - -### -// EFetch request -GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi - ?email=mziemski%40ethz.ch - &db=sra - &retmode=xml - &rettype=docsum - &retstart=0 - &retmax=10000 - &WebEnv={{ elink_webenv }} - &query_key={{ elink_qkey }} - -> {% - var id_count = (response.body.match(//g) || []).length; - // client.log(`There were ${id_count} IDs in the response.`) - client.log(parseInt(client.global.get("esearch_retmax"))) - client.test("Request executed successfully", function() { - client.assert(id_count <= parseInt(client.global.get("esearch_retmax")), `Only ${id_count} IDs were found`); - }); - %} diff --git a/http-client.env.json b/http-client.env.json deleted file mode 100644 index 3475a3a..0000000 --- a/http-client.env.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "dev": { - "retmax": "0" - } -} \ No newline at end of file From 75ebdb5bb0b72894c9e7e91562840b29afe6cae3 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Aug 2022 13:57:56 +0200 Subject: [PATCH 4/9] Add missing test case --- q2_fondue/tests/test_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/q2_fondue/tests/test_metadata.py b/q2_fondue/tests/test_metadata.py index 98ed85f..b3bef82 100644 --- a/q2_fondue/tests/test_metadata.py +++ b/q2_fondue/tests/test_metadata.py @@ -391,7 +391,8 @@ def test_get_run_meta_one_invalid_id(self, patch_ef, patch_val, patch_es): ("study", "sra"), ("bioproject", "bioproject"), ("experiment", "sra"), - ("sample", "sra") + ("sample", "sra"), + ("biosample", "biosample") ]) @patch('q2_fondue.metadata._get_run_meta') @patch('entrezpy.esearch.esearcher.Esearcher') From d139776e1ee3cab830f7a3ed802f4378acb14740 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Aug 2022 14:11:54 +0200 Subject: [PATCH 5/9] Add more comments --- q2_fondue/entrezpy_clients/_pipelines.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py index 4b03880..94d20da 100644 --- a/q2_fondue/entrezpy_clients/_pipelines.py +++ b/q2_fondue/entrezpy_clients/_pipelines.py @@ -55,7 +55,15 @@ def _get_run_ids( db = 'sra' elink = False - # find UIDS based on a query + # find UIDS based on a query; + # instead of saving the result on the history server + # we will store all the UIDs recovered based on the + # search query and use those in the mini-pipeline below; + # this way we are not limited by ELink only accepting up to + # who knows how many IDs and erroring out if we provide too + # many (which could be the case e.g.: when we ask for more + # than 10000 BioProject IDs or the text query returns more + # than 10000 IDs presumably) esearcher = searcher.Esearcher( 'esearcher', email, apikey=None, apikey_var=None, threads=n_jobs, qid=None) @@ -66,7 +74,7 @@ def _get_run_ids( }, analyzer=ESearchAnalyzer(ids)) - # use the UIDS to link to other DBs and fetch related records + # use the UIDs to link to other DBs and fetch related records; # we won't be using multi-threading here as this shouldn't take # long (we're only fetching IDs) and we don't want those dead # threads afterwards From 3b7a2e63f0cf49733d967aa58761279b3b3c0af4 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Aug 2022 14:19:27 +0200 Subject: [PATCH 6/9] Adjust action input descriptions --- q2_fondue/plugin_setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/q2_fondue/plugin_setup.py b/q2_fondue/plugin_setup.py index a68fb38..a7bd00c 100644 --- a/q2_fondue/plugin_setup.py +++ b/q2_fondue/plugin_setup.py @@ -269,7 +269,8 @@ outputs=[('ids', NCBIAccessionIDs)], input_descriptions={}, parameter_descriptions={ - 'query': 'Search query to find SRA IDs by.', + 'query': 'Search query to retrieve SRA IDs from ' + 'the BioSample database.', **common_param_descr }, output_descriptions={ @@ -277,7 +278,8 @@ }, name='Find SRA accession IDs based on a search query.', description=( - 'Find SRA accession IDs based on a search query.' + 'Find SRA accession IDs in the BioSample database ' + 'using a text search query.' ), citations=[] ) From 9ca7d393e63a977e16b9a9aa4dc8d5f7f056f7ff Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Wed, 17 Aug 2022 15:31:49 +0200 Subject: [PATCH 7/9] Fix fetching run ids from studies and alike --- q2_fondue/entrezpy_clients/_pipelines.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py index 94d20da..9a48fbf 100644 --- a/q2_fondue/entrezpy_clients/_pipelines.py +++ b/q2_fondue/entrezpy_clients/_pipelines.py @@ -95,11 +95,17 @@ def _get_run_ids( el = None # given SRA run IDs, fetch all metadata + efetch_params = { + 'rettype': 'docsum', 'retmode': 'xml', + 'reqsize': BATCH_SIZE, 'retmax': len(_ids) + } + if not elink: + # we need to specify these manually as in this scenario + # EFetch is not linked to anything + efetch_params.update({'id': _ids, 'db': db}) + run_ids_pipeline.add_fetch( - { - 'rettype': 'docsum', 'retmode': 'xml', - 'reqsize': BATCH_SIZE, 'retmax': len(_ids) - }, + efetch_params, analyzer=EFetchAnalyzer(log_level), dependency=el ) From 1efba4944c16dab458311443a5d97d98d41dfe10 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Thu, 18 Aug 2022 16:39:54 +0200 Subject: [PATCH 8/9] Review suggestions --- q2_fondue/entrezpy_clients/_pipelines.py | 19 ++++++++++++------- q2_fondue/plugin_setup.py | 8 ++++---- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py index 9a48fbf..49276ba 100644 --- a/q2_fondue/entrezpy_clients/_pipelines.py +++ b/q2_fondue/entrezpy_clients/_pipelines.py @@ -26,9 +26,10 @@ def _get_run_ids( email: str, n_jobs: int, ids: Union[list, None], query: Union[str, None], source: str, log_level: str ) -> list: - """Pipeline to retrieve metadata of run IDs associated with - studies (`source`='study'), bioprojects (`source`='bioproject'), - samples (`source`='sample') or experiments (`source`='experiment') + """Pipeline to retrieve run IDs associated with BioSample query + (provided in `query`) or other aggregate IDs like studies + (`source`='study'), bioprojects (`source`='bioproject'), samples + (`source`='sample') or experiments (`source`='experiment') provided in `ids`. Args: @@ -82,6 +83,10 @@ def _get_run_ids( set_up_entrezpy_logging(econduit, log_level) run_ids_pipeline = econduit.new_pipeline() + # create a pipeline to link and fetch the run IDs; + # we process the IDs obtained from the previous step in batches + # as ELink cannot handle more than a certain amount of IDs + # at the same time (recommended by NCBI) for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE): if elink: el = run_ids_pipeline.add_link( @@ -111,10 +116,10 @@ def _get_run_ids( econduit.run(run_ids_pipeline) - # recover metadata from all instances of EFetchAnalyzer - all_meta = [] + # recover run IDs from all instances of EFetchAnalyzer + all_run_ids = [] for x in econduit.analyzers.values(): if isinstance(x, EFetchAnalyzer): - all_meta.extend(x.result.metadata) + all_run_ids.extend(x.result.metadata) - return sorted(all_meta) + return sorted(all_run_ids) diff --git a/q2_fondue/plugin_setup.py b/q2_fondue/plugin_setup.py index a7bd00c..10759d3 100644 --- a/q2_fondue/plugin_setup.py +++ b/q2_fondue/plugin_setup.py @@ -67,7 +67,7 @@ 'for all the requested IDs.', 'failed_runs': 'List of all run IDs for which fetching {} failed, ' 'with their corresponding error messages.', - 'ids': 'Artifact containing retrieved SRA accession IDs.' + 'ids': 'Artifact containing retrieved SRA run accession IDs.' } output_scraper_txt = 'Artifact containing all {} IDs scraped from ' \ @@ -269,16 +269,16 @@ outputs=[('ids', NCBIAccessionIDs)], input_descriptions={}, parameter_descriptions={ - 'query': 'Search query to retrieve SRA IDs from ' + 'query': 'Search query to retrieve SRA run IDs from ' 'the BioSample database.', **common_param_descr }, output_descriptions={ 'ids': output_descriptions['metadata'], }, - name='Find SRA accession IDs based on a search query.', + name='Find SRA run accession IDs based on a search query.', description=( - 'Find SRA accession IDs in the BioSample database ' + 'Find SRA run accession IDs in the BioSample database ' 'using a text search query.' ), citations=[] From 88d2680fe0c4221a62a9b82e54746c180cd40839 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Fri, 19 Aug 2022 10:42:03 +0200 Subject: [PATCH 9/9] Update README --- README.md | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8fba2a0..87fd9b9 100644 --- a/README.md +++ b/README.md @@ -61,14 +61,15 @@ To find out which temporary directory is used by Qiime 2, you can run `echo $TMP ### Available actions q2-fondue provides a couple of actions to fetch and manipulate nucleotide sequencing data and related metadata from SRA as well as an action to scrape run, study, BioProject, experiment and sample IDs from a Zotero web library. Below you will find a list of available actions and their short descriptions. -| Action | Description | -|------------------|--------------------------------------------------------------------------| -| `get-sequences` | Fetch sequences by IDs[*] from the SRA repository. | -| `get-metadata` | Fetch metadata by IDs[*] from the SRA repository. | -| `get-all` | Fetch sequences and metadata by IDs[*] from the SRA repo. | -| `merge-metadata` | Merge several metadata files into a single metadata object. | -| `combine-seqs` | Combine sequences from multiple artifacts into a single artifact. | -| `scrape-collection`| Scrape Zotero collection for IDs[*] and associated DOI names.| +| Action | Description | +|----------------------|--------------------------------------------------------------------------| +| `get-sequences` | Fetch sequences by IDs[*] from the SRA repository. | +| `get-metadata` | Fetch metadata by IDs[*] from the SRA repository. | +| `get-all` | Fetch sequences and metadata by IDs[*] from the SRA repo. | +| `get-ids-from-query` | Find SRA run accession IDs based on a search query. | +| `merge-metadata` | Merge several metadata files into a single metadata object. | +| `combine-seqs` | Combine sequences from multiple artifacts into a single artifact. | +| `scrape-collection` | Scrape Zotero collection for IDs[*] and associated DOI names.| [*]: Supported IDs include run, study, BioProject, experiment and study IDs. @@ -116,6 +117,21 @@ where: - `--o-experiment-ids` is the output artifact containing the scraped experiment IDs. - `--o-sample-ids` is the output artifact containing the scraped sample IDs. +3) To retrieve run accession IDs based on a text search query (performed on the BioSample database) you can use the `get-ids-from-query` method: +```shell +qiime fondue get-ids-from-query \ + --p-query "txid410656[Organism] AND \"public\"[Filter] AND (chicken OR poultry)" \ + --p-email your_email@somewhere.com \ + --p-n-jobs 2 \ + --o-ids run_ids.qza \ + --verbose +``` +where: +- `--p-query` is the text search query to be executed on the BioSample database. +- `--p-email` is your email address (required by NCBI). +- `--p-n-jobs` is the number of parallel download jobs (defaults to 1). +- `--o-ids` is the output artifact containing the retrieved run IDs. + ### Fetching metadata To fetch metadata associated with a set of output IDs, execute the following command: