From bb71e897959e39f0905e070f621dad51b7369543 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Tue, 12 Jul 2022 17:29:17 +0200
Subject: [PATCH 1/9] ENH: add an action to search for run IDs using a text
 query

---
 edirect_requests.http                    | 64 ++++++++++++++++++
 http-client.env.json                     |  5 ++
 q2_fondue/entrezpy_clients/_esearch.py   | 11 +--
 q2_fondue/entrezpy_clients/_pipelines.py | 85 +++++++++++++++++-------
 q2_fondue/metadata.py                    |  3 +-
 q2_fondue/plugin_setup.py                | 27 +++++++-
 q2_fondue/query.py                       | 39 +++++++++++
 q2_fondue/sequences.py                   |  2 +-
 q2_fondue/tests/test_metadata.py         | 63 +++++++++++++++---
 q2_fondue/tests/test_sequences.py        |  2 +-
 10 files changed, 257 insertions(+), 44 deletions(-)
 create mode 100644 edirect_requests.http
 create mode 100644 http-client.env.json
 create mode 100644 q2_fondue/query.py
diff --git a/edirect_requests.http b/edirect_requests.http
new file mode 100644
index 0000000..f8ba1d1
--- /dev/null
+++ b/edirect_requests.http
@@ -0,0 +1,64 @@
+// ESearch request
+GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi
+    ?email=mziemski%40ethz.ch
+    &db=biosample
+    &term=txid410656%5BOrganism%5D+AND+biosample+sra%5BFilter%5D+AND+%22public%22%5BFilter%5D+AND+%28mouse+OR+rat+OR+pig+OR+dog+OR+human%29
+    &retmode=json
+    &retmax={{ retmax }}
+    &retstart=0
+#    &usehistory=y
+
+> {%
+ var webenv = response.body.esearchresult.webenv
+ var qkey = response.body.esearchresult.querykey
+ var ids = response.body.esearchresult.idlist.join(",")
+ client.global.set("esearch_retmax", response.body.esearchresult.idlist.length)
+ client.log(`${webenv}, ${qkey}, ${ids}`)
+ client.global.set("esearch_webenv", webenv)
+ client.global.set("esearch_qkey", qkey)
+ client.global.set("esearch_ids", ids)
+ %}
+<> 2022-07-07T080418.200.json
+
+###
+// ELink request
+GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi
+    ?email=mziemski%40ethz.ch
+    &dbfrom=biosample
+    &db=sra
+    &cmd=neighbor_history
+    &retmode=json
+    &id={{ esearch_ids }}
+#    &WebEnv={{ esearch_webenv }}
+#    &query_key={{ esearch_qkey }}
+
+> {%
+ var webenv = response.body.linksets[0].webenv
+ var qkey = response.body.linksets[0].linksetdbhistories[0].querykey
+ var ids_len = response.body.linksets[0].ids.length
+ var first_id = response.body.linksets[0].ids[0]
+ client.log(`${webenv}, ${qkey}, ${ids_len} IDs, first ID was: ${first_id}`)
+ client.global.set("elink_webenv", webenv)
+ client.global.set("elink_qkey", qkey)
+ %}
+
+###
+// EFetch request
+GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
+    ?email=mziemski%40ethz.ch
+    &db=sra
+    &retmode=xml
+    &rettype=docsum
+    &retstart=0
+    &retmax=10000
+    &WebEnv={{ elink_webenv }}
+    &query_key={{ elink_qkey }}
+
+> {%
+ var id_count = (response.body.match(/<DocSum>/g) || []).length;
+ // client.log(`There were ${id_count} IDs in the response.`)
+ client.log(parseInt(client.global.get("esearch_retmax")))
+ client.test("Request executed successfully", function() {
+   client.assert(id_count <= parseInt(client.global.get("esearch_retmax")), `Only ${id_count} IDs were found`);
+ });
+ %}
diff --git a/http-client.env.json b/http-client.env.json
new file mode 100644
index 0000000..3475a3a
--- /dev/null
+++ b/http-client.env.json
@@ -0,0 +1,5 @@
+{
+  "dev": {
+    "retmax": "0"
+  }
+}
\ No newline at end of file
diff --git a/q2_fondue/entrezpy_clients/_esearch.py b/q2_fondue/entrezpy_clients/_esearch.py
index 3832b55..10cdfb5 100644
--- a/q2_fondue/entrezpy_clients/_esearch.py
+++ b/q2_fondue/entrezpy_clients/_esearch.py
@@ -6,7 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from typing import List
+from typing import List, Union
 
 import pandas as pd
 from entrezpy.esearch.esearch_analyzer import EsearchAnalyzer
@@ -53,7 +53,7 @@ def validate_result(self) -> dict:
             **{_id: 'ID is invalid.' for _id in invalid_ids.index}
         }
 
-    def parse_search_results(self, response, uids: List[str]):
+    def parse_search_results(self, response, uids: Union[List[str], None]):
         """Parses response received from Esearch as a pandas Series object.
 
         Hit counts obtained in the response will be extracted and assigned to
@@ -79,9 +79,10 @@ def parse_search_results(self, response, uids: List[str]):
         }
 
         # find ids that are missing
-        missing_ids = [x for x in uids if x not in found_terms.keys()]
-        missing_ids = {x: 0 for x in missing_ids}
-        found_terms.update(missing_ids)
+        if uids:
+            missing_ids = [x for x in uids if x not in found_terms.keys()]
+            missing_ids = {x: 0 for x in missing_ids}
+            found_terms.update(missing_ids)
 
         self.result = pd.Series(found_terms, name='count')
 
diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py
index 614c66c..4b03880 100644
--- a/q2_fondue/entrezpy_clients/_pipelines.py
+++ b/q2_fondue/entrezpy_clients/_pipelines.py
@@ -5,6 +5,7 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+from typing import Union
 
 from entrezpy import conduit as ec
 
@@ -14,10 +15,17 @@
 from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer
 from q2_fondue.entrezpy_clients._utils import set_up_entrezpy_logging
 
+import entrezpy.esearch.esearcher as searcher
+
+from q2_fondue.utils import _chunker
+
+BATCH_SIZE = 500
+
 
 def _get_run_ids(
-        email: str, n_jobs: int, ids: list,
-        source: str, log_level: str) -> list:
+        email: str, n_jobs: int, ids: Union[list, None],
+        query: Union[str, None], source: str, log_level: str
+) -> list:
     """Pipeline to retrieve metadata of run IDs associated with
     studies (`source`='study'), bioprojects (`source`='bioproject'),
     samples (`source`='sample') or experiments (`source`='experiment')
@@ -27,6 +35,7 @@ def _get_run_ids(
         email (str): User email.
         n_jobs (int): Number of jobs.
         ids (list): List of study, bioproject, sample or experiment IDs.
+        query (str): Search query to find IDs by.
         source (str): Type of IDs provided ('study', 'bioproject',
                       'sample' or 'experiment').
         log_level (str): The log level to set.
@@ -34,38 +43,64 @@ def _get_run_ids(
     Returns:
         list: Run IDs associated with provided ids.
     """
+    term = " OR ".join(ids) if ids else query
+
     # create pipeline to fetch all run IDs
+    elink = True
     if source == 'bioproject':
         db = 'bioproject'
-        elink = True
+    elif source == 'biosample':
+        db = 'biosample'
     else:
         db = 'sra'
         elink = False
 
-    econduit = ec.Conduit(email=email, threads=n_jobs)
+    # find UIDS based on a query
+    esearcher = searcher.Esearcher(
+        'esearcher', email, apikey=None,
+        apikey_var=None, threads=n_jobs, qid=None)
+    esearch_response = esearcher.inquire(
+        {
+            'db': db, 'term': term,
+            'usehistory': False, 'rettype': 'json'
+        },
+        analyzer=ESearchAnalyzer(ids))
+
+    # use the UIDS to link to other DBs and fetch related records
+    # we won't be using multi-threading here as this shouldn't take
+    # long (we're only fetching IDs) and we don't want those dead
+    # threads afterwards
+    econduit = ec.Conduit(email=email, threads=0)
     set_up_entrezpy_logging(econduit, log_level)
-    samp_ids_pipeline = econduit.new_pipeline()
-
-    # search for IDs
-    es = samp_ids_pipeline.add_search(
-        {'db': db, 'term': " OR ".join(ids)},
-        analyzer=ESearchAnalyzer(ids)
-    )
-    if elink:
-        # given bioproject, find linked SRA runs
-        el = samp_ids_pipeline.add_link(
-            {'db': 'sra'},
-            analyzer=ElinkAnalyzer(), dependency=es
+    run_ids_pipeline = econduit.new_pipeline()
+
+    for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE):
+        if elink:
+            el = run_ids_pipeline.add_link(
+                {
+                    'db': 'sra', 'dbfrom': db,
+                    'id': _ids, 'link': False
+                },
+                analyzer=ElinkAnalyzer(),
+            )
+        else:
+            el = None
+
+        # given SRA run IDs, fetch all metadata
+        run_ids_pipeline.add_fetch(
+            {
+                'rettype': 'docsum', 'retmode': 'xml',
+                'reqsize': BATCH_SIZE, 'retmax': len(_ids)
+            },
+            analyzer=EFetchAnalyzer(log_level), dependency=el
         )
-    else:
-        el = es
 
-    # given SRA run IDs, fetch all metadata
-    samp_ids_pipeline.add_fetch(
-        {'rettype': 'docsum', 'retmode': 'xml'},
-        analyzer=EFetchAnalyzer(log_level), dependency=el
-    )
+    econduit.run(run_ids_pipeline)
 
-    a = econduit.run(samp_ids_pipeline)
+    # recover metadata from all instances of EFetchAnalyzer
+    all_meta = []
+    for x in econduit.analyzers.values():
+        if isinstance(x, EFetchAnalyzer):
+            all_meta.extend(x.result.metadata)
 
-    return sorted(a.result.metadata)
+    return sorted(all_meta)
diff --git a/q2_fondue/metadata.py b/q2_fondue/metadata.py
index 912b7a8..4c4eefc 100644
--- a/q2_fondue/metadata.py
+++ b/q2_fondue/metadata.py
@@ -112,7 +112,8 @@ def _get_other_meta(
         email, n_jobs, project_ids, id_type, log_level, logger
 ) -> (pd.DataFrame, dict):
     run_ids = _get_run_ids(
-                    email, n_jobs, project_ids, id_type, log_level)
+        email, n_jobs, project_ids, None, id_type, log_level
+    )
 
     return _get_run_meta(email, n_jobs, run_ids, True, log_level, logger)
 
diff --git a/q2_fondue/plugin_setup.py b/q2_fondue/plugin_setup.py
index 2851bcb..a68fb38 100644
--- a/q2_fondue/plugin_setup.py
+++ b/q2_fondue/plugin_setup.py
@@ -18,6 +18,7 @@
 
 from q2_fondue import __version__
 from q2_fondue.get_all import get_all
+from q2_fondue.query import get_ids_from_query
 from q2_fondue.metadata import get_metadata, merge_metadata
 from q2_fondue.sequences import get_sequences, combine_seqs
 from q2_fondue.scraper import scrape_collection
@@ -65,7 +66,8 @@
     'paired_reads': 'Artifact containing paired-end fastq.gz files '
                     'for all the requested IDs.',
     'failed_runs': 'List of all run IDs for which fetching {} failed, '
-                   'with their corresponding error messages.'
+                   'with their corresponding error messages.',
+    'ids': 'Artifact containing retrieved SRA accession IDs.'
 }
 
 output_scraper_txt = 'Artifact containing all {} IDs scraped from ' \
@@ -257,6 +259,29 @@
     citations=[citations['stephan_hugel_2019_2917290']]
 )
 
+plugin.methods.register_function(
+    function=get_ids_from_query,
+    inputs={},
+    parameters={
+        'query': Str,
+        **common_params
+    },
+    outputs=[('ids', NCBIAccessionIDs)],
+    input_descriptions={},
+    parameter_descriptions={
+        'query': 'Search query to find SRA IDs by.',
+        **common_param_descr
+    },
+    output_descriptions={
+        'ids': output_descriptions['metadata'],
+    },
+    name='Find SRA accession IDs based on a search query.',
+    description=(
+        'Find SRA accession IDs based on a search query.'
+    ),
+    citations=[]
+)
+
 plugin.register_formats(
     SRAMetadataFormat, SRAMetadataDirFmt,
     SRAFailedIDsFormat, SRAFailedIDsDirFmt,
diff --git a/q2_fondue/query.py b/q2_fondue/query.py
new file mode 100644
index 0000000..e3a7d33
--- /dev/null
+++ b/q2_fondue/query.py
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022, Bokulich Laboratories.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import threading
+import pandas as pd
+
+from q2_fondue.utils import handle_threaded_exception
+from q2_fondue.entrezpy_clients._pipelines import _get_run_ids
+
+threading.excepthook = handle_threaded_exception
+
+
+def get_ids_from_query(
+        query: str, email: str,
+        n_jobs: int = 1, log_level: str = 'INFO'
+) -> pd.Series:
+    """Retrieves SRA run IDs based on a search query performed
+    on the BioSample database.
+
+    Args:
+        query (str): Search query to be executed on
+            the BioSample database.
+        email (str): A valid e-mail address (required by NCBI).
+        n_jobs (int, default=1): Number of threads to be used in parallel.
+        log_level (str, default='INFO'): Logging level.
+
+    Returns:
+        ids (pd.Series): Retrieved SRA run IDs.
+    """
+    run_ids = _get_run_ids(
+        email, n_jobs, None, query, 'biosample', log_level
+    )
+
+    return pd.Series(run_ids, name='ID')
diff --git a/q2_fondue/sequences.py b/q2_fondue/sequences.py
index cfe2eef..5d451d1 100644
--- a/q2_fondue/sequences.py
+++ b/q2_fondue/sequences.py
@@ -380,7 +380,7 @@ def get_sequences(
     id_type = _determine_id_type(accession_ids)
     if id_type != 'run':
         accession_ids = _get_run_ids(
-            email, n_jobs, accession_ids, id_type, log_level
+            email, n_jobs, accession_ids, None, id_type, log_level
         )
 
     fetched_q = Queue()
diff --git a/q2_fondue/tests/test_metadata.py b/q2_fondue/tests/test_metadata.py
index d426f07..98ed85f 100644
--- a/q2_fondue/tests/test_metadata.py
+++ b/q2_fondue/tests/test_metadata.py
@@ -11,6 +11,8 @@
 import pandas as pd
 import numpy as np
 import unittest
+
+from entrezpy.elink.elink_analyzer import ElinkAnalyzer
 from parameterized import parameterized
 from entrezpy import conduit
 from entrezpy.esearch import esearcher
@@ -19,7 +21,7 @@
 from pandas._testing import assert_frame_equal, assert_series_equal
 from numpy.testing import assert_array_equal
 from qiime2.metadata import Metadata
-from unittest.mock import patch, MagicMock, ANY
+from unittest.mock import patch, MagicMock, ANY, call
 
 from q2_fondue.entrezpy_clients import _esearch
 from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer
@@ -41,6 +43,7 @@ def __init__(self, fake_efetch_result, fake_efetch_response):
         self.fake_efetch_result = fake_efetch_result
         self.fake_efetch_response = fake_efetch_response
         self.pipeline = MagicMock()
+        self.analyzers = {}
 
     def new_pipeline(self):
         self.pipeline.add_search = MagicMock(return_value='fake_search')
@@ -391,31 +394,71 @@ def test_get_run_meta_one_invalid_id(self, patch_ef, patch_val, patch_es):
         ("sample", "sra")
     ])
     @patch('q2_fondue.metadata._get_run_meta')
+    @patch('entrezpy.esearch.esearcher.Esearcher')
+    @patch.object(_esearch, 'ESearchAnalyzer')
+    @patch('q2_fondue.entrezpy_clients._pipelines.BATCH_SIZE', 6)
     def test_get_other_meta_different_ids(
-            self, id_type, db2search, mock_get):
+            self, id_type, db2search,
+            mock_analyzer, mock_search, mock_get
+    ):
         exp_ids = [
             'SRR000007', 'SRR000018', 'SRR000020', 'SRR000038',
             'SRR000043', 'SRR000046', 'SRR000048', 'SRR000050',
             'SRR000057', 'SRR000058', 'SRR13961759', 'SRR13961771']
+        mock_search.return_value = self.fake_esearcher
+        fake_uids = [str(i) for i in range(len(exp_ids))]
+        mock_analyzer.return_value = MagicMock(
+            result=MagicMock(uids=fake_uids)
+        )
+        mock_search.return_value.inquire = mock_analyzer
         with patch.object(conduit, 'Conduit') as mock_conduit:
+            fake_analyzer1 = EFetchAnalyzer('INFO')
+            fake_analyzer1.result = MagicMock(metadata=exp_ids[:6])
+            fake_analyzer2 = EFetchAnalyzer('INFO')
+            fake_analyzer2.result = MagicMock(metadata=exp_ids[6:])
+            self.fake_econduit.analyzers = {
+                '1': ElinkAnalyzer,
+                '2': fake_analyzer1,
+                '3': fake_analyzer2
+            }
             mock_conduit.return_value = self.fake_econduit
             mock_get.return_value = exp_ids
 
-            _ = _get_other_meta(
+            obs_ids = _get_other_meta(
                 'someone@somewhere.com', 1, ['AB', 'cd'], id_type,
                 'INFO', MagicMock()
             )
 
-            self.fake_econduit.pipeline.add_search.assert_called_once_with(
-                {'db': db2search, 'term': "AB OR cd"}, analyzer=ANY
-            )
-            self.fake_econduit.pipeline.add_fetch.assert_called_once_with(
-                {'rettype': 'docsum', 'retmode': 'xml'},
-                analyzer=ANY, dependency=ANY
-            )
+            if db2search != 'sra':
+                self.fake_econduit.pipeline.add_link.assert_has_calls([
+                    call(
+                        {
+                            'db': 'sra', 'dbfrom': db2search,
+                            'id': fake_uids[:6], 'link': False
+                        },
+                        analyzer=ANY
+                    ),
+                    call(
+                        {
+                            'db': 'sra', 'dbfrom': db2search,
+                            'id': fake_uids[6:], 'link': False
+                        },
+                        analyzer=ANY
+                    )
+                ])
+            self.fake_econduit.pipeline.add_fetch.has_calls([
+                call(
+                    {
+                        'rettype': 'docsum', 'retmode': 'xml',
+                        'reqsize': 6, 'retmax': 2
+                    },
+                    analyzer=ANY, dependency=ANY)
+
+            ] * 2)
             mock_get.assert_called_once_with(
                 'someone@somewhere.com', 1, exp_ids, True, 'INFO', ANY
             )
+            self.assertListEqual(sorted(exp_ids), obs_ids)
 
     @patch('q2_fondue.metadata._get_run_meta')
     @patch('q2_fondue.metadata._get_other_meta')
diff --git a/q2_fondue/tests/test_sequences.py b/q2_fondue/tests/test_sequences.py
index a78ca5b..fa7dfdb 100644
--- a/q2_fondue/tests/test_sequences.py
+++ b/q2_fondue/tests/test_sequences.py
@@ -724,7 +724,7 @@ def test_get_sequences_other(
             test_temp_md, email='some@where.com', retries=0)
 
         mock_get.assert_called_with(
-            'some@where.com', 1, [acc_id], id_type, 'INFO'
+            'some@where.com', 1, [acc_id], None, id_type, 'INFO'
         )
         mock_proc.assert_has_calls([
             call(target=_run_fasterq_dump_for_all, args=(

From 487f1c2cd7a9885fb79e1f5a7d51241a257149f6 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Wed, 17 Aug 2022 12:01:10 +0200
Subject: [PATCH 2/9] Add query test

---
 q2_fondue/tests/test_query.py | 40 +++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 q2_fondue/tests/test_query.py

diff --git a/q2_fondue/tests/test_query.py b/q2_fondue/tests/test_query.py
new file mode 100644
index 0000000..836dca8
--- /dev/null
+++ b/q2_fondue/tests/test_query.py
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022, Bokulich Laboratories.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import unittest
+
+from pandas.testing import assert_frame_equal
+from qiime2.plugins import fondue
+from unittest.mock import patch
+
+from q2_fondue.tests.test_sequences import SequenceTests
+
+
+class TestQuery(SequenceTests):
+    package = 'q2_fondue.tests'
+
+    @patch(
+        'q2_fondue.query._get_run_ids',
+        return_value=['SRR123', 'SRR234']
+    )
+    def test_query(self, mock_ids):
+        query = 'some magical query text'
+
+        obs_ids, = fondue.actions.get_ids_from_query(
+            query, 'fake@email.com', 1, 'DEBUG'
+        )
+        exp_ids = pd.DataFrame(index=pd.Index(['SRR123', 'SRR234'], name='ID'))
+
+        mock_ids.assert_called_once_with(
+            'fake@email.com', 1, None, query, 'biosample', 'DEBUG'
+        )
+        assert_frame_equal(obs_ids.view(pd.DataFrame), exp_ids)
+
+
+if __name__ == "__main__":
+    unittest.main()

From d40af43a5ed3e87683c42045d823c4dec1ec75b2 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Wed, 17 Aug 2022 12:11:10 +0200
Subject: [PATCH 3/9] Remove the http request files

---
 edirect_requests.http | 64 -------------------------------------------
 http-client.env.json  |  5 ----
 2 files changed, 69 deletions(-)
 delete mode 100644 edirect_requests.http
 delete mode 100644 http-client.env.json

diff --git a/edirect_requests.http b/edirect_requests.http
deleted file mode 100644
index f8ba1d1..0000000
--- a/edirect_requests.http
+++ /dev/null
@@ -1,64 +0,0 @@
-// ESearch request
-GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi
-    ?email=mziemski%40ethz.ch
-    &db=biosample
-    &term=txid410656%5BOrganism%5D+AND+biosample+sra%5BFilter%5D+AND+%22public%22%5BFilter%5D+AND+%28mouse+OR+rat+OR+pig+OR+dog+OR+human%29
-    &retmode=json
-    &retmax={{ retmax }}
-    &retstart=0
-#    &usehistory=y
-
-> {%
- var webenv = response.body.esearchresult.webenv
- var qkey = response.body.esearchresult.querykey
- var ids = response.body.esearchresult.idlist.join(",")
- client.global.set("esearch_retmax", response.body.esearchresult.idlist.length)
- client.log(`${webenv}, ${qkey}, ${ids}`)
- client.global.set("esearch_webenv", webenv)
- client.global.set("esearch_qkey", qkey)
- client.global.set("esearch_ids", ids)
- %}
-<> 2022-07-07T080418.200.json
-
-###
-// ELink request
-GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi
-    ?email=mziemski%40ethz.ch
-    &dbfrom=biosample
-    &db=sra
-    &cmd=neighbor_history
-    &retmode=json
-    &id={{ esearch_ids }}
-#    &WebEnv={{ esearch_webenv }}
-#    &query_key={{ esearch_qkey }}
-
-> {%
- var webenv = response.body.linksets[0].webenv
- var qkey = response.body.linksets[0].linksetdbhistories[0].querykey
- var ids_len = response.body.linksets[0].ids.length
- var first_id = response.body.linksets[0].ids[0]
- client.log(`${webenv}, ${qkey}, ${ids_len} IDs, first ID was: ${first_id}`)
- client.global.set("elink_webenv", webenv)
- client.global.set("elink_qkey", qkey)
- %}
-
-###
-// EFetch request
-GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
-    ?email=mziemski%40ethz.ch
-    &db=sra
-    &retmode=xml
-    &rettype=docsum
-    &retstart=0
-    &retmax=10000
-    &WebEnv={{ elink_webenv }}
-    &query_key={{ elink_qkey }}
-
-> {%
- var id_count = (response.body.match(/<DocSum>/g) || []).length;
- // client.log(`There were ${id_count} IDs in the response.`)
- client.log(parseInt(client.global.get("esearch_retmax")))
- client.test("Request executed successfully", function() {
-   client.assert(id_count <= parseInt(client.global.get("esearch_retmax")), `Only ${id_count} IDs were found`);
- });
- %}
diff --git a/http-client.env.json b/http-client.env.json
deleted file mode 100644
index 3475a3a..0000000
--- a/http-client.env.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "dev": {
-    "retmax": "0"
-  }
-}
\ No newline at end of file

From 75ebdb5bb0b72894c9e7e91562840b29afe6cae3 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Wed, 17 Aug 2022 13:57:56 +0200
Subject: [PATCH 4/9] Add missing test case

---
 q2_fondue/tests/test_metadata.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/q2_fondue/tests/test_metadata.py b/q2_fondue/tests/test_metadata.py
index 98ed85f..b3bef82 100644
--- a/q2_fondue/tests/test_metadata.py
+++ b/q2_fondue/tests/test_metadata.py
@@ -391,7 +391,8 @@ def test_get_run_meta_one_invalid_id(self, patch_ef, patch_val, patch_es):
         ("study", "sra"),
         ("bioproject", "bioproject"),
         ("experiment", "sra"),
-        ("sample", "sra")
+        ("sample", "sra"),
+        ("biosample", "biosample")
     ])
     @patch('q2_fondue.metadata._get_run_meta')
     @patch('entrezpy.esearch.esearcher.Esearcher')

From d139776e1ee3cab830f7a3ed802f4378acb14740 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Wed, 17 Aug 2022 14:11:54 +0200
Subject: [PATCH 5/9] Add more comments

---
 q2_fondue/entrezpy_clients/_pipelines.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py
index 4b03880..94d20da 100644
--- a/q2_fondue/entrezpy_clients/_pipelines.py
+++ b/q2_fondue/entrezpy_clients/_pipelines.py
@@ -55,7 +55,15 @@ def _get_run_ids(
         db = 'sra'
         elink = False
 
-    # find UIDS based on a query
+    # find UIDs based on a query;
+    # instead of saving the result on the history server
+    # we keep all the UIDs returned by the search and pass
+    # them explicitly to the mini-pipeline below; this way we
+    # are not limited by the (unknown) maximum number of IDs
+    # that ELink accepts in a single request - it errors out
+    # when given too many, which could presumably happen e.g.
+    # when we ask for more than 10000 BioProject IDs or when
+    # the text query returns more than 10000 IDs
     esearcher = searcher.Esearcher(
         'esearcher', email, apikey=None,
         apikey_var=None, threads=n_jobs, qid=None)
@@ -66,7 +74,7 @@ def _get_run_ids(
         },
         analyzer=ESearchAnalyzer(ids))
 
-    # use the UIDS to link to other DBs and fetch related records
+    # use the UIDs to link to other DBs and fetch related records;
     # we won't be using multi-threading here as this shouldn't take
     # long (we're only fetching IDs) and we don't want those dead
     # threads afterwards

From 3b7a2e63f0cf49733d967aa58761279b3b3c0af4 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Wed, 17 Aug 2022 14:19:27 +0200
Subject: [PATCH 6/9] Adjust action input descriptions

---
 q2_fondue/plugin_setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/q2_fondue/plugin_setup.py b/q2_fondue/plugin_setup.py
index a68fb38..a7bd00c 100644
--- a/q2_fondue/plugin_setup.py
+++ b/q2_fondue/plugin_setup.py
@@ -269,7 +269,8 @@
     outputs=[('ids', NCBIAccessionIDs)],
     input_descriptions={},
     parameter_descriptions={
-        'query': 'Search query to find SRA IDs by.',
+        'query': 'Search query to retrieve SRA IDs from '
+                 'the BioSample database.',
         **common_param_descr
     },
     output_descriptions={
@@ -277,7 +278,8 @@
     },
     name='Find SRA accession IDs based on a search query.',
     description=(
-        'Find SRA accession IDs based on a search query.'
+        'Find SRA accession IDs in the BioSample database '
+        'using a text search query.'
     ),
     citations=[]
 )

From 9ca7d393e63a977e16b9a9aa4dc8d5f7f056f7ff Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Wed, 17 Aug 2022 15:31:49 +0200
Subject: [PATCH 7/9] Fix fetching run ids from studies and alike

---
 q2_fondue/entrezpy_clients/_pipelines.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py
index 94d20da..9a48fbf 100644
--- a/q2_fondue/entrezpy_clients/_pipelines.py
+++ b/q2_fondue/entrezpy_clients/_pipelines.py
@@ -95,11 +95,17 @@ def _get_run_ids(
             el = None
 
         # given SRA run IDs, fetch all metadata
+        efetch_params = {
+            'rettype': 'docsum', 'retmode': 'xml',
+            'reqsize': BATCH_SIZE, 'retmax': len(_ids)
+        }
+        if not elink:
+            # we need to specify these manually as in this scenario
+            # EFetch is not linked to anything
+            efetch_params.update({'id': _ids, 'db': db})
+
         run_ids_pipeline.add_fetch(
-            {
-                'rettype': 'docsum', 'retmode': 'xml',
-                'reqsize': BATCH_SIZE, 'retmax': len(_ids)
-            },
+            efetch_params,
             analyzer=EFetchAnalyzer(log_level), dependency=el
         )
 

From 1efba4944c16dab458311443a5d97d98d41dfe10 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Thu, 18 Aug 2022 16:39:54 +0200
Subject: [PATCH 8/9] Review suggestions

---
 q2_fondue/entrezpy_clients/_pipelines.py | 19 ++++++++++++-------
 q2_fondue/plugin_setup.py                |  8 ++++----
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/q2_fondue/entrezpy_clients/_pipelines.py b/q2_fondue/entrezpy_clients/_pipelines.py
index 9a48fbf..49276ba 100644
--- a/q2_fondue/entrezpy_clients/_pipelines.py
+++ b/q2_fondue/entrezpy_clients/_pipelines.py
@@ -26,9 +26,10 @@ def _get_run_ids(
         email: str, n_jobs: int, ids: Union[list, None],
         query: Union[str, None], source: str, log_level: str
 ) -> list:
-    """Pipeline to retrieve metadata of run IDs associated with
-    studies (`source`='study'), bioprojects (`source`='bioproject'),
-    samples (`source`='sample') or experiments (`source`='experiment')
+    """Pipeline to retrieve run IDs associated with a BioSample query
+    (provided in `query`) or with other aggregate IDs like studies
+    (`source`='study'), bioprojects (`source`='bioproject'), samples
+    (`source`='sample') or experiments (`source`='experiment')
     provided in `ids`.
 
     Args:
@@ -82,6 +83,10 @@ def _get_run_ids(
     set_up_entrezpy_logging(econduit, log_level)
     run_ids_pipeline = econduit.new_pipeline()
 
+    # create a pipeline to link and fetch the run IDs;
+    # we process the IDs obtained from the previous step in batches
+    # (as recommended by NCBI) since ELink cannot handle more than
+    # a certain number of IDs at the same time
     for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE):
         if elink:
             el = run_ids_pipeline.add_link(
@@ -111,10 +116,10 @@ def _get_run_ids(
 
     econduit.run(run_ids_pipeline)
 
-    # recover metadata from all instances of EFetchAnalyzer
-    all_meta = []
+    # recover run IDs from all instances of EFetchAnalyzer
+    all_run_ids = []
     for x in econduit.analyzers.values():
         if isinstance(x, EFetchAnalyzer):
-            all_meta.extend(x.result.metadata)
+            all_run_ids.extend(x.result.metadata)
 
-    return sorted(all_meta)
+    return sorted(all_run_ids)
diff --git a/q2_fondue/plugin_setup.py b/q2_fondue/plugin_setup.py
index a7bd00c..10759d3 100644
--- a/q2_fondue/plugin_setup.py
+++ b/q2_fondue/plugin_setup.py
@@ -67,7 +67,7 @@
                     'for all the requested IDs.',
     'failed_runs': 'List of all run IDs for which fetching {} failed, '
                    'with their corresponding error messages.',
-    'ids': 'Artifact containing retrieved SRA accession IDs.'
+    'ids': 'Artifact containing retrieved SRA run accession IDs.'
 }
 
 output_scraper_txt = 'Artifact containing all {} IDs scraped from ' \
@@ -269,16 +269,16 @@
     outputs=[('ids', NCBIAccessionIDs)],
     input_descriptions={},
     parameter_descriptions={
-        'query': 'Search query to retrieve SRA IDs from '
+        'query': 'Search query to retrieve SRA run IDs from '
                  'the BioSample database.',
         **common_param_descr
     },
     output_descriptions={
         'ids': output_descriptions['metadata'],
     },
-    name='Find SRA accession IDs based on a search query.',
+    name='Find SRA run accession IDs based on a search query.',
     description=(
-        'Find SRA accession IDs in the BioSample database '
+        'Find SRA run accession IDs in the BioSample database '
         'using a text search query.'
     ),
     citations=[]

From 88d2680fe0c4221a62a9b82e54746c180cd40839 Mon Sep 17 00:00:00 2001
From: Michal Ziemski <mziemski@ethz.ch>
Date: Fri, 19 Aug 2022 10:42:03 +0200
Subject: [PATCH 9/9] Update README

---
 README.md | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 8fba2a0..87fd9b9 100644
--- a/README.md
+++ b/README.md
@@ -61,14 +61,15 @@ To find out which temporary directory is used by Qiime 2, you can run `echo $TMP
 ### Available actions
 q2-fondue provides a couple of actions to fetch and manipulate nucleotide sequencing data and related metadata from SRA as well as an action to scrape run, study, BioProject, experiment and sample IDs from a Zotero web library. Below you will find a list of available actions and their short descriptions.
 
-| Action           | Description                                                              |
-|------------------|--------------------------------------------------------------------------|
-| `get-sequences`  | Fetch sequences by IDs[*] from the SRA repository.        |
-| `get-metadata`   | Fetch metadata by IDs[*] from the SRA repository.         |
-| `get-all`        | Fetch sequences and metadata by IDs[*] from the SRA repo. |
-| `merge-metadata` | Merge several metadata files into a single metadata object.              |
-| `combine-seqs`   | Combine sequences from multiple artifacts into a single artifact.        |
-| `scrape-collection`| Scrape Zotero collection for IDs[*] and associated DOI names.|
+| Action               | Description                                                              |
+|----------------------|--------------------------------------------------------------------------|
+| `get-sequences`      | Fetch sequences by IDs[*] from the SRA repository.        |
+| `get-metadata`       | Fetch metadata by IDs[*] from the SRA repository.         |
+| `get-all`            | Fetch sequences and metadata by IDs[*] from the SRA repo. |
+| `get-ids-from-query` | Find SRA run accession IDs based on a search query. |
+| `merge-metadata`     | Merge several metadata files into a single metadata object.              |
+| `combine-seqs`       | Combine sequences from multiple artifacts into a single artifact.        |
+| `scrape-collection`  | Scrape Zotero collection for IDs[*] and associated DOI names.|
 
 [*]: Supported IDs include run, study, BioProject, experiment and study IDs.
 
@@ -116,6 +117,21 @@ where:
 - `--o-experiment-ids` is the output artifact containing the scraped experiment IDs.
 - `--o-sample-ids` is the output artifact containing the scraped sample IDs.
 
+3) To retrieve run accession IDs based on a text search query (performed on the BioSample database) you can use the `get-ids-from-query` method:
+```shell
+qiime fondue get-ids-from-query \
+              --p-query "txid410656[Organism] AND \"public\"[Filter] AND (chicken OR poultry)" \
+              --p-email your_email@somewhere.com \
+              --p-n-jobs 2 \
+              --o-ids run_ids.qza \
+              --verbose
+```
+where:
+- `--p-query` is the text search query to be executed on the BioSample database.
+- `--p-email` is your email address (required by NCBI).
+- `--p-n-jobs` is the number of parallel download jobs (defaults to 1).
+- `--o-ids` is the output artifact containing the retrieved run IDs.
+
 ### Fetching metadata
 To fetch metadata associated with a set of output IDs, execute the following command: