Switch default behavior of retrieve_data() to downloading the datasets #59

Merged (10 commits) on Apr 18, 2018
97 changes: 51 additions & 46 deletions R/retrieve_data.R
@@ -1,67 +1,53 @@
#' Retrieve data from BacDive
#'
#' @param searchTerm Mandatory character string (in case of the `searchType`
#' @param searchTerm Mandatory character string (in case of `searchType = `
#' `sequence`, `culturecollectionno` or `taxon`) or integer (in case of
#' `bacdive_id`), specifying what shall be searched for.
#'
#' @param searchType Mandatory character string that specifies which type of
#' search will be performed (technically, which API endpoint). Can be `taxon`
#' (default), `bacdive_id`, `sequence`, or `culturecollectionno`.
#'
#' @param force_taxon_download Logical; default: `FALSE`. In case of a taxon
#' search, BacDive will return not the actual data of the search results, but
#' only a paged list of URLs pointing to the actual datasets. Setting
#' `force_taxon_download = TRUE` (default: `FALSE`) triggers many downloads of
#' the individual result datasets. Please note: This may take much longer than
#' an unambiguous search, and may cause R(Studio) to become temporarily
#' unresponsive. Maybe go for a walk for a few minutes ;-)
#'
#' @return EITHER (from an unambiguous searchTerm, or in case of
#' `force_taxon_download = TRUE`) a list of lists containing the single
#' BacDive dataset for that `searchTerm`,
#'
#' OR (from an _am_biguous search; e.g. `searchType = "taxon"`) a numeric vector
#' of BacDive-IDs, on which you can call your own loop containing
#' `retrieve_data()` to retrieve the individual data sets.
#' @return A list of lists containing either a single BacDive dataset in case
#' the `searchTerm` was unambiguous (`bacdive_id`, `sequence`,
#' `culturecollectionno`), or a large list containing all datasets that match
#' an ambiguous `searchTerm` (as is typical for `taxon` searches).
#'
#' @export
#' @examples retrieve_data(searchTerm = "Bacillus subtilis subtilis")
#' # This returns a numeric vector of IDs. To download all the corresponding
#' # data, use:
#' retrieve_data("Bacillus subtilis subtilis", force_taxon_download = TRUE)
#' retrieve_data("Bacillus subtilis subtilis")
#'
#' # In case the `searchTerm` is unambiguous already, the data download will
#' # proceed automatically:
#' retrieve_data(searchTerm = "DSM 319", searchType = "culturecollectionno")
#' retrieve_data(searchTerm = "AJ000733", searchType = "sequence")
#' retrieve_data(searchTerm = 717, searchType = "bacdive_id")
retrieve_data <- function(searchTerm,
searchType = "taxon",
force_taxon_download = FALSE) {
searchType = "taxon") {
payload <-
jsonlite::fromJSON(download(construct_url(searchTerm, searchType)))



if (identical(payload$detail, "Not found"))
{
stop(
"Your search returned no result, sorry. Please make sure that you provided a searchTerm, and specified the correct searchType. Please type '?retrieve_data' and read through the 'searchType' section to learn more."
)
}
else if (is_paged(payload) && !force_taxon_download)
aggregate_result_IDs(payload)
else if (is_paged(payload) && force_taxon_download)
else if (is_dataset(payload))
{
if (payload$count > 100) warn_slow_download(payload$count)
aggregate_datasets(payload)
payload <- list(payload)
names(payload) <- searchTerm
return(payload)
}
else if (length(payload$results) == 1)
else
{
# Repeat the download if the API returned a single ID instead of a full dataset
jsonlite::fromJSON(download(paste0(payload[1]$url, "?format=json")))
if (!is.null(payload$count) &&
payload$count > 100)
warn_slow_download(payload$count)
aggregate_datasets(payload)
}
else payload
}
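
For an ambiguous search where the full download is not wanted, the returned ID vector can still be fed into a small loop, as the docs above suggest. A minimal sketch, assuming the `retrieve_IDs()` helper used in the vignette below returns a numeric vector of BacDive IDs:

```r
# Sketch: fetch each dataset behind an ambiguous taxon search individually.
# retrieve_IDs() is assumed to return a numeric vector of BacDive IDs.
ids <- retrieve_IDs(searchTerm = "Bacillus halodurans")
datasets <- lapply(ids, function(id) {
  retrieve_data(searchTerm = id, searchType = "bacdive_id")
})
names(datasets) <- ids
```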


@@ -74,20 +60,20 @@ retrieve_data <- function(searchTerm,
#' but the tests also call it with other inputs.
#'
#' @return A serialised JSON string.
download <- function(URL, userpwd = paste(get_credentials(), collapse = ":")) {
gsub(
pattern = "[[:space:]]+",
replacement = " ",
perl = TRUE,
# Prevent "lexical error: invalid character inside string."
# https://github.com/jeroen/jsonlite/issues/47
RCurl::getURL(
URL,
userpwd = userpwd,
httpauth = 1L
download <-
function(URL,
userpwd = paste(get_credentials(), collapse = ":")) {
gsub(
pattern = "[[:space:]]+",
replacement = " ",
perl = TRUE,
# Prevent "lexical error: invalid character inside string."
# https://github.com/jeroen/jsonlite/issues/47
RCurl::getURL(URL,
userpwd = userpwd,
httpauth = 1L)
)
)
}
}
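
The whitespace normalisation in `download()` only exists to keep `jsonlite::fromJSON()` from failing on raw line breaks inside JSON strings. The `gsub()` call in isolation, using the medium-composition string quoted in the tests below:

```r
# BacDive entries can contain raw "\r\n" inside JSON string values, which
# jsonlite rejects with "lexical error: invalid character inside string."
broken <- "Name: ISP 2 / Yeast Malt Agar (5265); 5265\r\nComposition"
gsub(pattern = "[[:space:]]+", replacement = " ", broken, perl = TRUE)
#> [1] "Name: ISP 2 / Yeast Malt Agar (5265); 5265 Composition"
```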


#' Aggregate BacDive-IDs from a Paged List of Retrieved URLs
@@ -118,12 +104,19 @@ aggregate_result_IDs <- function(results) {
#'
#' @return A character vector of the dataset URLs within the results.
aggregate_result_URLs <- function(results) {

if (length(results$url) == 1)
URLs <- results$url
else
{
URLs <- c()
while (TRUE) {
URLs <- c(URLs, unlist(results$results, use.names = FALSE))
if (!is.null(results$`next`))
results <- jsonlite::fromJSON(download(results$`next`))
else break
else
break
}
}
return(paste0(URLs, "?format=json"))
}
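
The loop follows the standard paged shape (`count`/`next`/`previous`/`results`) until `next` is empty. A hypothetical single-page response makes the contract concrete:

```r
# Hypothetical first (and only) page of a paged BacDive response; the loop
# keeps following `next` until it is NULL, collecting result URLs on the way.
page1 <- list(
  count = 2L,
  `next` = NULL,  # a real multi-page response would hold the page-2 URL here
  previous = NULL,
  results = list("https://bacdive.dsmz.de/api/bacdive/bacdive_id/1095/",
                 "https://bacdive.dsmz.de/api/bacdive/bacdive_id/1847/")
)
aggregate_result_URLs(page1)
#> [1] "https://bacdive.dsmz.de/api/bacdive/bacdive_id/1095/?format=json"
#> [2] "https://bacdive.dsmz.de/api/bacdive/bacdive_id/1847/?format=json"
```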
@@ -133,6 +126,18 @@ URLs_to_IDs <- function(URLs) {
gsub(pattern = "\\D", "", URLs)
}
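
`URLs_to_IDs()` simply strips every non-digit character, so the only digits left in a dataset URL are the ID:

```r
URLs_to_IDs("https://bacdive.dsmz.de/api/bacdive/bacdive_id/717/?format=json")
#> [1] "717"  # still a character string, hence the as.numeric() in the tests
```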

is_paged <- function(payload) {
identical(names(payload), c("count", "next", "previous", "results"))
is_dataset <- function(payload) {
identical(
names(payload),
c(
"taxonomy_name",
"morphology_physiology",
"culture_growth_condition",
"environment_sampling_isolation_source",
"application_interaction",
"molecular_biology",
"strain_availability",
"references"
)
)
}
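
A toy payload showing the eight-section shape that `is_dataset()` keys on; the values are empty placeholders, only the names matter:

```r
# Hypothetical dataset payload: the names match BacDive's top-level sections.
payload <- stats::setNames(
  vector("list", 8),
  c("taxonomy_name", "morphology_physiology", "culture_growth_condition",
    "environment_sampling_isolation_source", "application_interaction",
    "molecular_biology", "strain_availability", "references")
)
is_dataset(payload)
#> [1] TRUE
```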
34 changes: 20 additions & 14 deletions tests/testthat/test-retrieve_data.R
@@ -59,13 +59,21 @@ test_that("using the taxon search for a single dataset works", {

test_that("downloading a single dataset via culturecollectionno works (#45)",
{
expect_type(
object = retrieve_data(
expect_true(
all(c(retrieve_data(
searchTerm = "DSM 319",
searchType = "culturecollectionno",
force_taxon_download = TRUE
),
type = "list"
searchType = "culturecollectionno"
)$`717`$taxonomy_name$strains_tax_PNU$ID_reference,
retrieve_data(
searchTerm = "AJ000733",
searchType = "sequence"
)$`717`$taxonomy_name$strains_tax_PNU$ID_reference,
retrieve_data(
searchTerm = 717,
searchType = "bacdive_id"
)$`717`$taxonomy_name$strains_tax_PNU$ID_reference
) == 20215
)
)
})

@@ -74,8 +82,7 @@ test_that("extracting a single field from a taxon-wide search works", {
expect_equal(30,
unique(unlist(
purrr::map(
.x = retrieve_data("Aminobacter aminovorans",
force_taxon_download = TRUE),
.x = retrieve_data("Aminobacter aminovorans"),
.f = ~ as.numeric(.x$culture_growth_condition$culture_temp$temp)
)
)))
@@ -84,12 +91,12 @@ test_that("extracting a single field from a taxon-wide search works", {

# test set with 2 strains
Bac_hal <- "Bacillus halotolerans"
Bac_hal_data <- retrieve_data(searchTerm = Bac_hal,
force_taxon_download = TRUE)
Bac_hal_data <- retrieve_data(searchTerm = Bac_hal)

test_that("any dataset returned by BacDiveR is named with its ID", {
expect_equal(retrieve_data(searchTerm = Bac_hal),
as.numeric(names(Bac_hal_data)))
expect_equal(names(Bac_hal_data),
c("1095", "1847"))
# https://bacdive.dsmz.de/advsearch?advsearch=search&site=advsearch&searchparams%5B73%5D%5Bcontenttype%5D=text&searchparams%5B73%5D%5Btypecontent%5D=exact&searchparams%5B73%5D%5Bsearchterm%5D=Bacillus+halotolerans&csv_bacdive_ids_advsearch=download
})

test_that("normalising invalid JSON newlines works", {
@@ -101,8 +108,7 @@ test_that("normalising invalid JSON newlines works", {
# https://bacdive.dsmz.de/api/bacdive/bacdive_id/1847/?format=json
# contains "medium_composition": "Name: ISP 2 / Yeast Malt Agar (5265); 5265\r\nComposition
expect_type(
object = retrieve_data(searchTerm = "Bacillus cytotoxicus",
force_taxon_download = TRUE),
object = retrieve_data(searchTerm = "Bacillus cytotoxicus"),
type = "list"
)
# https://bacdive.dsmz.de/api/bacdive/bacdive_id/1323/?format=json
12 changes: 6 additions & 6 deletions vignettes/BacDive-ing-in.Rmd
@@ -29,11 +29,11 @@ returns a paginated list of strains that you can then access, download and analy
```{r taxon_Bac}
library(BacDiveR)
taxon_1 <- "Bacillus halodurans"
Bac_IDs <- retrieve_data(searchTerm = taxon_1)
Bac_IDs <- retrieve_IDs(searchTerm = taxon_1)
head(Bac_IDs)
```

Calling `retrieve_data()` with just a `searchTerm` results in a vector of
Calling `retrieve_IDs()` with just a `searchTerm` results in a vector of the
numeric BacDive IDs. You can use such ID downloads for meta-analyses of different
bacterial taxa, such as comparisons of taxon sizes as they are represented in
the DSMZ's collection.
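
Comparing taxon sizes, for instance, boils down to comparing ID-vector lengths. A sketch, reusing `Bac_IDs` from the chunk above:

```r
# Each BacDive ID corresponds to one strain, so the vector length is the
# number of strains the DSMZ holds for that taxon.
length(Bac_IDs)
length(retrieve_IDs(searchTerm = "Aneurinibacillus thermoaerophilus"))
```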
@@ -45,15 +45,15 @@ we want to compare the optimal growth temperatures of strains from the taxon
*`r taxon_1`* with another one. You can, of course, obtain that data by feeding
the ID vector obtained above into self-made loops that call `retrieve_data(…, searchType = "bacdive_id")`.

However, you can save yourself some time and effort by activating the parameter
`force_taxon_download`. This will get you all taxon data in a single (albeit
However, you can save yourself some time and effort by using `retrieve_data()`
with its default `searchType = "taxon"`. This will get you all taxon data in a single (albeit
large) list of dataframes. Feel free to take a break while the computer
does some work for you:

```{r taxon_At}
taxon_2 <- "Aneurinibacillus thermoaerophilus"
Bac_data <- retrieve_data(taxon_1, force_taxon_download = TRUE)
At_data <- retrieve_data(taxon_2, force_taxon_download = TRUE)
Bac_data <- retrieve_data(taxon_1)
At_data <- retrieve_data(taxon_2)
```
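
From here on, extracting a single field across all strains works as in the test suite above. A sketch, assuming each dataset carries a `culture_growth_condition$culture_temp` entry:

```r
# Collect the reported culture temperatures across all strains of the taxon.
At_temps <- unlist(purrr::map(
  At_data,
  ~ as.numeric(.x$culture_growth_condition$culture_temp$temp)
))
summary(At_temps)
```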


Expand Down