From 3d34208993331bf74f737c5a8b264ab1eb68da70 Mon Sep 17 00:00:00 2001
From: Martin Westgate <martinjwestgate@gmail.com>
Date: Fri, 8 Nov 2024 19:32:04 +1100
Subject: [PATCH] Ensure that `check_fields()` works consistently without using
 regex

- fix bug where `match.arg()` was always noisy in `request_metadata()`
- rewire `check_fields()` to use data captured from `filter()`, rather than parsing urls
- Place @daxkellie as second author from this version
---
 DESCRIPTION                             |  6 ++--
 R/atlas_media.R                         |  6 ++--
 R/build_query_set.R                     |  2 +-
 R/check.R                               | 34 +++++++++++-----------
 R/collapse_media.R                      |  3 +-
 R/collapse_occurrences.R                |  2 ++
 R/collapse_occurrences_count.R          | 24 ++++++++--------
 R/collapse_species.R                    |  1 +
 R/collapse_species_count.R              | 19 +++++++------
 R/galah_call.R                          |  7 +++--
 R/utilities_internal.R                  | 38 -------------------------
 man/galah.Rd                            |  2 +-
 tests/testthat/test-atlas_media.R       | 18 +++++++++---
 tests/testthat/test-atlas_occurrences.R |  2 +-
 tests/testthat/test-galah_filter.R      |  9 ++++--
 15 files changed, 78 insertions(+), 95 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a7f3f34c..3ddc612f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -7,13 +7,13 @@ Authors@R:
              family = "Westgate",
              email = "martin.westgate@csiro.au",
              role = c("aut", "cre")),
-      person(given = "Matilda",
-             family = "Stevenson",
-             role = "aut"),
       person(given = "Dax",
              family = "Kellie",
              email = "dax.kellie@csiro.au",
              role = "aut"),
+      person(given = "Matilda",
+             family = "Stevenson",
+             role = "aut"),
       person(given = "Peggy",
              family = "Newman",
              email = "peggy.newman@csiro.au",
diff --git a/R/atlas_media.R b/R/atlas_media.R
index f7a66022..44c7b678 100644
--- a/R/atlas_media.R
+++ b/R/atlas_media.R
@@ -71,9 +71,9 @@ atlas_media <- function(request = NULL,
   # note that behaviour here depends on whether we have run compute_checks() above
   if(inherits(.query, "data_request")){
     .query$filter <- bind_rows(.query$filter, 
-                              tibble(variable = "media",
-                                     logical = "==",
-                                     value = paste(present_fields, collapse = "|"),
+                              tibble(variable = present_fields,
+                                     logical = "!=",
+                                     value = "\"\"",
                                      query = as.character(media_fq)))  
   }else if(inherits(.query, "query")){ # i.e. if .query is already a `query`
     url <- url_parse(.query$url)
diff --git a/R/build_query_set.R b/R/build_query_set.R
index 7b240bb3..307f1ce1 100644
--- a/R/build_query_set.R
+++ b/R/build_query_set.R
@@ -75,7 +75,7 @@ build_query_set_data <- function(x, mint_doi, ...){
     if(!fields_absent[["select"]] | x$type %in% c("occurrences", "species")){
       result <- list(collapse_fields(), collapse_assertions())
     }else{
-      result <- list() 
+      result <- list()
     }
   }
   # handle `identify()`
diff --git a/R/check.R b/R/check.R
index 08e7eb49..496a0ebc 100644
--- a/R/check.R
+++ b/R/check.R
@@ -202,7 +202,7 @@ check_field_identities <- function(df, .query){
   df
 }
 
-#' sub-function to `check_fields()` for living atlases
+#' sub-function to `check_fields()` for GBIF
 #' @importFrom jsonlite fromJSON
 #' @importFrom purrr pluck
 #' @noRd
@@ -241,7 +241,7 @@ check_fields_gbif_counts <- function(.query){
   c(filter_invalid, group_by_invalid)
 }
 
-#' sub-function to `check_fields()` for living atlases
+#' sub-function to `check_fields()` for GBIF
 #' @importFrom jsonlite fromJSON
 #' @importFrom purrr pluck
 #' @noRd
@@ -272,12 +272,6 @@ check_fields_gbif_predicates <- function(.query){
 #' @noRd
 #' @keywords Internal
 check_fields_la <- function(.query){
-  if(inherits(.query$url, "data.frame")){
-    url <- url_parse(.query$url$url[1])
-  }else{
-    url <- url_parse(.query$url[1])  
-  }
-  queries <- url$query
   
   # set fields to check against
   # NOTE: These are retrieved in collapse()
@@ -287,20 +281,18 @@ check_fields_la <- function(.query){
   
   # extract fields from filter & identify
   filter_invalid <- NA
-  if(is.null(queries$fq)){
+  if(is.null(.query$filter)){
     # note: above was previously: `exists("fq", where = queries)`
     # Error in as.environment(where) : using 'as.environment(NULL)' is defunct
     filters <- NULL
   }else{
-    if (nchar(queries$fq) > 0) {
-      provided_fields <- string_to_tibble(queries$fq)
-      filters <- provided_fields |>
-        pull("value") |>
-        gsub("\\(|\\)|\\-|\\:", "", x = _)
-    } else {
-      filters <- NULL
-    }
-    
+    # extract field names
+    # note: `filter()` often concatenates field names with logical statements
+    # hence `strsplit()` step here
+    filters <- .query$filter$variable |>
+      strsplit("\\||\\&") |>
+      unlist() |>
+      unique()
     if (length(filters) > 0) {
       if (!all(filters %in% valid_any)) {
         invalid_fields <- filters[!(filters %in% valid_any)]
@@ -311,6 +303,12 @@ check_fields_la <- function(.query){
 
   # galah_group_by fields check
   group_by_invalid <- NA
+  if(inherits(.query$url, "data.frame")){
+    url <- url_parse(.query$url$url[1])
+  }else{
+    url <- url_parse(.query$url[1])
+  }
+  queries <- url$query
   if (!is.null(queries$facets)) {
     facets <- queries[names(queries) == "facets"] |> unlist() # NOTE: arrange() is missing
     if (length(facets) > 0) {
diff --git a/R/collapse_media.R b/R/collapse_media.R
index 31dae256..a6aae571 100644
--- a/R/collapse_media.R
+++ b/R/collapse_media.R
@@ -30,8 +30,9 @@ collapse_media <- function(.query){
   result <- list(
     type = "metadata/media",
     url = url_lookup("metadata/media"),
+    headers = build_headers(),
     body = toJSON(list(imageIds = media_ids)),
-    headers = build_headers())
+    filter = .query$filter)
   
   class(result) <- "query"
   return(result)
diff --git a/R/collapse_occurrences.R b/R/collapse_occurrences.R
index a6d5f321..70501f54 100644
--- a/R/collapse_occurrences.R
+++ b/R/collapse_occurrences.R
@@ -42,6 +42,7 @@ collapse_occurrences_uk <- function(.query){
     type = "data/occurrences",
     url = url_build(url),
     headers = build_headers(),
+    filter = .query$filter,
     select = .query$select)
   class(result) <- "query"
   return(result)
@@ -117,6 +118,7 @@ collapse_occurrences_la <- function(.query){
     type = "data/occurrences",
     url = url_build(url),
     headers = build_headers(),
+    filter = .query$filter,
     select = .query$select)
   class(result) <- "query"
   return(result)
diff --git a/R/collapse_occurrences_count.R b/R/collapse_occurrences_count.R
index 3f7c56e4..ee91eab7 100644
--- a/R/collapse_occurrences_count.R
+++ b/R/collapse_occurrences_count.R
@@ -34,15 +34,16 @@ collapse_occurrences_count_atlas <- function(identify = NULL,
                        data_profile = data_profile) 
   # set behaviour depending on `group_by()`
   if(is.null(group_by)){
-    result <- list(type = "data/occurrences-count")
     url <- url_lookup("data/occurrences-count") |> 
       url_parse()
     url$query <- c(query, pageSize = 0)
-    result$url <- url_build(url)
-    result$slot_name <- "totalRecords"
-    result$expand <- FALSE
+    result <- list(type = "data/occurrences-count",
+                   url = url_build(url),
+                   headers = build_headers(),
+                   filter = filter,
+                   slot_name = "totalRecords",
+                   expand = FALSE)
   }else{
-    result <- list(type = "data/occurrences-count-groupby")
     url <- url_lookup("data/occurrences-count-groupby") |> 
       url_parse()
     facets <- as.list(group_by$name)
@@ -56,17 +57,16 @@ collapse_occurrences_count_atlas <- function(identify = NULL,
     if(is.null(arrange)){
       arrange <- tibble(variable = "count", direction = "descending")
     }
-    
     slice_arrange <- bind_cols(slice, arrange) 
     arrange_list <- check_slice_arrange(slice_arrange)
     url$query <- c(query, facets, arrange_list)
-    result$url <- url_build(url)
-    result$expand <- ifelse(length(facets) > 1, TRUE, FALSE)
-    result$arrange <- slice_arrange
+    result <- list(type = "data/occurrences-count-groupby",
+                   url = url_build(url),
+                   headers = build_headers(),
+                   filter = filter,
+                   expand = ifelse(length(facets) > 1, TRUE, FALSE),
+                   arrange = slice_arrange)
   }
-  
-  # aggregate and return
-  result$headers <- build_headers()
   class(result) <- "query"
   return(result)
 }
diff --git a/R/collapse_species.R b/R/collapse_species.R
index 4e8b14aa..070a87e6 100644
--- a/R/collapse_species.R
+++ b/R/collapse_species.R
@@ -42,6 +42,7 @@ collapse_species_atlas <- function(.query){
     type = "data/species",
     url = url_build(url),
     headers = build_headers(),
+    filter = .query$filter,
     download = TRUE)
   class(result) <- "query"
   result
diff --git a/R/collapse_species_count.R b/R/collapse_species_count.R
index 4dd99a9e..a5a8e345 100644
--- a/R/collapse_species_count.R
+++ b/R/collapse_species_count.R
@@ -34,14 +34,16 @@ collapse_species_count_atlas <- function(identify = NULL,
                        filter, 
                        geolocate, 
                        data_profile = data_profile)
-  result <- list(type = "data/species-count")
   # set behaviour depending on `group_by()`
   if(is.null(group_by)){
     url$query <- c(query,
                    list(flimit = 1, 
                         facets = species_facets()))
-    result$url <- url_build(url)
-    result$expand <- FALSE
+    result <- list(type = "data/species-count",
+                   url = url_build(url),
+                   headers = build_headers(),
+                   filter = filter,
+                   expand = FALSE)
   }else{
     facets <- c(as.list(group_by$name), species_facets())
     names(facets) <- rep("facets", length(facets))
@@ -54,12 +56,13 @@ collapse_species_count_atlas <- function(identify = NULL,
     slice_arrange <- bind_cols(slice, arrange) 
     arrange_list <- check_slice_arrange(slice_arrange)
     url$query <- c(query, facets, arrange_list)
-    result$url <- url_build(url)
-    result$expand <- TRUE
-    result$arrange <- slice_arrange
+    result <- list(type = "data/species-count",
+                   url = url_build(url),
+                   headers = build_headers(),
+                   filter = filter,
+                   expand = TRUE,
+                   arrange = slice_arrange)
   }
-  # aggregate and return
-  result$headers <- build_headers()
   class(result) <- "query"
   return(result)
 }
diff --git a/R/galah_call.R b/R/galah_call.R
index 54a55881..1b4d1617 100644
--- a/R/galah_call.R
+++ b/R/galah_call.R
@@ -173,8 +173,9 @@ request_metadata <- function(type = c("fields",
                                       "reasons",
                                       "taxa",
                                       "identifiers")){
-  type <- try(match.arg(type))
-  if(inherits(type, "try-error")){
+  type_checked <- try(match.arg(type),
+                      silent = TRUE)
+  if(inherits(type_checked, "try-error")){
     bullets <- c(
       glue("Unrecognised metadata requested."),
       i = "See `?show_all()` for a list of valid metadata types.",
@@ -182,7 +183,7 @@ request_metadata <- function(type = c("fields",
     )
     abort(bullets)   
   }
-  x <- list(type = type)
+  x <- list(type = type_checked)
   class(x) <- "metadata_request"
   return(x)
 }
diff --git a/R/utilities_internal.R b/R/utilities_internal.R
index 296a4070..24ff347a 100644
--- a/R/utilities_internal.R
+++ b/R/utilities_internal.R
@@ -85,44 +85,6 @@ camel_to_snake_case <- function(x){
     tolower()
 }
 
-#' Simple internal function to split strings
-#' @importFrom stringr str_extract
-#' @importFrom stringr str_extract_all
-#' @importFrom stringr str_trim
-#' @importFrom stringr str_remove
-#' @importFrom tibble as_tibble
-#' @noRd
-#' @keywords Internal
-string_to_tibble <- function(string, split_by = c(":")){
-  # ChatGPT created the reprex in this function. 
-  # Possibly over-engineered, but it works
-  
-  # The pattern logic is:
-  #   (?<=\\(|OR|AND|-): A positive lookbehind to ensure the match is preceded by (, OR, AND, or -.
-  #   \\s*: Matches any whitespace characters.
-  #   ([-\\w\\(\\)]+): Captures one or more word characters, hyphens, or parentheses.
-  #   \\s*(?=:): Matches any whitespace characters followed by a colon, using a positive lookahead to ensure the match is followed by a colon.
-  extracted_strings <-
-    # new
-    stringr::str_extract_all(
-      string,
-      "(?<=\\(|OR|AND)\\s*([-\\w\\(\\)]+)\\s*(?=:)"
-    ) |>
-    # old (v2.0.2)
-    # stringr::str_extract_all(
-    #   string, 
-    #   "(?<=\\()[^\\*]*?(?=\\:)|(?<=OR\\s)[^\\*]*?(?=\\:)"
-    #   ) |>
-    unlist() |>
-    str_remove("-|\\(") |>
-    str_extract("[:alnum:]+") |>
-    str_trim() |>
-    as_tibble() |>
-    unique()
-  return(extracted_strings)
-  
-}
-
 ##---------------------------------------------------------------
 ##                   Other helpful functions                   --
 ##---------------------------------------------------------------
diff --git a/man/galah.Rd b/man/galah.Rd
index 0d67d816..17efead7 100644
--- a/man/galah.Rd
+++ b/man/galah.Rd
@@ -117,8 +117,8 @@ Useful links:
 
 Authors:
 \itemize{
-  \item Matilda Stevenson
   \item Dax Kellie \email{dax.kellie@csiro.au}
+  \item Matilda Stevenson
   \item Peggy Newman \email{peggy.newman@csiro.au}
 }
 
diff --git a/tests/testthat/test-atlas_media.R b/tests/testthat/test-atlas_media.R
index 312d640d..0ee477e2 100644
--- a/tests/testthat/test-atlas_media.R
+++ b/tests/testthat/test-atlas_media.R
@@ -56,14 +56,24 @@ test_that("`collapse()` and `collect()` work for `type = 'media'`", {
     filter(media == occ_collect) |>
     collapse()
   expect_true(inherits(media_collapse, "query"))
-  expect_equal(length(media_collapse), 4)
-  expect_equal(names(media_collapse), c("type", "url", "body", "headers"))
+  expect_equal(length(media_collapse), 5)
+  expect_equal(names(media_collapse), 
+               c("type", 
+                 "url",
+                 "headers",
+                 "body",
+                 "filter"))
   expect_true(media_collapse$type == "metadata/media")
   # compute
   media_compute <- compute(media_collapse)
   expect_true(inherits(media_compute, "computed_query"))
-  expect_equal(length(media_compute), 4)
-  expect_equal(names(media_compute), c("type", "url", "body", "headers"))
+  expect_equal(length(media_compute), 5)
+  expect_equal(names(media_compute), 
+               c("type", 
+                 "url",
+                 "headers",
+                 "body",
+                 "filter"))
   # collect
   media_collect <- collect(media_compute)
   expect_s3_class(media_collect, c("tbl_df", "tbl", "data.frame"))
diff --git a/tests/testthat/test-atlas_occurrences.R b/tests/testthat/test-atlas_occurrences.R
index fe884fae..22c7bee8 100644
--- a/tests/testthat/test-atlas_occurrences.R
+++ b/tests/testthat/test-atlas_occurrences.R
@@ -41,7 +41,7 @@ test_that("`compute(type = 'occurrences')` works", {
   # collapse
   query_collapse <- collapse(base_query)
   expect_true(inherits(query_collapse, "query"))
-  expect_equal(length(query_collapse), 3)
+  expect_equal(length(query_collapse), 4)
   expect_equal(query_collapse$type, "data/occurrences")
   # compute
   response <- compute(base_query)
diff --git a/tests/testthat/test-galah_filter.R b/tests/testthat/test-galah_filter.R
index 62d9106a..49c0ff51 100644
--- a/tests/testthat/test-galah_filter.R
+++ b/tests/testthat/test-galah_filter.R
@@ -256,8 +256,13 @@ test_that("galah_filter handles lsid as an input", {
     collapse()
   # number of taxa searches is 3, not 4
   expect_s3_class(query, "query")
-  expect_equal(length(query), 5)
-  expect_equal(names(query), c("type", "url", "slot_name", "expand", "headers"))
+  expect_equal(length(query), 6)
+  expect_equal(names(query), c("type", 
+                               "url", 
+                               "headers",
+                               "filter",
+                               "slot_name", 
+                               "expand"))
 })
 
 test_that("galah_filter handles different fields separated by OR", {