From 3d34208993331bf74f737c5a8b264ab1eb68da70 Mon Sep 17 00:00:00 2001 From: Martin Westgate Date: Fri, 8 Nov 2024 19:32:04 +1100 Subject: [PATCH] Ensure that `check_fields()` works consistently without using regex - fix bug where `match.arg()` was always noisy in `request_metadata()` - rewire `check_fields()` to use data captured from `filter()`, rather than parsing urls - Place @daxkellie as second author from this version --- DESCRIPTION | 6 ++-- R/atlas_media.R | 6 ++-- R/build_query_set.R | 2 +- R/check.R | 34 +++++++++++----------- R/collapse_media.R | 3 +- R/collapse_occurrences.R | 2 ++ R/collapse_occurrences_count.R | 24 ++++++++-------- R/collapse_species.R | 1 + R/collapse_species_count.R | 19 +++++++------ R/galah_call.R | 7 +++-- R/utilities_internal.R | 38 ------------------------- man/galah.Rd | 2 +- tests/testthat/test-atlas_media.R | 18 +++++++++--- tests/testthat/test-atlas_occurrences.R | 2 +- tests/testthat/test-galah_filter.R | 9 ++++-- 15 files changed, 78 insertions(+), 95 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a7f3f34c..3ddc612f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,13 +7,13 @@ Authors@R: family = "Westgate", email = "martin.westgate@csiro.au", role = c("aut", "cre")), - person(given = "Matilda", - family = "Stevenson", - role = "aut"), person(given = "Dax", family = "Kellie", email = "dax.kellie@csiro.au", role = "aut"), + person(given = "Matilda", + family = "Stevenson", + role = "aut"), person(given = "Peggy", family = "Newman", email = "peggy.newman@csiro.au", diff --git a/R/atlas_media.R b/R/atlas_media.R index f7a66022..44c7b678 100644 --- a/R/atlas_media.R +++ b/R/atlas_media.R @@ -71,9 +71,9 @@ atlas_media <- function(request = NULL, # note that behaviour here depends on whether we have run compute_checks() above if(inherits(.query, "data_request")){ .query$filter <- bind_rows(.query$filter, - tibble(variable = "media", - logical = "==", - value = paste(present_fields, collapse = "|"), + tibble(variable = present_fields, + logical = "!=", + value = "\"\"", query = as.character(media_fq))) }else if(inherits(.query, "query")){ # i.e. if .query is already a `query` url <- url_parse(.query$url) diff --git a/R/build_query_set.R b/R/build_query_set.R index 7b240bb3..307f1ce1 100644 --- a/R/build_query_set.R +++ b/R/build_query_set.R @@ -75,7 +75,7 @@ build_query_set_data <- function(x, mint_doi, ...){ if(!fields_absent[["select"]] | x$type %in% c("occurrences", "species")){ result <- list(collapse_fields(), collapse_assertions()) }else{ - result <- list() + result <- list() } } # handle `identify()` diff --git a/R/check.R b/R/check.R index 08e7eb49..496a0ebc 100644 --- a/R/check.R +++ b/R/check.R @@ -202,7 +202,7 @@ check_field_identities <- function(df, .query){ df } -#' sub-function to `check_fields()` for living atlases +#' sub-function to `check_fields()` for GBIF #' @importFrom jsonlite fromJSON #' @importFrom purrr pluck #' @noRd @@ -241,7 +241,7 @@ check_fields_gbif_counts <- function(.query){ c(filter_invalid, group_by_invalid) } -#' sub-function to `check_fields()` for living atlases +#' sub-function to `check_fields()` for GBIF #' @importFrom jsonlite fromJSON #' @importFrom purrr pluck #' @noRd @@ -272,12 +272,6 @@ check_fields_gbif_predicates <- function(.query){ #' @noRd #' @keywords Internal check_fields_la <- function(.query){ - if(inherits(.query$url, "data.frame")){ - url <- url_parse(.query$url$url[1]) - }else{ - url <- url_parse(.query$url[1]) - } - queries <- url$query # set fields to check against # NOTE: These are retrieved in collapse() @@ -287,20 +281,18 @@ check_fields_la <- function(.query){ # extract fields from filter & identify filter_invalid <- NA - if(is.null(queries$fq)){ + if(is.null(.query$filter)){ # note: above was previously: `exists("fq", where = queries)` # Error in as.environment(where) : using 'as.environment(NULL)' is defunct filters <- NULL }else{ - if (nchar(queries$fq) > 0) { - provided_fields <- string_to_tibble(queries$fq) - filters <- provided_fields |> - pull("value") |> - gsub("\\(|\\)|\\-|\\:", "", x = _) - } else { - filters <- NULL - } - + # extract field names + # note: `filter()` often concatenates field names with logical statements + # hence `strsplit()` step here + filters <- .query$filter$variable |> + strsplit("\\||\\&") |> + unlist() |> + unique() if (length(filters) > 0) { if (!all(filters %in% valid_any)) { invalid_fields <- filters[!(filters %in% valid_any)] @@ -311,6 +303,12 @@ check_fields_la <- function(.query){ # galah_group_by fields check group_by_invalid <- NA + if(inherits(.query$url, "data.frame")){ + url <- url_parse(.query$url$url[1]) + }else{ + url <- url_parse(.query$url[1]) + } + queries <- url$query if (!is.null(queries$facets)) { facets <- queries[names(queries) == "facets"] |> unlist() # NOTE: arrange() is missing if (length(facets) > 0) { diff --git a/R/collapse_media.R b/R/collapse_media.R index 31dae256..a6aae571 100644 --- a/R/collapse_media.R +++ b/R/collapse_media.R @@ -30,8 +30,9 @@ collapse_media <- function(.query){ result <- list( type = "metadata/media", url = url_lookup("metadata/media"), + headers = build_headers(), body = toJSON(list(imageIds = media_ids)), - headers = build_headers()) + filter = .query$filter) class(result) <- "query" return(result) diff --git a/R/collapse_occurrences.R b/R/collapse_occurrences.R index a6d5f321..70501f54 100644 --- a/R/collapse_occurrences.R +++ b/R/collapse_occurrences.R @@ -42,6 +42,7 @@ collapse_occurrences_uk <- function(.query){ type = "data/occurrences", url = url_build(url), headers = build_headers(), + filter = .query$filter, select = .query$select) class(result) <- "query" return(result) @@ -117,6 +118,7 @@ collapse_occurrences_la <- function(.query){ type = "data/occurrences", url = url_build(url), headers = build_headers(), + filter = .query$filter, select = .query$select) class(result) <- "query" return(result) diff --git a/R/collapse_occurrences_count.R b/R/collapse_occurrences_count.R index 3f7c56e4..ee91eab7 100644 --- a/R/collapse_occurrences_count.R +++ b/R/collapse_occurrences_count.R @@ -34,15 +34,16 @@ collapse_occurrences_count_atlas <- function(identify = NULL, data_profile = data_profile) # set behaviour depending on `group_by()` if(is.null(group_by)){ - result <- list(type = "data/occurrences-count") url <- url_lookup("data/occurrences-count") |> url_parse() url$query <- c(query, pageSize = 0) - result$url <- url_build(url) - result$slot_name <- "totalRecords" - result$expand <- FALSE + result <- list(type = "data/occurrences-count", + url = url_build(url), + headers = build_headers(), + filter = filter, + slot_name = "totalRecords", + expand = FALSE) }else{ - result <- list(type = "data/occurrences-count-groupby") url <- url_lookup("data/occurrences-count-groupby") |> url_parse() facets <- as.list(group_by$name) @@ -56,17 +57,16 @@ collapse_occurrences_count_atlas <- function(identify = NULL, if(is.null(arrange)){ arrange <- tibble(variable = "count", direction = "descending") } - slice_arrange <- bind_cols(slice, arrange) arrange_list <- check_slice_arrange(slice_arrange) url$query <- c(query, facets, arrange_list) - result$url <- url_build(url) - result$expand <- ifelse(length(facets) > 1, TRUE, FALSE) - result$arrange <- slice_arrange + result <- list(type = "data/occurrences-count-groupby", + url = url_build(url), + headers = build_headers(), + filter = filter, + expand = ifelse(length(facets) > 1, TRUE, FALSE), + arrange = slice_arrange) } - - # aggregate and return - result$headers <- build_headers() class(result) <- "query" return(result) } diff --git a/R/collapse_species.R b/R/collapse_species.R index 4e8b14aa..070a87e6 100644 --- a/R/collapse_species.R +++ b/R/collapse_species.R @@ -42,6 +42,7 @@ collapse_species_atlas <- function(.query){ type = "data/species", url = url_build(url), headers = build_headers(), + filter = .query$filter, download = TRUE) class(result) <- "query" result diff --git a/R/collapse_species_count.R b/R/collapse_species_count.R index 4dd99a9e..a5a8e345 100644 --- a/R/collapse_species_count.R +++ b/R/collapse_species_count.R @@ -34,14 +34,16 @@ collapse_species_count_atlas <- function(identify = NULL, filter, geolocate, data_profile = data_profile) - result <- list(type = "data/species-count") # set behaviour depending on `group_by()` if(is.null(group_by)){ url$query <- c(query, list(flimit = 1, facets = species_facets())) - result$url <- url_build(url) - result$expand <- FALSE + result <- list(type = "data/species-count", + url = url_build(url), + headers = build_headers(), + filter = filter, + expand = FALSE) }else{ facets <- c(as.list(group_by$name), species_facets()) names(facets) <- rep("facets", length(facets)) @@ -54,12 +56,13 @@ collapse_species_count_atlas <- function(identify = NULL, slice_arrange <- bind_cols(slice, arrange) arrange_list <- check_slice_arrange(slice_arrange) url$query <- c(query, facets, arrange_list) - result$url <- url_build(url) - result$expand <- TRUE - result$arrange <- slice_arrange + result <- list(type = "data/species-count", + url = url_build(url), + headers = build_headers(), + filter = filter, + expand = TRUE, + arrange = slice_arrange) } - # aggregate and return - result$headers <- build_headers() class(result) <- "query" return(result) } diff --git a/R/galah_call.R b/R/galah_call.R index 54a55881..1b4d1617 100644 --- a/R/galah_call.R +++ b/R/galah_call.R @@ -173,8 +173,9 @@ request_metadata <- function(type = c("fields", "reasons", "taxa", "identifiers")){ - type <- try(match.arg(type)) - if(inherits(type, "try-error")){ + type_checked <- try(match.arg(type), + silent = TRUE) + if(inherits(type_checked, "try-error")){ bullets <- c( glue("Unrecognised metadata requested."), i = "See `?show_all()` for a list of valid metadata types.", @@ -182,7 +183,7 @@ request_metadata <- function(type = c("fields", ) abort(bullets) } - x <- list(type = type) + x <- list(type = type_checked) class(x) <- "metadata_request" return(x) } diff --git a/R/utilities_internal.R b/R/utilities_internal.R index 296a4070..24ff347a 100644 --- a/R/utilities_internal.R +++ b/R/utilities_internal.R @@ -85,44 +85,6 @@ camel_to_snake_case <- function(x){ tolower() } -#' Simple internal function to split strings -#' @importFrom stringr str_extract -#' @importFrom stringr str_extract_all -#' @importFrom stringr str_trim -#' @importFrom stringr str_remove -#' @importFrom tibble as_tibble -#' @noRd -#' @keywords Internal -string_to_tibble <- function(string, split_by = c(":")){ - # ChatGPT created the reprex in this function. - # Possibly over-engineered, but it works - - # The pattern logic is: - # (?<=\\(|OR|AND|-): A positive lookbehind to ensure the match is preceded by (, OR, AND, or -. - # \\s*: Matches any whitespace characters. - # ([-\\w\\(\\)]+): Captures one or more word characters, hyphens, or parentheses. - # \\s*(?=:): Matches any whitespace characters followed by a colon, using a positive lookahead to ensure the match is followed by a colon. - extracted_strings <- - # new - stringr::str_extract_all( - string, - "(?<=\\(|OR|AND)\\s*([-\\w\\(\\)]+)\\s*(?=:)" - ) |> - # old (v2.0.2) - # stringr::str_extract_all( - # string, - # "(?<=\\()[^\\*]*?(?=\\:)|(?<=OR\\s)[^\\*]*?(?=\\:)" - # ) |> - unlist() |> - str_remove("-|\\(") |> - str_extract("[:alnum:]+") |> - str_trim() |> - as_tibble() |> - unique() - return(extracted_strings) - -} - ##--------------------------------------------------------------- ## Other helpful functions -- ##--------------------------------------------------------------- diff --git a/man/galah.Rd b/man/galah.Rd index 0d67d816..17efead7 100644 --- a/man/galah.Rd +++ b/man/galah.Rd @@ -117,8 +117,8 @@ Useful links: Authors: \itemize{ - \item Matilda Stevenson \item Dax Kellie \email{dax.kellie@csiro.au} + \item Matilda Stevenson \item Peggy Newman \email{peggy.newman@csiro.au} } diff --git a/tests/testthat/test-atlas_media.R b/tests/testthat/test-atlas_media.R index 312d640d..0ee477e2 100644 --- a/tests/testthat/test-atlas_media.R +++ b/tests/testthat/test-atlas_media.R @@ -56,14 +56,24 @@ test_that("`collapse()` and `collect()` work for `type = 'media'`", { filter(media == occ_collect) |> collapse() expect_true(inherits(media_collapse, "query")) - expect_equal(length(media_collapse), 4) - expect_equal(names(media_collapse), c("type", "url", "body", "headers")) + expect_equal(length(media_collapse), 5) + expect_equal(names(media_collapse), + c("type", + "url", + "headers", + "body", + "filter")) expect_true(media_collapse$type == "metadata/media") # compute media_compute <- compute(media_collapse) expect_true(inherits(media_compute, "computed_query")) - expect_equal(length(media_compute), 4) - expect_equal(names(media_compute), c("type", "url", "body", "headers")) + expect_equal(length(media_compute), 5) + expect_equal(names(media_compute), + c("type", + "url", + "headers", + "body", + "filter")) # collect media_collect <- collect(media_compute) expect_s3_class(media_collect, c("tbl_df", "tbl", "data.frame")) diff --git a/tests/testthat/test-atlas_occurrences.R b/tests/testthat/test-atlas_occurrences.R index fe884fae..22c7bee8 100644 --- a/tests/testthat/test-atlas_occurrences.R +++ b/tests/testthat/test-atlas_occurrences.R @@ -41,7 +41,7 @@ test_that("`compute(type = 'occurrences')` works", { # collapse query_collapse <- collapse(base_query) expect_true(inherits(query_collapse, "query")) - expect_equal(length(query_collapse), 3) + expect_equal(length(query_collapse), 4) expect_equal(query_collapse$type, "data/occurrences") # compute response <- compute(base_query) diff --git a/tests/testthat/test-galah_filter.R b/tests/testthat/test-galah_filter.R index 62d9106a..49c0ff51 100644 --- a/tests/testthat/test-galah_filter.R +++ b/tests/testthat/test-galah_filter.R @@ -256,8 +256,13 @@ test_that("galah_filter handles lsid as an input", { collapse() # number of taxa searches is 3, not 4 expect_s3_class(query, "query") - expect_equal(length(query), 5) - expect_equal(names(query), c("type", "url", "slot_name", "expand", "headers")) + expect_equal(length(query), 6) + expect_equal(names(query), c("type", + "url", + "headers", + "filter", + "slot_name", + "expand")) }) test_that("galah_filter handles different fields separated by OR", {