From 904e3927e4313ee86d805c78e6995f18566385b4 Mon Sep 17 00:00:00 2001
From: Philip Chase
Date: Fri, 2 Jun 2023 16:25:55 -0400
Subject: [PATCH 1/3] Add objects_to_include param to unnest_job_summary_data_json_object()

---
 R/unnest_job_summary_data_json_object.R | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/R/unnest_job_summary_data_json_object.R b/R/unnest_job_summary_data_json_object.R
index 1636528..bba576b 100644
--- a/R/unnest_job_summary_data_json_object.R
+++ b/R/unnest_job_summary_data_json_object.R
@@ -37,13 +37,19 @@
 #'
 #' unnest_job_summary_data_json_object(log_data)
 #' }
-unnest_job_summary_data_json_object <- function(log_data) {
+unnest_job_summary_data_json_object <- function(log_data, objects_to_include = NA) {
   log_ids <- log_data$id
   names(log_ids) <- log_ids

-  result <- purrr::map2_dfr(log_ids, log_data$job_summary_data, ~ jsonlite::fromJSON(.y), .id = "id") %>%
-    dplyr::mutate(id = as.integer(.data$id)) %>%
-    dplyr::left_join(log_data %>% dplyr::select(-"job_summary_data"), by = "id")
+  if (is.na(objects_to_include[1]) & length(objects_to_include) == 1) {
+    result <- purrr::map2_dfr(log_ids, log_data$job_summary_data, ~ jsonlite::fromJSON(.y), .id = "id") %>%
+      dplyr::mutate(id = as.integer(.data$id)) %>%
+      dplyr::left_join(log_data %>% dplyr::select(-"job_summary_data"), by = "id")
+  } else {
+    result <- purrr::map2_dfr(log_ids, log_data$job_summary_data, ~ jsonlite::fromJSON(.y)[objects_to_include], .id = "id") %>%
+      dplyr::mutate(id = as.integer(.data$id)) %>%
+      dplyr::left_join(log_data %>% dplyr::select(-"job_summary_data"), by = "id")
+  }

   return(result)
 }
From 60771db4c9c60bf757c27fe2304b236857db0fb5 Mon Sep 17 00:00:00 2001
From: Philip Chase
Date: Mon, 23 Oct 2023 12:39:08 -0400
Subject: [PATCH 2/3] Update docs and tests for unnest_job_summary_data_json_object

Add docs and tests for the objects_to_include parameter.
---
 DESCRIPTION                                   |  2 +-
 R/unnest_job_summary_data_json_object.R       | 28 ++++++++++++++--
 man/unnest_job_summary_data_json_object.Rd    | 31 +++++++++++++++---
 ...test-unnest_job_summary_data_json_object.R |  8 ++---
 .../log_data.rds                              | Bin 460 -> 2562 bytes
 .../make_test_data.R                          | 26 +++++++--------
 6 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1b530df..065f055 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -66,6 +66,6 @@ Suggests:
     testthat (>= 3.0.0)
 VignetteBuilder: knitr
 Config/testthat/edition: 3
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
 Depends:
     R (>= 2.10)

diff --git a/R/unnest_job_summary_data_json_object.R b/R/unnest_job_summary_data_json_object.R
index bba576b..50c288a 100644
--- a/R/unnest_job_summary_data_json_object.R
+++ b/R/unnest_job_summary_data_json_object.R
@@ -7,13 +7,19 @@
 #' frames into a JSON object, so this has become the common practice.
 #'
 #' Yet using the data can be a bit more complex because `jsonlite::fromJSON` is
-#' not vectorized. If you need read multiple job log records, iteration is
+#' not vectorized. If you need to read multiple job log records, iteration is
 #' required to use the JSON object in `job_summary_data`.
 #'
 #' This function addresses that need by accepting a dataframe of job summary logs
-#' and unnesting the JSON object(s) in `job_summary_data` into the equivalent R objects.
+#' and unnesting the JSON object(s) in `job_summary_data` into the equivalent
+#' R objects.
+#'
+#' Use `objects_to_include` to limit the unnested objects to those with similar
+#' geometry. Dissimilar objects cannot be unnested together.
 #'
 #' @param log_data - a dataframe of job summary logs
+#' @param objects_to_include - an optional vector of JSON objects in the
+#'   job_summary_data to be un-nested
 #'
 #' @return - the input dataframe of job summary logs with `job_summary_data`
 #'   replaced by the R objects that were JSON-encoded therein.
 #'
@@ -32,10 +38,26 @@
 #'
 #' log_conn <- get_package_scope_var("log_con")
 #'
 #' log_data <- tbl(log_conn, "rcc_job_log") %>%
-#'   dplyr::filter(script_name == "warn_owners_of_impending_bill") %>%
+#'   dplyr::filter(script_name == "warn_owners_of_impending_bill") |>
 #'   collect()
 #'
 #' unnest_job_summary_data_json_object(log_data)
+#'
+#' # Read a more complex object
+#' log_data_2 <- tbl(log_conn, "rcc_job_log") |>
+#'   dplyr::filter(
+#'     script_name == "create_and_send_new_redcap_prod_per_project_line_items" &
+#'       level == "SUCCESS"
+#'   ) |>
+#'   collect()
+#'
+#' # Fails due to dissimilar objects
+#' unnest_job_summary_data_json_object(log_data_2)
+#' # error is "Can't recycle `service_instance` (size 27) to match `invoice_line_item` (size 79)."
+#'
+#' # When separated, the objects can be unnested
+#' unnest_job_summary_data_json_object(log_data_2, objects_to_include = c("service_instance"))
+#' unnest_job_summary_data_json_object(log_data_2, objects_to_include = c("invoice_line_item"))
 #' }
 unnest_job_summary_data_json_object <- function(log_data, objects_to_include = NA) {
   log_ids <- log_data$id

diff --git a/man/unnest_job_summary_data_json_object.Rd b/man/unnest_job_summary_data_json_object.Rd
index 11b20f2..6439239 100644
--- a/man/unnest_job_summary_data_json_object.Rd
+++ b/man/unnest_job_summary_data_json_object.Rd
@@ -4,10 +4,13 @@
 \alias{unnest_job_summary_data_json_object}
 \title{unnest_job_summary_data_json_object}
 \usage{
-unnest_job_summary_data_json_object(log_data)
+unnest_job_summary_data_json_object(log_data, objects_to_include = NA)
 }
 \arguments{
 \item{log_data}{- a dataframe of job summary logs}
+
+\item{objects_to_include}{- an optional vector of JSON objects in the
+job_summary_data to be un-nested}
 }
 \value{
 - the input dataframe of job summary logs with `job_summary_data`
@@ -20,11 +23,15 @@ but in practice, it is very easy to encode a data frame or a list of
 data frames into a JSON object, so this has become the common practice.

 Yet using the data can be a bit more complex because `jsonlite::fromJSON` is
-not vectorized. If you need read multiple job log records, iteration is
+not vectorized. If you need to read multiple job log records, iteration is
 required to use the JSON object in `job_summary_data`.

 This function addresses that need by accepting a dataframe of job summary logs
-and unnesting the JSON object(s) in `job_summary_data` into the equivalent R objects.
+and unnesting the JSON object(s) in `job_summary_data` into the equivalent
+R objects.
+
+Use `objects_to_include` to limit the unnested objects to those with similar
+geometry. Dissimilar objects cannot be unnested together.
 }
 \examples{
 \dontrun{
@@ -39,9 +46,25 @@ init_etl("pbc_scratch")
 log_conn <- get_package_scope_var("log_con")

 log_data <- tbl(log_conn, "rcc_job_log") \%>\%
-  dplyr::filter(script_name == "warn_owners_of_impending_bill") \%>\%
+  dplyr::filter(script_name == "warn_owners_of_impending_bill") |>
   collect()

 unnest_job_summary_data_json_object(log_data)
+
+# Read a more complex object
+log_data_2 <- tbl(log_conn, "rcc_job_log") |>
+  dplyr::filter(
+    script_name == "create_and_send_new_redcap_prod_per_project_line_items" &
+      level == "SUCCESS"
+  ) |>
+  collect()
+
+# Fails due to dissimilar objects
+unnest_job_summary_data_json_object(log_data_2)
+# error is "Can't recycle `service_instance` (size 27) to match `invoice_line_item` (size 79)."
+
+# When separated, the objects can be unnested
+unnest_job_summary_data_json_object(log_data_2, objects_to_include = c("service_instance"))
+unnest_job_summary_data_json_object(log_data_2, objects_to_include = c("invoice_line_item"))
 }
 }

diff --git a/tests/testthat/test-unnest_job_summary_data_json_object.R b/tests/testthat/test-unnest_job_summary_data_json_object.R
index cecee99..d627fac 100644
--- a/tests/testthat/test-unnest_job_summary_data_json_object.R
+++ b/tests/testthat/test-unnest_job_summary_data_json_object.R
@@ -2,21 +2,21 @@ testthat::test_that("unnest_job_summary_data_json_object works", {
   test_data_dir <- "unnest_job_summary_data_json_object"
   log_data <- readRDS(testthat::test_path(test_data_dir, "log_data.rds"))

-  sample_size <- 3
-  ncols_in_unnested_data <- 3
+  ncols_in_unnested_data <- 5

-  output <- unnest_job_summary_data_json_object(log_data)
+  output <- unnest_job_summary_data_json_object(log_data, objects_to_include = "iris")

   testthat::expect_equal(
     output %>%
       dplyr::distinct(id) %>%
       nrow(),
-    sample_size
+    1
   )

   testthat::expect_equal(
     output %>%
       dplyr::select(-dplyr::any_of(names(log_data))) %>%
+      dplyr::pull(iris) %>%
       ncol(),
     ncols_in_unnested_data
   )

diff --git a/tests/testthat/unnest_job_summary_data_json_object/log_data.rds b/tests/testthat/unnest_job_summary_data_json_object/log_data.rds
index 97c14c3b233a3e46bfb5ddca5454e5583c939e06..a04526ad6c68c7c1f167727a8abce2fce88ecad8 100644
GIT binary patch
literal 2562
zcmV+d3jOsTiwFP!000001HD{ZZ`{Te-pEN^rH$Ru2}J*gfZ)bNS9Uvoha(
zx>zh8Ej~UtSUmbfUM@a)`{w1dK;HX8zIZHu|62b3XtDeG?LxkD`}*U>lj_Z@G}>(B
z#Z%?xlO!wSi@5N(eDR;g!k7Pg^^eb=S4JueAAP7lbWnfj@#{aFe*JD$)-QjyeOdP1
zl|SZL+;jKn?VIJ!(cbM}l<}rKyU8;7OU3Y;#V8C1A4qP$`sZKGt8x_;8}sPT^78t^
zJYvqFxq6?PN8X{iPBz!(k>lBi=4Ex^$cuFmmFAIag@@*yykM5c4$U7n@v3^G#pS1N
zr8W7+=t|x>G%w<)Fps!28Wrc;i?brXGmp$y(U0rM_~Scn{&YCjci@ZGxo_D_-?{N^
zbfEx{CTj@1gP2+D87l(B+69h+;Z=Y0pm&x
zs*>F~7k(Q9VFi3323;v0-)zdbFr;~<5O`Y%T^(|-N=${IYPkl07d8N#uv9U9!WC7pu>dZ42bFUO&P^Ioc_q^vm
zP~H3O;>WrlT0L~XRQ!8w=G)3+zG#B)1S$#mJO6xp)%M>|>ACy;(Xn{ty7y
z@?8ahrw`yn0U+7}&;by185rSS&43LvzxiAuv^f#Tdo
zoQuY}V4d@(CLE~+ghC2}4Y*f_VkWi8GPzAw3WBZb=^=2c5xHHy=nyd0AP8Jl8ieHa
zzscX{r9_Yyxh&2-c_P4Ikdw9s5&B&%_)QA9CzP3NJLH+HfXJdfb`W4PI}JAo8
Rj7eaRwu{#`XV-#mrI=>d{wItLGU^VaZyA?VjL$~uEqdTK^+0t
zRon}XY!$e5&yKluA1IsFja++~7{AMl2;DU*mDp5)9d4}Ha8s3
q+tB+&`gSCcXDtVUq`8RB3ZJ$tOS6xvAgwcvaas9RghoDrShgN=-sXQO?E{or`Pc+
ziKSI@6PNjBx6FO>ysFxL^K%;a0d@w>yQ9r}z`O&^i;?C9Fz(G48c}_+XFi1HePEuC
4h^TEcSnW>F+Mi#9E65pp_#GbX;tHlPOLv?Z~{u~z4^>jB2Np=_8e%)xHCHLedI`3
*ywKU6B(yfJgr-aWB_@RYJTJhA(8>Y@&Sd)sOBMh#5{~W)jW(nl3IwKS~(Csi4Ml|
Je*Q9Vn;d_hDOXuW#L%#!(20#B}wBC8mD$Eqjc+NRfCu(DT7%h-;DDK8{VT6OCy8v
Jl~H!pI3VH3zK89|Gx{Tu}y_wA3wmMd8Dbv??hl|NSg(!Z~|pHA9jr8XH@ee<4zmi
9HfPU_
dfS0&g62V@HSiCfA#@qN2RUiVDv&f!
zA#Qkthx7plCxkkwhK)A!@WE0IR2|mKliCeiMF3m`lX?QqhFK>A5~gW0nzja{ubGAm
z!Z#Hh4j-*R5t?pFS9?g)Az^~mq~ii`Q{Mq>Ie)@*Kyomxr=d~;yZdPa6skQ$#K0WP
z?gDhfyyT4$pGLb4I}mU>XoyMV_;2Q|$?!=F=77Eeq6~nfr^Bm|x=Ea*BZvv42NSL1
z0Coc$KQlW3lz|0FXZV0a63VtjNAJOohHIFx0>U18%FYdwVu+lC8xx&04^B4Bfkuds
z(SuC4DQmz)@ysFyp66krc+zwuon=g+4ANOfzuj1mSCir*$&wZA?-z*UEu?zzhY3S~
z`OJD*c)&h37(gKjgpvZfhrus(1yD(>*KQ9+XmIX$0KZ0ru)`{3*E1xBA6x+;2OiSQ
z8aaTi2G#(xpy9zszM%$l#{-?9(Q8N@!5uX0HF6zc`$D9bVHU`}40mlvC-mz`qf#+K
zg;^dOAB^h&T_f@Di5IO5pc(FGfb|B#R4#&@tkDI8zpC4US
yQv2=KsW~zNQB7vF$Iap3J{?w8KC5N%0xyKFx-<*m5!nJ_c8_!REa
z&yb8J9#Q2GmWC_G-fi~qk#i9EDMu#1vp=uH1`Q7Uz50q@V5~H_JPhWkZ=MyF$bhI
zOsJ#({azf=@0fO(qP@mWW(~d={#pl|Oik)$!X88`2LuHV@Q>FOC>Oy_)2PzHKfAzc
z^ggcm?x)3Kv3*w3^DmzN{olK%KBd5R;$$?%)yU|$Vp<@y4i?+kXl^L
(86G$fnggBjd~iphXse?Y=I`&C$rh`eg9>6C!(Niu%aO-SSHs-ZMUhj_PIUQtz)Z0
zbjIS)6o;_q=)`)5vn|)qyI#8u@e#`aIX=#C)5DNht`ikuqXT5KNqYkqkLFuL1ONc5
C;@rdl

diff --git a/tests/testthat/unnest_job_summary_data_json_object/make_test_data.R b/tests/testthat/unnest_job_summary_data_json_object/make_test_data.R
index 7c929d6..f6c43ba 100644
--- a/tests/testthat/unnest_job_summary_data_json_object/make_test_data.R
+++ b/tests/testthat/unnest_job_summary_data_json_object/make_test_data.R
@@ -5,35 +5,31 @@ library(lubridate)
 library(dotenv)

 # unnest_job_summary_data_json_object/make_test_data.R
-# This script is designed to extract a sample of records from a redcapcustodian log table
-# cycle from each of the 9 redcap_log_event tables.
 #
-# Create a .env file according to the specs of
-# redcapcustodian::connect_to_redcap_db with credentials. Save it at the root
-# of this R Project.
+# Create a .env file according to the specs of redcapcustodian.
+# Ensure it defines a log database.
+# Save it at the root of this R Project.
 #
-# The redcap system read should be reasonably complex so that the returned
-# data is similarly complex.
-#
-# With that in mind, you can run
 library(redcapcustodian)
 library(RMariaDB)
 library(DBI)
 library(tidyverse)
 library(dotenv)

-load_dot_env("../rcc.billing/prod.env")
+load_dot_env(".env")
+
+init_etl("test_unnest")

-init_etl("pbc_scratch")
+# Write some log data for us to read back
+log_list <- lst(mtcars, iris)
+log_job_success(jsonlite::toJSON(log_list))

 log_conn <- get_package_scope_var("log_con")

 test_data_dir <- "unnest_job_summary_data_json_object"
-sample_size <- 3

 log_data <- tbl(log_conn, "rcc_job_log") %>%
-  dplyr::filter(script_name == "update_project_billable_attribute") %>%
-  dplyr::collect() %>%
-  dplyr::sample_n(size = sample_size)
+  dplyr::filter(script_name == "test_unnest") %>%
+  dplyr::collect()

 log_data %>%
   saveRDS(testthat::test_path(test_data_dir, "log_data.rds"))
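The new fixture round-trips `mtcars` and `iris` through a single JSON-encoded job summary, which is the shape of data `objects_to_include` exists for. A rough sketch of the behavior these patches enable, using a hypothetical in-memory row in place of a live `rcc_job_log` record (assumes the patched package is installed):

```r
library(redcapcustodian)

# A hypothetical one-row stand-in for a record read from rcc_job_log
log_data <- tibble::tibble(
  id = 1L,
  script_name = "test_unnest",
  job_summary_data = as.character(jsonlite::toJSON(dplyr::lst(mtcars, iris)))
)

# mtcars and iris have dissimilar geometry, so unnest them one at a time
unnest_job_summary_data_json_object(log_data, objects_to_include = "iris")
unnest_job_summary_data_json_object(log_data, objects_to_include = "mtcars")
```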
From 66da3a8f7b376764372cdd3c7a9860f796ca7014 Mon Sep 17 00:00:00 2001
From: Philip Chase
Date: Tue, 24 Oct 2023 12:35:04 -0400
Subject: [PATCH 3/3] Add job_logging vignette

---
 vignettes/job_logging.Rmd | 151 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 vignettes/job_logging.Rmd

diff --git a/vignettes/job_logging.Rmd b/vignettes/job_logging.Rmd
new file mode 100644
index 0000000..da93c82
--- /dev/null
+++ b/vignettes/job_logging.Rmd
@@ -0,0 +1,151 @@
---
title: "Job Logging"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Job Logging}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup, eval = T, include = F}
library(redcapcustodian)
library(tidyverse)
library(kableExtra)
dotenv::load_dot_env(here::here(".env"))
```

REDCap Custodian includes its own job logging facility. The system is designed to log each job run to a single row in a MySQL database. With as few as four lines of code, you can log job success or failure along with a message, e.g.,

```{r minimal-example-of-success, eval = F, include = T, fig.cap = "A minimal example of logging a successful job"}
library(redcapcustodian)
library(dotenv)

init_etl("my_useful_script", log_db_drv = duckdb::duckdb())
# do some stuff ...

log_job_success("It worked!")
```

The code above would produce a row in the logging database like the one shown below.

```{r mock_db_duckdb, eval = T, include = F}
init_etl("my_useful_script", log_db_drv = duckdb::duckdb())
log_job_success("It worked!")
```

```{r read-log-data, eval = T, include = F}
log_conn <- get_package_scope_var("log_con")

log_data <-
  dplyr::tbl(log_conn, "rcc_job_log") |>
  dplyr::filter(script_name == "my_useful_script") |>
  dplyr::collect()
```

```{r minimal-example-of-success-output, eval = T, echo = F, fig.cap = "Data written by `my_useful_script`"}
log_data |>
  kbl() |>
  kable_styling()
```

## Behind the scenes

That's a tiny script, but a lot is happening. Let's decode what's going on in those few lines.

### `library(dotenv)`

REDCap Custodian uses the `dotenv` package to read files of name-value pairs into the environment of the script. By default, it reads `.env` from the current directory. To make logging work correctly, that `.env` file will need a stanza like this:

``` ini
LOG_DB_NAME=rcc_log
LOG_DB_HOST=127.0.0.1
LOG_DB_USER=rcc_log
LOG_DB_PASSWORD=password
LOG_DB_PORT=9000
```

The "LOG_DB\_\*" strings are the environment variable names `redcapcustodian` uses to locate the MySQL DB where it stores the logging data. You need to use these exact names, and the values need to point at a database on a MySQL server.
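If `redcapcustodian` cannot find the log database, a quick sanity check is to confirm that `dotenv` actually loaded your values. A minimal sketch, assuming a `.env` file like the stanza above sits in the current directory:

```{r check-env, eval = F}
dotenv::load_dot_env(".env")

# Spot-check the values dotenv placed in the environment
Sys.getenv(c("LOG_DB_NAME", "LOG_DB_HOST", "LOG_DB_PORT"))
```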
### `init_etl()`

`init_etl("my_useful_script")` sets the `script_name` env variable to "my_useful_script", sets the `script_run_time` env variable to the current date and time, and verifies the connection to the database described by the "LOG_DB\_\*" environment variables. If the database is there, `init_etl()` returns quietly. If the database is not there, you will see

``` r
Warning: Failed to connect to log DB: rcc_log_bad
The reason given was:
  Error: Failed to connect: Access denied for user 'rcc_log'@'%' to database 'rcc_log_bad'
```

### `log_job_success()`

`log_job_success("It worked!")` writes a row to the logging database like the example above.

## Failed jobs

When jobs fail, you might want to log that condition; to do that, call `log_job_failure()`. As with `log_job_success()`, you can provide a message to give more detail about what went wrong.

```r
log_job_failure("input dataframe was empty")
```
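In an unattended job, it is common to route both outcomes through one handler so that every run leaves a log row. A minimal sketch, assuming `init_etl()` has already been called and that your job's work is wrapped in a function of your own (`do_the_work()` here is hypothetical):

```{r log-failure-pattern, eval = F}
tryCatch({
  do_the_work() # hypothetical stand-in for the body of your job
  log_job_success("It worked!")
}, error = function(e) {
  # conditionMessage(e) carries the error text into the log record
  log_job_failure(paste("Job failed:", conditionMessage(e)))
})
```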
## More detailed messages

Logging "It worked!" or "input dataframe was empty" is fine, but often you want to provide more detail about a job's activities. Chances are you have some useful facts in R objects already. It is a simple matter to assemble those objects into a list, convert the list into a text string with `jsonlite::toJSON()`, and use that string as the job's message. If you wanted to report some updates you made to the `iris` dataset, you might log them like this:

```{r iris-updates, eval = T, echo = T}
init_etl("iris_updater")
# Do some stuff
# ...
iris_updates <- iris |> dplyr::sample_n(3) |> select(Sepal.Length, Species)
# ...
# success!
log_message <- dplyr::lst(iris_updates)
log_job_success(jsonlite::toJSON(log_message))
```

The `job_summary_data` value can contain any number of JSON-encoded objects. That said, plan them carefully. Ask which questions you will want to answer after a job has run. The package authors have found the most common questions posed are "Why did this value change?", "Who (or what) set the value?", and "When did this change?" Store as little information as is needed to answer those questions succinctly. If you log lots of different objects, you will have a pile of log data to sift through later. We've found we can usually meet these needs by including minimal dataframes of the data written by the job.

If you have multiple jobs writing to the same dataset, consider the value of self-consistency across those jobs. If every update operation on the _invoice_ table is logged in an object named _invoice_updates_, you can more easily search across those jobs.

## Reading back log data

Reading back the log data presents a few challenges. When you are answering the "Who? What? When? Why?" questions, you can filter the log data to the time frame or jobs of interest, but you will still have to search the job summaries of several runs to find the one that made the change you care about.

If you encoded the job summary data with JSON, you'll need to decode it, but `jsonlite::fromJSON` is not vectorized, so you'll have to iterate over the log records. You'll also want to unnest the job summary objects returned by `jsonlite::fromJSON`. REDCap Custodian provides the function `unnest_job_summary_data_json_object()` to help with these operations, e.g.,

```{r mock-updates-to-iris, eval = T, echo = F}
make_updates <- function() {
  Sys.sleep(1)
  iris_updates <- iris |> dplyr::sample_n(3) |> select(Sepal.Length, Species)
  log_job_success(jsonlite::toJSON(dplyr::lst(iris_updates)))
}

make_updates()
make_updates()
```

```{r}
log_conn <- get_package_scope_var("log_con")

log_data <-
  dplyr::tbl(log_conn, "rcc_job_log") |>
  dplyr::filter(script_name == "iris_updater") |>
  dplyr::collect() |>
  dplyr::slice_tail(n = 3)

unnest_job_summary_data_json_object(log_data) |>
  unnest(cols = "iris_updates") |>
  kbl() |>
  kable_styling()
```

```{r, eval = T, include = F}
DBI::dbDisconnect(log_conn)
```
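Once the summaries are unnested, ordinary `dplyr` verbs can answer the "Who? What? When?" questions. For example, a sketch that finds which runs of `iris_updater` touched a `virginica` row, re-using the `log_data` collected above (the `Species` and `Sepal.Length` columns come from the `iris_updates` object logged earlier):

```{r who-what-when, eval = F}
unnest_job_summary_data_json_object(log_data) |>
  unnest(cols = "iris_updates") |>
  filter(Species == "virginica") |>
  select(id, script_name, Sepal.Length, Species)
```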