✨ Implement multi-data loading

NONONOexe · Jan 19, 2025 · 8ede8e7 · 8ede8e7
1 parent a381b29
commit 8ede8e7
Show file tree

Hide file tree

Showing 23 changed files with 729 additions and 482 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,10 +10,12 @@ RoxygenNote: 7.3.2
 Imports: 
     cli,
     curl,
+    dplyr,
     lubridate,
     readr,
     sf,
     stats,
+    tidyr,
     utils,
     yaml
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,28 +1,35 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(post_process,accident_data)
 S3method(print,jpaccidents_config)
+S3method(summary,jpaccidents_config)
 export(convert_deg)
+export(detect_dataset_name)
 export(download_accident_data)
-export(filter_data_by_tag)
+export(extract_schema_columns)
 export(get_config)
 export(get_default_config_path)
 export(load_config)
 export(lookup_column_names)
-export(lookup_dataset_name)
 export(post_process_highway)
 export(post_process_main)
 export(post_process_sub)
-export(read_accident_data)
+export(read_accidents_data)
 export(reset_config)
-export(select_post_processor)
 importFrom(cli,cli_abort)
+importFrom(cli,cli_alert_danger)
 importFrom(cli,cli_alert_success)
+importFrom(cli,cli_alert_warning)
 importFrom(curl,multi_download)
+importFrom(dplyr,bind_rows)
 importFrom(lubridate,make_datetime)
+importFrom(readr,cols)
 importFrom(readr,locale)
 importFrom(readr,read_csv)
 importFrom(sf,st_as_sf)
 importFrom(stats,setNames)
+importFrom(tidyr,ends_with)
+importFrom(tidyr,pivot_longer)
 importFrom(utils,download.file)
 importFrom(utils,read.csv)
 importFrom(yaml,read_yaml)
diff --git a/R/config.R b/R/config.R
@@ -56,29 +56,59 @@ reset_config <- function() {
 
 #' @export
 print.jpaccidents_config <- function(x, ...) {
-  print_cols <- function(cols) {
-    for (name in names(x$main_data$columns)) {
-      column <- x$main_data$columns[[name]]
-      cat("    -", name)
-      cat(":", column$original_name, "\n")
-    }
+  cat("Accident Data Configuration\n")
+  cat("---------------------------\n")
+
+  # File Types
+  cat("File types:\n")
+  for (name in names(x$file_types)) {
+    info <- x$file_types[[name]]
+    pattern <- info$pattern
+    column_count <- length(info$columns)
+    cat(sprintf("  - %s: %s (%d columns)\n", name, pattern, column_count))
   }
 
-  cat("Accident Data Configuration\n")
-  cat("Path:", paste0("\"", attr(x, "path"), "\"\n"))
+  # Schemas
+  cat("\nSchemas:\n")
+  for (name in names(x$schemas)) {
+    schema <- x$schemas[[name]]
+    primary_key <- paste(schema$primary_key, collapse = ", ")
+    column_count <- length(schema$data_columns)
+    cat(sprintf("  - %s: Primary Key = %s (%d columns)\n", name, primary_key, column_count))
+  }
+}
 
-  cat("Main data:\n")
-  cat("  File pattern:", paste0("\"", x$main_data$file_pattern, "\"\n"))
-  cat("  Columns:\n")
-  print_cols(x$main_data$columns)
+#' @export
+summary.jpaccidents_config <- function(object, ...) {
+  cat("Accident Data Configuration Summary\n")
+  cat("-----------------------------------\n")
+
+  # File Types
+  cat("File types:\n")
+  for (name in names(object$file_types)) {
+    info <- object$file_types[[name]]
+    cat(sprintf("  - %s\n", name))
+    cat(sprintf("      Pattern: %s\n", info$pattern))
+    cat(sprintf("      Columns: [%s]\n", paste(info$columns, collapse = ", ")))
+  }
 
-  cat("Sub data:\n")
-  cat("  File pattern:", paste0("\"", x$sub_data$file_pattern, "\"\n"))
-  cat("  Columns:\n")
-  print_cols(x$sub_data$columns)
+  # Schemas
+  cat("\nSchemas:\n")
+  for (name in names(object$schemas)) {
+    schema <- object$schemas[[name]]
+    cat(sprintf("  - %s\n", name))
+    cat(sprintf("      Primary Key: [%s]\n", paste(schema$primary_key, collapse = ", ")))
+    cat(sprintf("      Data Columns: [%s]\n", paste(schema$data_columns, collapse = ", ")))
+    cat(sprintf("      Source Files: [%s]\n", paste(schema$source_files, collapse = ", ")))
+    if (!is.null(schema$suffixes)) {
+      cat(sprintf("    Suffixes: [%s]\n", paste(schema$suffixes, collapse = ", ")))
+    }
+  }
 
-  cat("Highway data:\n")
-  cat("  File pattern:", paste0("\"", x$highway_data$file_pattern, "\"\n"))
-  cat("  Columns:\n")
-  print_cols(x$highway_data$columns)
+  # Column Name Mapping
+  cat("\nColumn Name Mapping:\n")
+  for (name in names(object$columns)) {
+    mapping <- object$columns[[name]]
+    cat(sprintf("  - %s: [%s]\n", name, paste(mapping, collapse = ", ")))
+  }
 }
diff --git a/R/detect-dataset-name.R b/R/detect-dataset-name.R
@@ -0,0 +1,46 @@
+#' Identify dataset names from file paths
+#'
+#' This function identifies dataset names based on the file paths and
+#' configuration patterns.
+#'
+#' Invalid files are replaced with `NA`, and a warning is issued.
+#'
+#' @param files A character string specifying the file paths.
+#' @param quiet A logical indicating whether to suppress warnings.
+#' @return A character vector of dataset names
+#' @export
+#' @examples
+#' detect_dataset_name(c("example/honhyo_2022.csv", "example/honhyo_2023.csv"))
+detect_dataset_name <- function(files, quiet = FALSE) {
+  # Get configuration
+  config <- get_config()
+
+  # Determine dataset name based on file pattern
+  patterns <- sapply(config$file_types, `[[`, "pattern")
+  file_type_names <- names(config$file_types)
+
+  # Helper function: Match file path to a pattern
+  detect_file_type <- function(file) {
+    matches <- sapply(patterns, grepl, logical(1), x = file)
+    if (any(matches)) {
+      return(file_type_names[which(matches)[1]]) # Return the first match
+    }
+    NA_character_
+  }
+
+  # Apply detection logic to all files
+  dataset_names <- vapply(
+    files,
+    detect_file_type,
+    character(1),
+    USE.NAMES = FALSE
+  )
+
+  # Warn about files with no matching dataset
+  invalid_files <- files[is.na(dataset_names)]
+  if (0 < length(invalid_files) && !quiet) {
+    cli_alert_warning("Unknown dataset for file(s): {invalid_files}. These have been replaced with NA")
+  }
+
+  return(dataset_names)
+}
diff --git a/R/extract-schema-columns.R b/R/extract-schema-columns.R
@@ -0,0 +1,56 @@
+#' Filter data by schema
+#'
+#' This function filters a data frame based on a specified schema, which
+#' defines the columns to keep. It also handles special case where data
+#' needs to be reshaped to long format before filtering.
+#'
+#' @param data An accident data object.
+#' @param schema_name A character string specifying the schema to use for
+#'   filtering.
+#' @return A data frame containing only the columns defined in the schema, or
+#'   `NULL`.
+#' @export
+#' @examples
+#' \dontrun{
+#' persons_info <- extract_schema_columns(accident_data, "persons_info")
+#' }
+extract_schema_columns <- function(data, schema_name) {
+  # Load configuration settings containing to access schema definitions
+  config <- get_config()
+  schema <- config$schemas[[schema_name]]
+
+  # Validate schema existence
+  if (is.null(schema)) {
+    cli_alert_danger("The schema does not exist: {schema_name}")
+    return(NULL)
+  }
+
+  # Retrieve dataset name and validate against schema's supported files
+  dataset_name <- attr(data, "dataset_name")
+  source_files <- schema$source_files
+
+  if (!dataset_name %in% source_files) {
+    cli_alert_warning("This schema does not support the current dataset: dataset_name")
+    return(NULL)
+  }
+
+  # Handle specific transformation requirements for certain schemas
+  if (dataset_name == "main_data" && schema_name == "persons_info") {
+    data <- pivot_longer(
+      data,
+      cols          = ends_with(c("_a", "_b")),
+      names_to      = c(".value", "sub_id"),
+      names_pattern = "^(.+)[_]([a|b])$"
+    )
+  }
+
+  # Extract columns based on schema's primary key and data columns
+  schema_columns <- c(schema$primary_key, schema$data_columns)
+  selected_columns <- names(data) %in% schema_columns
+  filtered_data <- data[, selected_columns, drop = FALSE]
+
+  # Attach schema name to the resulting data
+  attr(filtered_data, "schema_name") <- schema_name
+
+  return(filtered_data)
+}
diff --git a/R/filter-data-by-tag.R b/R/filter-data-by-tag.R
diff --git a/R/jpaccidents-package.R b/R/jpaccidents-package.R
@@ -3,13 +3,19 @@
 
 ## usethis namespace: start
 #' @importFrom cli cli_abort
+#' @importFrom cli cli_alert_danger
 #' @importFrom cli cli_alert_success
+#' @importFrom cli cli_alert_warning
 #' @importFrom curl multi_download
+#' @importFrom dplyr bind_rows
 #' @importFrom lubridate make_datetime
+#' @importFrom readr cols
 #' @importFrom readr locale
 #' @importFrom readr read_csv
 #' @importFrom sf st_as_sf
 #' @importFrom stats setNames
+#' @importFrom tidyr ends_with
+#' @importFrom tidyr pivot_longer
 #' @importFrom utils download.file
 #' @importFrom utils read.csv
 #' @importFrom yaml read_yaml

diff --git a/R/lookup-column-names.R b/R/lookup-column-names.R
@@ -7,28 +7,31 @@
 #' characters), which can be difficult to handle in programs.
 #' This function converts them to corresponding English names.
 #'
-#' @param dataset_name The name of the dataset.
 #' @param original_names A vector of original column names.
 #' @return A vector of new column names corresponding to the provided
-#'   original names.
+#'   original names, or `NA` for unmatched names.
 #' @export
-lookup_column_names <- function(dataset_name, original_names) {
-  # Retrieve column configuration for the specified dataset
-  column_config <- get_config()[[dataset_name]]$columns
+lookup_column_names <- function(original_names) {
+  # Load column name configuration
+  config <- get_config()
 
-  # Check if column configuration is available
-  if (is.null(column_config)) {
-    warning("No column configuration found for dataset: ", dataset_name)
-    return(original_names)
+  # Helper function to find a matching column name
+  find_column_name <- function(name) {
+    matches <- vapply(
+      config$columns,
+      function(column) name %in% column,
+      logical(1)
+    )
+
+    if (any(matches)) {
+      names(config$columns)[matches]
+    } else {
+      NA_character_
+    }
   }
 
-  # Extract original names and their corresponding new names
-  original_names_list <- sapply(column_config, `[[`, "original_name")
-  new_names <- setNames(
-    rep(names(original_names_list), lengths(original_names_list)),
-    unlist(original_names_list, use.names = FALSE)
-  )
+  # Apply the helper function to all provided original names
+  new_names <- vapply(original_names, find_column_name, character(1))
 
-  # Return the new names corresponding to the provided original names
-  return(unname(new_names[original_names]))
+  return(new_names)
 }
diff --git a/R/lookup-dataset-name.R b/R/lookup-dataset-name.R
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,10 +10,12 @@ RoxygenNote: 7.3.2 @@
     Imports:
         cli,
         curl,
+        dplyr,
         lubridate,
         readr,
         sf,
         stats,
+        tidyr,
         utils,
         yaml
     Suggests:
@@ Expand Down @@