Skip to content

Commit

Permalink
✨ Implement multi-data loading
Browse files Browse the repository at this point in the history
  • Loading branch information
NONONOexe committed Jan 19, 2025
1 parent a381b29 commit 8ede8e7
Show file tree
Hide file tree
Showing 23 changed files with 729 additions and 482 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ RoxygenNote: 7.3.2
Imports:
cli,
curl,
dplyr,
lubridate,
readr,
sf,
stats,
tidyr,
utils,
yaml
Suggests:
Expand Down
15 changes: 11 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,28 +1,35 @@
# Generated by roxygen2: do not edit by hand

S3method(post_process,accident_data)
S3method(print,jpaccidents_config)
S3method(summary,jpaccidents_config)
export(convert_deg)
export(detect_dataset_name)
export(download_accident_data)
export(filter_data_by_tag)
export(extract_schema_columns)
export(get_config)
export(get_default_config_path)
export(load_config)
export(lookup_column_names)
export(lookup_dataset_name)
export(post_process_highway)
export(post_process_main)
export(post_process_sub)
export(read_accident_data)
export(read_accidents_data)
export(reset_config)
export(select_post_processor)
importFrom(cli,cli_abort)
importFrom(cli,cli_alert_danger)
importFrom(cli,cli_alert_success)
importFrom(cli,cli_alert_warning)
importFrom(curl,multi_download)
importFrom(dplyr,bind_rows)
importFrom(lubridate,make_datetime)
importFrom(readr,cols)
importFrom(readr,locale)
importFrom(readr,read_csv)
importFrom(sf,st_as_sf)
importFrom(stats,setNames)
importFrom(tidyr,ends_with)
importFrom(tidyr,pivot_longer)
importFrom(utils,download.file)
importFrom(utils,read.csv)
importFrom(yaml,read_yaml)
70 changes: 50 additions & 20 deletions R/config.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,29 +56,59 @@ reset_config <- function() {

#' @export
print.jpaccidents_config <- function(x, ...) {
print_cols <- function(cols) {
for (name in names(x$main_data$columns)) {
column <- x$main_data$columns[[name]]
cat(" -", name)
cat(":", column$original_name, "\n")
}
cat("Accident Data Configuration\n")
cat("---------------------------\n")

# File Types
cat("File types:\n")
for (name in names(x$file_types)) {
info <- x$file_types[[name]]
pattern <- info$pattern
column_count <- length(info$columns)
cat(sprintf(" - %s: %s (%d columns)\n", name, pattern, column_count))
}

cat("Accident Data Configuration\n")
cat("Path:", paste0("\"", attr(x, "path"), "\"\n"))
# Schemas
cat("\nSchemas:\n")
for (name in names(x$schemas)) {
schema <- x$schemas[[name]]
primary_key <- paste(schema$primary_key, collapse = ", ")
column_count <- length(schema$data_columns)
cat(sprintf(" - %s: Primary Key = %s (%d columns)\n", name, primary_key, column_count))
}
}

cat("Main data:\n")
cat(" File pattern:", paste0("\"", x$main_data$file_pattern, "\"\n"))
cat(" Columns:\n")
print_cols(x$main_data$columns)
#' @export
summary.jpaccidents_config <- function(object, ...) {
cat("Accident Data Configuration Summary\n")
cat("-----------------------------------\n")

# File Types
cat("File types:\n")
for (name in names(object$file_types)) {
info <- object$file_types[[name]]
cat(sprintf(" - %s\n", name))
cat(sprintf(" Pattern: %s\n", info$pattern))
cat(sprintf(" Columns: [%s]\n", paste(info$columns, collapse = ", ")))
}

cat("Sub data:\n")
cat(" File pattern:", paste0("\"", x$sub_data$file_pattern, "\"\n"))
cat(" Columns:\n")
print_cols(x$sub_data$columns)
# Schemas
cat("\nSchemas:\n")
for (name in names(object$schemas)) {
schema <- object$schemas[[name]]
cat(sprintf(" - %s\n", name))
cat(sprintf(" Primary Key: [%s]\n", paste(schema$primary_key, collapse = ", ")))
cat(sprintf(" Data Columns: [%s]\n", paste(schema$data_columns, collapse = ", ")))
cat(sprintf(" Source Files: [%s]\n", paste(schema$source_files, collapse = ", ")))
if (!is.null(schema$suffixes)) {
cat(sprintf(" Suffixes: [%s]\n", paste(schema$suffixes, collapse = ", ")))
}
}

cat("Highway data:\n")
cat(" File pattern:", paste0("\"", x$highway_data$file_pattern, "\"\n"))
cat(" Columns:\n")
print_cols(x$highway_data$columns)
# Column Name Mapping
cat("\nColumn Name Mapping:\n")
for (name in names(object$columns)) {
mapping <- object$columns[[name]]
cat(sprintf(" - %s: [%s]\n", name, paste(mapping, collapse = ", ")))
}
}
46 changes: 46 additions & 0 deletions R/detect-dataset-name.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' Identify dataset names from file paths
#'
#' This function identifies dataset names based on the file paths and
#' configuration patterns.
#'
#' Invalid files are replaced with `NA`, and a warning is issued.
#'
#' @param files A character string specifying the file paths.
#' @param quiet A logical indicating whether to suppress warnings.
#' @return A character vector of dataset names
#' @export
#' @examples
#' detect_dataset_name(c("example/honhyo_2022.csv", "example/honhyo_2023.csv"))
detect_dataset_name <- function(files, quiet = FALSE) {
# Get configuration
config <- get_config()

# Determine dataset name based on file pattern
patterns <- sapply(config$file_types, `[[`, "pattern")
file_type_names <- names(config$file_types)

# Helper function: Match file path to a pattern
detect_file_type <- function(file) {
matches <- sapply(patterns, grepl, logical(1), x = file)
if (any(matches)) {
return(file_type_names[which(matches)[1]]) # Return the first match
}
NA_character_
}

# Apply detection logic to all files
dataset_names <- vapply(
files,
detect_file_type,
character(1),
USE.NAMES = FALSE
)

# Warn about files with no matching dataset
invalid_files <- files[is.na(dataset_names)]
if (0 < length(invalid_files) && !quiet) {
cli_alert_warning("Unknown dataset for file(s): {invalid_files}. These have been replaced with NA")
}

return(dataset_names)
}
56 changes: 56 additions & 0 deletions R/extract-schema-columns.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#' Filter data by schema
#'
#' This function filters a data frame based on a specified schema, which
#' defines the columns to keep. It also handles special case where data
#' needs to be reshaped to long format before filtering.
#'
#' @param data An accident data object.
#' @param schema_name A character string specifying the schema to use for
#' filtering.
#' @return A data frame containing only the columns defined in the schema, or
#' `NULL`.
#' @export
#' @examples
#' \dontrun{
#' persons_info <- extract_schema_columns(accident_data, "persons_info")
#' }
extract_schema_columns <- function(data, schema_name) {
# Load configuration settings containing to access schema definitions
config <- get_config()
schema <- config$schemas[[schema_name]]

# Validate schema existence
if (is.null(schema)) {
cli_alert_danger("The schema does not exist: {schema_name}")
return(NULL)
}

# Retrieve dataset name and validate against schema's supported files
dataset_name <- attr(data, "dataset_name")
source_files <- schema$source_files

if (!dataset_name %in% source_files) {
cli_alert_warning("This schema does not support the current dataset: dataset_name")
return(NULL)
}

# Handle specific transformation requirements for certain schemas
if (dataset_name == "main_data" && schema_name == "persons_info") {
data <- pivot_longer(
data,
cols = ends_with(c("_a", "_b")),
names_to = c(".value", "sub_id"),
names_pattern = "^(.+)[_]([a|b])$"
)
}

# Extract columns based on schema's primary key and data columns
schema_columns <- c(schema$primary_key, schema$data_columns)
selected_columns <- names(data) %in% schema_columns
filtered_data <- data[, selected_columns, drop = FALSE]

# Attach schema name to the resulting data
attr(filtered_data, "schema_name") <- schema_name

return(filtered_data)
}
27 changes: 0 additions & 27 deletions R/filter-data-by-tag.R

This file was deleted.

6 changes: 6 additions & 0 deletions R/jpaccidents-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@

## usethis namespace: start
#' @importFrom cli cli_abort
#' @importFrom cli cli_alert_danger
#' @importFrom cli cli_alert_success
#' @importFrom cli cli_alert_warning
#' @importFrom curl multi_download
#' @importFrom dplyr bind_rows
#' @importFrom lubridate make_datetime
#' @importFrom readr cols
#' @importFrom readr locale
#' @importFrom readr read_csv
#' @importFrom sf st_as_sf
#' @importFrom stats setNames
#' @importFrom tidyr ends_with
#' @importFrom tidyr pivot_longer
#' @importFrom utils download.file
#' @importFrom utils read.csv
#' @importFrom yaml read_yaml
Expand Down
37 changes: 20 additions & 17 deletions R/lookup-column-names.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,31 @@
#' characters), which can be difficult to handle in programs.
#' This function converts them to corresponding English names.
#'
#' @param dataset_name The name of the dataset.
#' @param original_names A vector of original column names.
#' @return A vector of new column names corresponding to the provided
#' original names.
#' original names, or `NA` for unmatched names.
#' @export
lookup_column_names <- function(dataset_name, original_names) {
# Retrieve column configuration for the specified dataset
column_config <- get_config()[[dataset_name]]$columns
lookup_column_names <- function(original_names) {
# Load column name configuration
config <- get_config()

# Check if column configuration is available
if (is.null(column_config)) {
warning("No column configuration found for dataset: ", dataset_name)
return(original_names)
# Helper function to find a matching column name
find_column_name <- function(name) {
matches <- vapply(
config$columns,
function(column) name %in% column,
logical(1)
)

if (any(matches)) {
names(config$columns)[matches]
} else {
NA_character_
}
}

# Extract original names and their corresponding new names
original_names_list <- sapply(column_config, `[[`, "original_name")
new_names <- setNames(
rep(names(original_names_list), lengths(original_names_list)),
unlist(original_names_list, use.names = FALSE)
)
# Apply the helper function to all provided original names
new_names <- vapply(original_names, find_column_name, character(1))

# Return the new names corresponding to the provided original names
return(unname(new_names[original_names]))
return(new_names)
}
24 changes: 0 additions & 24 deletions R/lookup-dataset-name.R

This file was deleted.

Loading

0 comments on commit 8ede8e7

Please sign in to comment.