diff --git a/DESCRIPTION b/DESCRIPTION index 659c2a5..edf9d27 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,10 +10,12 @@ RoxygenNote: 7.3.2 Imports: cli, curl, + dplyr, lubridate, readr, sf, stats, + tidyr, utils, yaml Suggests: diff --git a/NAMESPACE b/NAMESPACE index ab4ae9c..d1e5631 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,28 +1,35 @@ # Generated by roxygen2: do not edit by hand +S3method(post_process,accident_data) S3method(print,jpaccidents_config) +S3method(summary,jpaccidents_config) export(convert_deg) +export(detect_dataset_name) export(download_accident_data) -export(filter_data_by_tag) +export(extract_schema_columns) export(get_config) export(get_default_config_path) export(load_config) export(lookup_column_names) -export(lookup_dataset_name) export(post_process_highway) export(post_process_main) export(post_process_sub) -export(read_accident_data) +export(read_accidents_data) export(reset_config) -export(select_post_processor) importFrom(cli,cli_abort) +importFrom(cli,cli_alert_danger) importFrom(cli,cli_alert_success) +importFrom(cli,cli_alert_warning) importFrom(curl,multi_download) +importFrom(dplyr,bind_rows) importFrom(lubridate,make_datetime) +importFrom(readr,cols) importFrom(readr,locale) importFrom(readr,read_csv) importFrom(sf,st_as_sf) importFrom(stats,setNames) +importFrom(tidyr,ends_with) +importFrom(tidyr,pivot_longer) importFrom(utils,download.file) importFrom(utils,read.csv) importFrom(yaml,read_yaml) diff --git a/R/config.R b/R/config.R index 065a86d..428453a 100644 --- a/R/config.R +++ b/R/config.R @@ -56,29 +56,59 @@ reset_config <- function() { #' @export print.jpaccidents_config <- function(x, ...) { - print_cols <- function(cols) { - for (name in names(x$main_data$columns)) { - column <- x$main_data$columns[[name]] - cat(" -", name) - cat(":", column$original_name, "\n") - } + cat("Accident Data Configuration\n") + cat("---------------------------\n") + + # File Types + cat("File types:\n") + for (name in names(x$file_types)) { + info <- x$file_types[[name]] + pattern <- info$pattern + column_count <- length(info$columns) + cat(sprintf(" - %s: %s (%d columns)\n", name, pattern, column_count)) } - cat("Accident Data Configuration\n") - cat("Path:", paste0("\"", attr(x, "path"), "\"\n")) + # Schemas + cat("\nSchemas:\n") + for (name in names(x$schemas)) { + schema <- x$schemas[[name]] + primary_key <- paste(schema$primary_key, collapse = ", ") + column_count <- length(schema$data_columns) + cat(sprintf(" - %s: Primary Key = %s (%d columns)\n", name, primary_key, column_count)) + } +} - cat("Main data:\n") - cat(" File pattern:", paste0("\"", x$main_data$file_pattern, "\"\n")) - cat(" Columns:\n") - print_cols(x$main_data$columns) +#' @export +summary.jpaccidents_config <- function(object, ...) 
{ + cat("Accident Data Configuration Summary\n") + cat("-----------------------------------\n") + + # File Types + cat("File types:\n") + for (name in names(object$file_types)) { + info <- object$file_types[[name]] + cat(sprintf(" - %s\n", name)) + cat(sprintf(" Pattern: %s\n", info$pattern)) + cat(sprintf(" Columns: [%s]\n", paste(info$columns, collapse = ", "))) + } - cat("Sub data:\n") - cat(" File pattern:", paste0("\"", x$sub_data$file_pattern, "\"\n")) - cat(" Columns:\n") - print_cols(x$sub_data$columns) + # Schemas + cat("\nSchemas:\n") + for (name in names(object$schemas)) { + schema <- object$schemas[[name]] + cat(sprintf(" - %s\n", name)) + cat(sprintf(" Primary Key: [%s]\n", paste(schema$primary_key, collapse = ", "))) + cat(sprintf(" Data Columns: [%s]\n", paste(schema$data_columns, collapse = ", "))) + cat(sprintf(" Source Files: [%s]\n", paste(schema$source_files, collapse = ", "))) + if (!is.null(schema$suffixes)) { + cat(sprintf(" Suffixes: [%s]\n", paste(schema$suffixes, collapse = ", "))) + } + } - cat("Highway data:\n") - cat(" File pattern:", paste0("\"", x$highway_data$file_pattern, "\"\n")) - cat(" Columns:\n") - print_cols(x$highway_data$columns) + # Column Name Mapping + cat("\nColumn Name Mapping:\n") + for (name in names(object$columns)) { + mapping <- object$columns[[name]] + cat(sprintf(" - %s: [%s]\n", name, paste(mapping, collapse = ", "))) + } } diff --git a/R/detect-dataset-name.R b/R/detect-dataset-name.R new file mode 100644 index 0000000..812d211 --- /dev/null +++ b/R/detect-dataset-name.R @@ -0,0 +1,46 @@ +#' Identify dataset names from file paths +#' +#' This function identifies dataset names based on the file paths and +#' configuration patterns. +#' +#' Invalid files are replaced with `NA`, and a warning is issued. +#' +#' @param files A character string specifying the file paths. +#' @param quiet A logical indicating whether to suppress warnings. +#' @return A character vector of dataset names +#' @export +#' @examples +#' detect_dataset_name(c("example/honhyo_2022.csv", "example/honhyo_2023.csv")) +detect_dataset_name <- function(files, quiet = FALSE) { + # Get configuration + config <- get_config() + + # Determine dataset name based on file pattern + patterns <- sapply(config$file_types, `[[`, "pattern") + file_type_names <- names(config$file_types) + + # Helper function: Match file path to a pattern + detect_file_type <- function(file) { + matches <- sapply(patterns, grepl, logical(1), x = file) + if (any(matches)) { + return(file_type_names[which(matches)[1]]) # Return the first match + } + NA_character_ + } + + # Apply detection logic to all files + dataset_names <- vapply( + files, + detect_file_type, + character(1), + USE.NAMES = FALSE + ) + + # Warn about files with no matching dataset + invalid_files <- files[is.na(dataset_names)] + if (0 < length(invalid_files) && !quiet) { + cli_alert_warning("Unknown dataset for file(s): {invalid_files}. These have been replaced with NA") + } + + return(dataset_names) +} diff --git a/R/extract-schema-columns.R b/R/extract-schema-columns.R new file mode 100644 index 0000000..0bb90d7 --- /dev/null +++ b/R/extract-schema-columns.R @@ -0,0 +1,56 @@ +#' Filter data by schema +#' +#' This function filters a data frame based on a specified schema, which +#' defines the columns to keep. It also handles special case where data +#' needs to be reshaped to long format before filtering. +#' +#' @param data An accident data object. 
+#' @param schema_name A character string specifying the schema to use for +#' filtering. +#' @return A data frame containing only the columns defined in the schema, or +#' `NULL`. +#' @export +#' @examples +#' \dontrun{ +#' persons_info <- extract_schema_columns(accident_data, "persons_info") +#' } +extract_schema_columns <- function(data, schema_name) { + # Load configuration settings to access schema definitions + config <- get_config() + schema <- config$schemas[[schema_name]] + + # Validate schema existence + if (is.null(schema)) { + cli_alert_danger("The schema does not exist: {schema_name}") + return(NULL) + } + + # Retrieve dataset name and validate against schema's supported files + dataset_name <- attr(data, "dataset_name") + source_files <- schema$source_files + + if (!dataset_name %in% source_files) { + cli_alert_warning("This schema does not support the current dataset: {dataset_name}") + return(NULL) + } + + # Handle specific transformation requirements for certain schemas + if (dataset_name == "main_data" && schema_name == "persons_info") { + data <- pivot_longer( + data, + cols = ends_with(c("_a", "_b")), + names_to = c(".value", "sub_id"), + names_pattern = "^(.+)[_]([ab])$" + ) + } + + # Extract columns based on schema's primary key and data columns + schema_columns <- c(schema$primary_key, schema$data_columns) + selected_columns <- names(data) %in% schema_columns + filtered_data <- data[, selected_columns, drop = FALSE] + + # Attach schema name to the resulting data + attr(filtered_data, "schema_name") <- schema_name + + return(filtered_data) +} diff --git a/R/filter-data-by-tag.R b/R/filter-data-by-tag.R deleted file mode 100644 index 0cb0380..0000000 --- a/R/filter-data-by-tag.R +++ /dev/null @@ -1,27 +0,0 @@ -#' Filter data by tag -#' -#' This function filters data by a specific tag. -#' -#' @param data An accident data object. -#' @param tag A character string specifying the tag to filter by. -#' @return A data frame containing only the columns with the specified tag. 
-#' @export -#' @examples -#' \dontrun{ -#' person_data <- filter_data_by_tag(accident_data, "person") -#' } -filter_data_by_tag <- function(data, tag) { - # Get all columns for the dataset registered in the configuration - config <- get_config() - dataset_name <- attr(data, "dataset_name") - all_columns <- config[[dataset_name]]$columns - - # Get the columns having the specified tag - tag_columns <- names(all_columns[sapply(all_columns, `[[`, "tag") %in% tag]) - - # Select column from the data that match the tag columns - selected_cols <- names(data) %in% tag_columns - - # Return the filtered data - return(data[, selected_cols, drop = FALSE]) -} diff --git a/R/jpaccidents-package.R b/R/jpaccidents-package.R index 5efd492..28cd035 100644 --- a/R/jpaccidents-package.R +++ b/R/jpaccidents-package.R @@ -3,13 +3,19 @@ ## usethis namespace: start #' @importFrom cli cli_abort +#' @importFrom cli cli_alert_danger #' @importFrom cli cli_alert_success +#' @importFrom cli cli_alert_warning #' @importFrom curl multi_download +#' @importFrom dplyr bind_rows #' @importFrom lubridate make_datetime +#' @importFrom readr cols #' @importFrom readr locale #' @importFrom readr read_csv #' @importFrom sf st_as_sf #' @importFrom stats setNames +#' @importFrom tidyr ends_with +#' @importFrom tidyr pivot_longer #' @importFrom utils download.file #' @importFrom utils read.csv #' @importFrom yaml read_yaml diff --git a/R/lookup-column-names.R b/R/lookup-column-names.R index 2431ca0..26e654a 100644 --- a/R/lookup-column-names.R +++ b/R/lookup-column-names.R @@ -7,28 +7,31 @@ #' characters), which can be difficult to handle in programs. #' This function converts them to corresponding English names. #' -#' @param dataset_name The name of the dataset. #' @param original_names A vector of original column names. #' @return A vector of new column names corresponding to the provided -#' original names. +#' original names, or `NA` for unmatched names. #' @export -lookup_column_names <- function(dataset_name, original_names) { - # Retrieve column configuration for the specified dataset - column_config <- get_config()[[dataset_name]]$columns +lookup_column_names <- function(original_names) { + # Load column name configuration + config <- get_config() - # Check if column configuration is available - if (is.null(column_config)) { - warning("No column configuration found for dataset: ", dataset_name) - return(original_names) + # Helper function to find a matching column name + find_column_name <- function(name) { + matches <- vapply( + config$columns, + function(column) name %in% column, + logical(1) + ) + + if (any(matches)) { + names(config$columns)[matches] + } else { + NA_character_ + } } - # Extract original names and their corresponding new names - original_names_list <- sapply(column_config, `[[`, "original_name") - new_names <- setNames( - rep(names(original_names_list), lengths(original_names_list)), - unlist(original_names_list, use.names = FALSE) - ) + # Apply the helper function to all provided original names + new_names <- vapply(original_names, find_column_name, character(1)) - # Return the new names corresponding to the provided original names - return(unname(new_names[original_names])) + return(new_names) } diff --git a/R/lookup-dataset-name.R b/R/lookup-dataset-name.R deleted file mode 100644 index b5d20d4..0000000 --- a/R/lookup-dataset-name.R +++ /dev/null @@ -1,24 +0,0 @@ -#' Find dataset name from file path -#' -#' This function determines the dataset name based on the file path and -#' configuration. 
-#' -#' @param file A character string specifying the file path. -#' @return Dataset name -#' @export -#' @examples -#' lookup_dataset_name("example/honhyo_2023.csv") -lookup_dataset_name <- function(file) { - # Get configuration - config <- get_config() - - # Determine dataset name based on file pattern - dataset_name <- names(config)[ - sapply(config, function(x) grepl(x$file_pattern, file)) - ] - if (is.null(dataset_name)) { - stop("Unknown dataset for file: ", dataset_name) - } - - return(dataset_name) -} diff --git a/R/post-process.R b/R/post-process.R index b632cbb..327c337 100644 --- a/R/post-process.R +++ b/R/post-process.R @@ -1,24 +1,3 @@ -#' Select post-processor function based on dataset name -#' -#' This function selects the post-processor function based on -#' the given dataset name. -#' -#' @param dataset_name The name of the dataset. -#' @return The post-processing function corresponding to the dataset name. -#' @export -select_post_processor <- function(dataset_name) { - # Select post-processor - post_processor <- switch( - dataset_name, - "main_data" = post_process_main, - "sub_data" = post_process_sub, - "highway_data" = post_process_highway, - warning("Unknown dataset name: ", dataset_name) - ) - - return(post_processor) -} - #' Post-process data #' #' These function perform post-processing operations on different datasets. @@ -30,82 +9,88 @@ select_post_processor <- function(dataset_name) { #' @name post_process #' @param data The data to be processed. #' @return A processed data. -NULL +post_process <- function(data) { + UseMethod("post_process") +} #' @rdname post_process #' @export -post_process_main <- function(data) { - # Filter accident data - accident_data <- filter_data_by_tag(data, "accident") - - # Create datetime from individual columns in the original data - accident_data$occurrence_time <- make_datetime( - year = as.integer(data$occurrence_year), - month = as.integer(data$occurrence_month), - day = as.integer(data$occurrence_day), - hour = as.integer(data$occurrence_hour), - min = as.integer(data$occurrence_min), - tz = "Asia/Tokyo" +post_process.accident_data <- function(data) { + dataset_name <- attr(data, "dataset_name") + post_processor <- switch ( + dataset_name, + "main_data" = post_process_main, + "sub_data" = post_process_sub, + "highway_data" = post_process_highway, + function(data) { + cli_alert_warning("Unknown dataset name: {dataset_name}") + return(NULL) + } ) + processed_data <- post_processor(data) + attr(processed_data, "dataset_name") <- dataset_name + + return(processed_data) +} +#' @rdname post_process +#' @export +post_process_main <- function(data) { # Convert latitude and longitude from DMS to decimal - accident_data$latitude <- suppressWarnings( - convert_deg(accident_data$latitude) - ) - accident_data$longitude <- suppressWarnings( - convert_deg(accident_data$longitude) - ) + data$latitude <- suppressWarnings(convert_deg(data$latitude)) + data$longitude <- suppressWarnings(convert_deg(data$longitude)) # Filter valid rows based on non-missing coordinates - valid_rows <- !is.na(accident_data$latitude) & !is.na(accident_data$longitude) - accident_data <- accident_data[valid_rows, ] + valid_rows <- !is.na(data$latitude) & !is.na(data$longitude) + location_data <- data[valid_rows, ] + + # Convert data to spatial data format (sf object) + location_data_sf <- st_as_sf( + location_data, + coords = c("longitude", "latitude"), + crs = 4326 + ) - if (nrow(accident_data) < nrow(data)) { - warning("Invalid coordinate format detected. 
Some rows have been excluded.") + # Alert user if any rows were removed due to invalid coordinates + removed_rows <- nrow(data) - nrow(location_data_sf) + file_path <- attr(data, "file_path") + if (0 < removed_rows) { + cli_alert_warning("Invalid coordinate format detected in file: {file_path}. {removed_rows} rows have been excluded.") } - # Function to filter and clean person data - filter_person_data <- function(data, tag, suf) { - person_data <- filter_data_by_tag(data, tag) - names(person_data) <- sub(paste0(suf, "$"), "", names(person_data)) - - return(person_data) - } + # Extract accident related columns from spatial data + accidents_info <- extract_schema_columns(location_data_sf, "accidents_info") - # Filter person data - person_a_data <- filter_person_data(data, "person_a", "_a")[valid_rows, ] - person_b_data <- filter_person_data(data, "person_b", "_b")[valid_rows, ] + # Create datetime from individual columns in the original data + accidents_info$occurrence_time <- make_datetime( + year = as.integer(location_data_sf$occurrence_year), + month = as.integer(location_data_sf$occurrence_month), + day = as.integer(location_data_sf$occurrence_day), + hour = as.integer(location_data_sf$occurrence_hour), + min = as.integer(location_data_sf$occurrence_min), + tz = "Asia/Tokyo" + ) - # Filter key data - key_data <- filter_data_by_tag(data, "key")[valid_rows, ] + # Extract person related columns from the original data + persons_info <- extract_schema_columns(location_data, "persons_info") - # Combine process data + # Combine process accident and person data into a list processed_data <- list( - accident = st_as_sf( - cbind(key_data, accident_data), - coords = c("longitude", "latitude"), - crs = 4326 - ), - person = rbind( - cbind(key_data, person_a_data), - cbind(key_data, person_b_data) - ) + accidents_info = accidents_info, + persons_info = persons_info ) - # Set dataset name attribute - attr(processed_data, "dataset_name") <- attr(accident_data, "dataset_name") - return(processed_data) } #' @rdname post_process #' @export post_process_sub <- function(data) { - return(data) + return(extract_schema_columns(data, "persons_info")) } #' @rdname post_process #' @export post_process_highway <- function(data) { - return(data) + return(extract_schema_columns(data, "highways_info")) } diff --git a/R/read-accident-data.R b/R/read-accident-data.R index 860f550..6a91b53 100644 --- a/R/read-accident-data.R +++ b/R/read-accident-data.R @@ -1,46 +1,92 @@ -#' Read accident data from a CSV file +#' Read accident data from CSV files #' -#' This function reads accident data from a CSV file, -#' renames the columns, adds attributes and class, and applies -#' post-processing if provided. +#' This function reads accident data from multiple CSV files, +#' processes each file, and combines the data into three categories: +#' * `accidents_info` +#' * `persons_info` +#' * `highways_info` #' -#' @param file The path to the CSV file. -#' @param post_process A function to apply post-processing to the data. -#' @return A data frame containing the accident data. +#' @param file_paths A character vector of file paths to the CSV files. +#' @return A list containing the accident data split into three data frames: +#' `accidents_info`, `persons_info`, and `highways_info`. 
#' @export #' @examples #' \dontrun{ -#' read_accident_data("example/honhyo_2023.csv") +#' read_accidents_data(c( +#' "example/honhyo_2023.csv", +#' "example/hojuhyo_2023.csv" +#' )) #' } -read_accident_data <- function( - file, - post_process = select_post_processor(lookup_dataset_name(file)) - ) { +read_accidents_data <- function(file_paths) { + # Process all files and extract valid data + data_list <- lapply(file_paths, process_single_file) + valid_data <- Filter(Negate(is.null), data_list) + skipped_files <- file_paths[vapply(data_list, is.null, logical(1))] - # Read file - accident_data <- read_csv( - file, - locale = locale(encoding = "Shift_JIS"), - show_col_types = FALSE - ) + # Warn about skipped files + if (0 < length(skipped_files)) { + cli_alert_warning("The following files were skipped because their format is not supported: {skipped_files}") + } - # Get dataset name and configuration - dataset_name <- lookup_dataset_name(file) + # Post-process valid data + processed_data <- lapply(valid_data, post_process) + + # Extract data categories + accidents_info <- combine_data(processed_data, "accidents_info") + persons_info <- combine_data(processed_data, "persons_info") + highways_info <- combine_data(processed_data, "highways_info") + + return(list( + accidents_info = accidents_info, + persons_info = persons_info, + highways_info = highways_info + )) +} + +# Function to process a single file +process_single_file <- function(file) { + dataset_name <- detect_dataset_name(file, quiet = TRUE) + if (is.na(dataset_name)) { + return(NULL) + } - # Rename columns - names(accident_data) <- lookup_column_names( - dataset_name, - names(accident_data) + # Read configuration and file with proper encoding + config <- get_config() + encoding <- config$file_types[[dataset_name]]$encoding + accident_data <- read_csv( + file, + col_types = cols(.default = "c"), + locale = locale(encoding = encoding), + show_col_types = FALSE, ) - # Add attributes and class + # Rename columns and add attributes + accident_data <- rename_columns(accident_data) + attr(accident_data, "file_path") <- file attr(accident_data, "dataset_name") <- dataset_name class(accident_data) <- c("accident_data", class(accident_data)) - # Apply post-processing if provided - if (is.function(post_process)) { - accident_data <- post_process(accident_data) - } - return(accident_data) } + +# Rename columns using lookup table +rename_columns <- function(data) { + colnames(data) <- lookup_column_names(names(data)) + return(data) +} + +# Combine data based on dataset name and attribute +combine_data <- function(data_list, schema_name) { + filtered_data <- Filter( + Negate(is.null), + lapply(data_list, function(x) { + if (is.data.frame(x) && attr(x, "schema_name") == schema_name) { + return(x) + } else { + return(x[[schema_name]]) + } + NULL + }) + ) + return(do.call(bind_rows, filtered_data)) +} diff --git a/R/sysdata.rda b/R/sysdata.rda index edce957..de61aad 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/data-raw/default_config.R b/data-raw/default_config.R index 5da40c0..416b636 100644 --- a/data-raw/default_config.R +++ b/data-raw/default_config.R @@ -5,7 +5,6 @@ library(usethis) config_path <- get_default_config_path() default_config <- read_yaml(config_path) -attr(default_config, "path") <- config_path class(default_config) <- "jpaccidents_config" use_data(default_config, overwrite = TRUE, internal = TRUE) diff --git a/inst/extdata/jpaccidents-config.yaml b/inst/extdata/jpaccidents-config.yaml index 
7c940ec..65e59db 100644 --- a/inst/extdata/jpaccidents-config.yaml +++ b/inst/extdata/jpaccidents-config.yaml @@ -1,287 +1,408 @@ -main_data: - file_pattern: "honhyo[_][0-9]{4}[.]csv" - columns: +metadata: + version: "0.1.0" + last_updated: "2025-01-18" + author: "Keisuke ANDO" + +# File type definitions +file_types: + main_data: + encoding: "Shift_JIS" + delimiter: "," + pattern: "honhyo[_][0-9]{4}[.]csv" + columns: + - "資料区分" + - "都道府県コード" + - "警察署等コード" + - "本票番号" + - "事故内容" + - "死者数" + - "負傷者数" + - "路線コード" + - "上下線" + - "地点コード" + - "市区町村コード" + - "発生日時  年" + - "発生日時  月" + - "発生日時  日" + - "発生日時  時" + - "発生日時  分" + - "昼夜" + - "日の出時刻  時" + - "日の出時刻  分" + - "日の入り時刻  時" + - "日の入り時刻  分" + - "天候" + - "環状交差点の直径" + - "地形" + - "路面状態" + - "道路形状" + - "信号機" + - "一時停止規制 標識(当事者A)" + - "一時停止規制 表示(当事者A)" + - "一時停止規制 標識(当事者B)" + - "一時停止規制 表示(当事者B)" + - "車道幅員" + - "道路線形" + - "衝突地点" + - "ゾーン規制" + - "中央分離帯施設等" + - "歩車道区分" + - "事故類型" + - "年齢(当事者A)" + - "年齢(当事者B)" + - "当事者種別(当事者A)" + - "当事者種別(当事者B)" + - "用途別(当事者A)" + - "用途別(当事者B)" + - "車両形状(当事者A)" + - "車両形状等(当事者A)" + - "車両形状(当事者B)" + - "車両形状等(当事者B)" + - "オートマチック車(当事者A)" + - "オートマチック車(当事者B)" + - "サポカー(当事者A)" + - "サポカー(当事者B)" + - "速度規制(指定のみ)(当事者A)" + - "速度規制(指定のみ)(当事者B)" + - "車両の衝突部位(当事者A)" + - "車両の衝突部位(当事者B)" + - "車両の損壊程度(当事者A)" + - "車両の損壊程度(当事者B)" + - "エアバッグの装備(当事者A)" + - "エアバッグの装備(当事者B)" + - "サイドエアバッグの装備(当事者A)" + - "サイドエアバッグの装備(当事者B)" + - "人身損傷程度(当事者A)" + - "人身損傷程度(当事者B)" + - "地点 緯度(北緯)" + - "地点 経度(東経)" + - "曜日(発生年月日)" + - "祝日(発生年月日)" + - "認知機能検査経過日数(当事者A)" + - "認知機能検査経過日数(当事者B)" + - "運転練習の方法(当事者A)" + - "運転練習の方法(当事者B)" + sub_data: + encoding: "Shift_JIS" + delimiter: "," + pattern: "hojuhyo[_][0-9]{4}[.]csv" + columns: + - "資料区分" + - "都道府県コード" + - "警察署等コード" + - "本票番号" + - "補充票番号" + - "当事者種別" + - "用途別" + - "車両形状等" + - "乗車別" + - "乗車等の区分" + - "サポカー" + - "エアバッグの装備" + - "サイドエアバッグの装備" + - "人身損傷程度" + - "車両の衝突部位" + - "車両の損壊程度" + highway_data: + encoding: "Shift_JIS" + delimiter: "," + pattern: "kosokuhyo[_][0-9]{4}[.]csv" + columns: + - "資料区分" + - "都道府県コード" + - "警察署等コード" + - "本票番号" + - "発生地点" + - "道路管理者区分" + - "道路区分" + - "曲線半径" + - "縦断勾配" + - "トンネル番号" + - "当事者車両台数" + - "事故類型" + - "車両単独事故の対象物" + - "臨時速度規制の有無" + - "速度規制(臨時のみ)" + - "トンネル延長距離" + +# Logical schema definitions +schemas: + accidents_info: + primary_key: + - document_type + - prefecture_code + - police_code + - main_id + data_columns: + - injury_pattern + - faitality_number + - injury_number + - road_code + - road_direction + - kilopost_number + - city_code + - occurrence_time + - day_night + - sunrise_hour + - sunrise_min + - sunset_hour + - sunset_min + - weather + - roundabout_diameter + - region_type + - road_surface + - road_shape + - traffic_signal + - road_width + - road_alignment + - collision_position + - zone_regulation + - center_divider + - road_verge + - impact_type + - day_of_week + - holiday + - geometry + source_files: + - main_data + persons_info: + primary_key: + - document_type + - prefecture_code + - police_code + - main_id + - sub_id + data_columns: + - stop_sign + - stop_mark + - age + - party_type + - use_type + - car_type + - automatic_car + - support_car + - speed_limit + - impact_part + - damage_level + - airbag + - side_airbag + - injury_level + - cognitive_test + - driving_practice + source_files: + - main_data + - sub_data + highways_info: + primary_key: + - document_type + - prefecture_code + - police_code + - main_id + data_columns: + - kilopost_number + - road_manager + - highway_type + - curve_radius + - longitudinal_slope + - tunnel_number + - car_count + - impact_type + - 
damaged_property + - limit_type + - temp_limit + - tunnel_length + source_files: + - highway_data + +# Column name variations mapping +columns: document_type: - original_name: "資料区分" - tag: "key" + - "資料区分" prefecture_code: - original_name: "都道府県コード" - tag: "key" + - "都道府県コード" police_code: - original_name: "警察署等コード" - tag: "key" + - "警察署等コード" main_id: - original_name: "本票番号" - tag: "key" + - "本票番号" injury_pattern: - original_name: "事故内容" - tag: "accident" + - "事故内容" faitality_number: - original_name: "死者数" - tag: "accident" + - "死者数" injury_number: - original_name: "負傷者数" - tag: "accident" + - "負傷者数" road_code: - original_name: "路線コード" - tag: "accident" + - "路線コード" road_direction: - original_name: "上下線" - tag: "accident" + - "上下線" kilopost_number: - original_name: "地点コード" - tag: "accident" + - "地点コード" + - "発生地点" city_code: - original_name: "市区町村コード" - tag: "accident" + - "市区町村コード" occurrence_year: - original_name: "発生日時  年" + - "発生日時  年" occurrence_month: - original_name: "発生日時  月" + - "発生日時  月" occurrence_day: - original_name: "発生日時  日" + - "発生日時  日" occurrence_hour: - original_name: "発生日時  時" + - "発生日時  時" occurrence_min: - original_name: "発生日時  分" + - "発生日時  分" day_night: - original_name: "昼夜" - tag: "accident" + - "昼夜" sunrise_hour: - original_name: "日の出時刻  時" - tag: "accident" + - "日の出時刻  時" sunrise_min: - original_name: "日の出時刻  分" - tag: "accident" + - "日の出時刻  分" sunset_hour: - original_name: "日の入り時刻  時" - tag: "accident" + - "日の入り時刻  時" sunset_min: - original_name: "日の入り時刻  分" - tag: "accident" + - "日の入り時刻  分" weather: - original_name: "天候" - tag: "accident" + - "天候" roundabout_diameter: - original_name: "環状交差点の直径" - tag: "accident" + - "環状交差点の直径" region_type: - original_name: "地形" - tag: "accident" + - "地形" road_surface: - original_name: "路面状態" - tag: "accident" + - "路面状態" road_shape: - original_name: "道路形状" - tag: "accident" + - "道路形状" traffic_signal: - original_name: "信号機" - tag: "accident" + - "信号機" stop_sign_a: - original_name: "一時停止規制 標識(当事者A)" - tag: "person_a" + - "一時停止規制 標識(当事者A)" stop_mark_a: - original_name: "一時停止規制 表示(当事者A)" - tag: "person_a" + - "一時停止規制 表示(当事者A)" stop_sign_b: - original_name: "一時停止規制 標識(当事者B)" - tag: "person_b" + - "一時停止規制 標識(当事者B)" stop_mark_b: - original_name: "一時停止規制 表示(当事者B)" - tag: "person_b" + - "一時停止規制 表示(当事者B)" road_width: - original_name: "車道幅員" - tag: "accident" + - "車道幅員" road_alignment: - original_name: "道路線形" - tag: "accident" + - "道路線形" collision_position: - original_name: "衝突地点" - tag: "accident" + - "衝突地点" zone_regulation: - original_name: "ゾーン規制" - tag: "accident" + - "ゾーン規制" center_divider: - original_name: "中央分離帯施設等" - tag: "accident" + - "中央分離帯施設等" road_verge: - original_name: "歩車道区分" - tag: "accident" + - "歩車道区分" impact_type: - original_name: "事故類型" - tag: "accident" + - "事故類型" age_a: - original_name: "年齢(当事者A)" - tag: "person_a" + - "年齢(当事者A)" age_b: - original_name: "年齢(当事者B)" - tag: "person_b" + - "年齢(当事者B)" party_type_a: - original_name: "当事者種別(当事者A)" - tag: "person_a" + - "当事者種別(当事者A)" party_type_b: - original_name: "当事者種別(当事者B)" - tag: "person_b" + - "当事者種別(当事者B)" use_type_a: - original_name: "用途別(当事者A)" - tag: "person_a" + - "用途別(当事者A)" use_type_b: - original_name: "用途別(当事者B)" - tag: "person_b" + - "用途別(当事者B)" car_type_a: - original_name: - - "車両形状(当事者A)" - - "車両形状等(当事者A)" - tag: "person_a" + - "車両形状(当事者A)" + - "車両形状等(当事者A)" car_type_b: - original_name: - - "車両形状(当事者B)" - - "車両形状等(当事者B)" - tag: "person_b" + - "車両形状(当事者B)" + - "車両形状等(当事者B)" automatic_car_a: - original_name: "オートマチック車(当事者A)" - tag: "person_a" + - 
"オートマチック車(当事者A)" automatic_car_b: - original_name: "オートマチック車(当事者B)" - tag: "person_b" + - "オートマチック車(当事者B)" support_car_a: - original_name: "サポカー(当事者A)" - tag: "person_a" + - "サポカー(当事者A)" support_car_b: - original_name: "サポカー(当事者B)" - tag: "person_b" + - "サポカー(当事者B)" speed_limit_a: - original_name: "速度規制(指定のみ)(当事者A)" - tag: "person_a" + - "速度規制(指定のみ)(当事者A)" speed_limit_b: - original_name: "速度規制(指定のみ)(当事者B)" - tag: "person_b" + - "速度規制(指定のみ)(当事者B)" impact_part_a: - original_name: "車両の衝突部位(当事者A)" - tag: "person_a" + - "車両の衝突部位(当事者A)" impact_part_b: - original_name: "車両の衝突部位(当事者B)" - tag: "person_b" + - "車両の衝突部位(当事者B)" damage_level_a: - original_name: "車両の損壊程度(当事者A)" - tag: "person_a" + - "車両の損壊程度(当事者A)" damage_level_b: - original_name: "車両の損壊程度(当事者B)" - tag: "person_b" + - "車両の損壊程度(当事者B)" airbag_a: - original_name: "エアバッグの装備(当事者A)" - tag: "person_a" + - "エアバッグの装備(当事者A)" airbag_b: - original_name: "エアバッグの装備(当事者B)" - tag: "person_b" + - "エアバッグの装備(当事者B)" side_airbag_a: - original_name: "サイドエアバッグの装備(当事者A)" - tag: "person_a" + - "サイドエアバッグの装備(当事者A)" side_airbag_b: - original_name: "サイドエアバッグの装備(当事者B)" - tag: "person_b" + - "サイドエアバッグの装備(当事者B)" injury_level_a: - original_name: "人身損傷程度(当事者A)" - tag: "person_a" + - "人身損傷程度(当事者A)" injury_level_b: - original_name: "人身損傷程度(当事者B)" - tag: "person_b" + - "人身損傷程度(当事者B)" latitude: - original_name: "地点 緯度(北緯)" - tag: "accident" + - "地点 緯度(北緯)" longitude: - original_name: "地点 経度(東経)" - tag: "accident" + - "地点 経度(東経)" day_of_week: - original_name: "曜日(発生年月日)" - tag: "accident" + - "曜日(発生年月日)" holiday: - original_name: "祝日(発生年月日)" - tag: "accident" + - "祝日(発生年月日)" cognitive_test_a: - original_name: "認知機能検査経過日数(当事者A)" - tag: "person_a" + - "認知機能検査経過日数(当事者A)" cognitive_test_b: - original_name: "認知機能検査経過日数(当事者B)" - tag: "person_b" + - "認知機能検査経過日数(当事者B)" driving_practice_a: - original_name: "運転練習の方法(当事者A)" - tag: "person_a" + - "運転練習の方法(当事者A)" driving_practice_b: - original_name: "運転練習の方法(当事者B)" - tag: "person_b" - -sub_data: - file_pattern: "hojuhyo[_][0-9]{4}[.]csv" - columns: - document_type: - original_name: "資料区分" - prefecture_code: - original_name: "都道府県コード" - police_code: - original_name: "警察署等コード" - main_id: - original_name: "本票番号" + - "運転練習の方法(当事者B)" sub_id: - original_name: "補充票番号" + - "補充票番号" party_type: - original_name: "当事者種別" + - "当事者種別" use_type: - original_name: "用途別" + - "用途別" car_type: - original_name: "車両形状等" + - "車両形状等" boarding_type: - original_name: "乗車別" + - "乗車別" boarding_tag: - original_name: "乗車等の区分" + - "乗車等の区分" support_car: - original_name: "サポカー" + - "サポカー" airbag: - original_name: "エアバッグの装備" + - "エアバッグの装備" side_airbag: - original_name: "サイドエアバッグの装備" + - "サイドエアバッグの装備" injury_level: - original_name: "人身損傷程度" + - "人身損傷程度" impact_part: - original_name: "車両の衝突部位" + - "車両の衝突部位" damage_level: - original_name: "車両の損壊程度" - -highway_data: - file_pattern: "kosokuhyo[_][0-9]{4}[.]csv" - columns: - document_type: - original_name: "資料区分" - prefecture_code: - original_name: "都道府県コード" - police_code: - original_name: "警察署等コード" - main_id: - original_name: "本票番号" - kilopost_number: - original_name: "発生地点" + - "車両の損壊程度" road_manager: - original_name: "道路管理者区分" + - "道路管理者区分" highway_type: - original_name: "道路区分" + - "道路区分" curve_radius: - original_name: "曲線半径" + - "曲線半径" longitudinal_slope: - original_name: "縦断勾配" + - "縦断勾配" tunnel_number: - original_name: "トンネル番号" + - "トンネル番号" car_count: - original_name: "当事者車両台数" - impact_type: - original_name: "事故類型" + - "当事者車両台数" damaged_property: - original_name: "車両単独事故の対象物" + - "車両単独事故の対象物" limit_type: - original_name: 
"臨時速度規制の有無" + - "臨時速度規制の有無" temp_limit: - original_name: "速度規制(臨時のみ)" + - "速度規制(臨時のみ)" tunnel_length: - original_name: "トンネル延長距離" + - "トンネル延長距離" # Code mapping concepts: diff --git a/man/detect_dataset_name.Rd b/man/detect_dataset_name.Rd new file mode 100644 index 0000000..a5c92d4 --- /dev/null +++ b/man/detect_dataset_name.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/detect-dataset-name.R +\name{detect_dataset_name} +\alias{detect_dataset_name} +\title{Identify dataset names from file paths} +\usage{ +detect_dataset_name(files, quiet = FALSE) +} +\arguments{ +\item{files}{A character string specifying the file paths.} + +\item{quiet}{A logical indicating whether to suppress warnings.} +} +\value{ +A character vector of dataset names +} +\description{ +This function identifies dataset names based on the file paths and +configuration patterns. +} +\details{ +Invalid files are replaced with \code{NA}, and a warning is issued. +} +\examples{ +detect_dataset_name(c("example/honhyo_2022.csv", "example/honhyo_2023.csv")) +} diff --git a/man/extract_schema_columns.Rd b/man/extract_schema_columns.Rd new file mode 100644 index 0000000..0399765 --- /dev/null +++ b/man/extract_schema_columns.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract-schema-columns.R +\name{extract_schema_columns} +\alias{extract_schema_columns} +\title{Filter data by schema} +\usage{ +extract_schema_columns(data, schema_name) +} +\arguments{ +\item{data}{An accident data object.} + +\item{schema_name}{A character string specifying the schema to use for +filtering.} +} +\value{ +A data frame containing only the columns defined in the schema, or +\code{NULL}. +} +\description{ +This function filters a data frame based on a specified schema, which +defines the columns to keep. It also handles special case where data +needs to be reshaped to long format before filtering. +} +\examples{ +\dontrun{ +persons_info <- extract_schema_columns(accident_data, "persons_info") +} +} diff --git a/man/filter_data_by_tag.Rd b/man/filter_data_by_tag.Rd deleted file mode 100644 index 8bc6c77..0000000 --- a/man/filter_data_by_tag.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter-data-by-tag.R -\name{filter_data_by_tag} -\alias{filter_data_by_tag} -\title{Filter data by tag} -\usage{ -filter_data_by_tag(data, tag) -} -\arguments{ -\item{data}{An accident data object.} - -\item{tag}{A character string specifying the tag to filter by.} -} -\value{ -A data frame containing only the columns with the specified tag. -} -\description{ -This function filters data by a specific tag. -} -\examples{ -\dontrun{ -person_data <- filter_data_by_tag(accident_data, "person") -} -} diff --git a/man/lookup_column_names.Rd b/man/lookup_column_names.Rd index 0bc0725..99c975b 100644 --- a/man/lookup_column_names.Rd +++ b/man/lookup_column_names.Rd @@ -4,16 +4,14 @@ \alias{lookup_column_names} \title{Lookup new column names for an accident dataset} \usage{ -lookup_column_names(dataset_name, original_names) +lookup_column_names(original_names) } \arguments{ -\item{dataset_name}{The name of the dataset.} - \item{original_names}{A vector of original column names.} } \value{ A vector of new column names corresponding to the provided -original names. +original names, or \code{NA} for unmatched names. 
} \description{ This function retrieves the new column names corresponding to diff --git a/man/lookup_dataset_name.Rd b/man/lookup_dataset_name.Rd deleted file mode 100644 index 4c78e12..0000000 --- a/man/lookup_dataset_name.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lookup-dataset-name.R -\name{lookup_dataset_name} -\alias{lookup_dataset_name} -\title{Find dataset name from file path} -\usage{ -lookup_dataset_name(file) -} -\arguments{ -\item{file}{A character string specifying the file path.} -} -\value{ -Dataset name -} -\description{ -This function determines the dataset name based on the file path and -configuration. -} -\examples{ -lookup_dataset_name("example/honhyo_2023.csv") -} diff --git a/man/post_process.Rd b/man/post_process.Rd index abf3813..75cb36f 100644 --- a/man/post_process.Rd +++ b/man/post_process.Rd @@ -2,11 +2,16 @@ % Please edit documentation in R/post-process.R \name{post_process} \alias{post_process} +\alias{post_process.accident_data} \alias{post_process_main} \alias{post_process_sub} \alias{post_process_highway} \title{Post-process data} \usage{ +post_process(data) + +\method{post_process}{accident_data}(data) + post_process_main(data) post_process_sub(data) diff --git a/man/read_accident_data.Rd b/man/read_accident_data.Rd deleted file mode 100644 index c93b99a..0000000 --- a/man/read_accident_data.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read-accident-data.R -\name{read_accident_data} -\alias{read_accident_data} -\title{Read accident data from a CSV file} -\usage{ -read_accident_data( - file, - post_process = select_post_processor(lookup_dataset_name(file)) -) -} -\arguments{ -\item{file}{The path to the CSV file.} - -\item{post_process}{A function to apply post-processing to the data.} -} -\value{ -A data frame containing the accident data. -} -\description{ -This function reads accident data from a CSV file, -renames the columns, adds attributes and class, and applies -post-processing if provided. -} -\examples{ -\dontrun{ -read_accident_data("example/honhyo_2023.csv") -} -} diff --git a/man/read_accidents_data.Rd b/man/read_accidents_data.Rd new file mode 100644 index 0000000..31956de --- /dev/null +++ b/man/read_accidents_data.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read-accident-data.R +\name{read_accidents_data} +\alias{read_accidents_data} +\title{Read accident data from CSV files} +\usage{ +read_accidents_data(file_paths) +} +\arguments{ +\item{file_paths}{A character vector of file paths to the CSV files.} +} +\value{ +A list containing the accident data split into three data frames: +\code{accidents_info}, \code{persons_info}, and \code{highways_info}. 
+} +\description{ +This function reads accident data from multiple CSV files, +processes each file, and combines the data into three categories: +\itemize{ +\item \code{accidents_info} +\item \code{persons_info} +\item \code{highways_info} +} +} +\examples{ +\dontrun{ +read_accidents_data(c( + "example/honhyo_2023.csv", + "example/hojuhyo_2023.csv" +)) +} +} diff --git a/man/select_post_processor.Rd b/man/select_post_processor.Rd deleted file mode 100644 index 60f69de..0000000 --- a/man/select_post_processor.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/post-process.R -\name{select_post_processor} -\alias{select_post_processor} -\title{Select post-processor function based on dataset name} -\usage{ -select_post_processor(dataset_name) -} -\arguments{ -\item{dataset_name}{The name of the dataset.} -} -\value{ -The post-processing function corresponding to the dataset name. -} -\description{ -This function selects the post-processor function based on -the given dataset name. -}
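Usage sketch: a minimal example of how the reworked exports are meant to fit together. The CSV paths are the hypothetical ones from the roxygen examples and are assumed to exist locally and to follow the NPA naming patterns in the config (honhyo_* for main_data, hojuhyo_* for sub_data); the expected values in comments are inferred from the code in this diff, not from a verified run.

library(jpaccidents)

# Inspect the bundled configuration: file-type patterns, logical schemas,
# and the Japanese-to-English column name mapping.
config <- get_config()
summary(config)

# Hypothetical input files; detect_dataset_name() matches each path against
# the file_types patterns and returns NA (with a warning) for unknown files.
files <- c("example/honhyo_2023.csv", "example/hojuhyo_2023.csv")
detect_dataset_name(files)
# Expected: "main_data" "sub_data"

# Read each file with its configured encoding, rename columns via the
# mapping, post-process per dataset, and combine into the three
# schema-level tables defined in the YAML config.
accidents <- read_accidents_data(files)
names(accidents)
# Expected: "accidents_info" "persons_info" "highways_info"

# Per post_process_main(), accidents_info rows come from records whose
# coordinates survive convert_deg(), with occurrence_time assembled by
# make_datetime(); persons_info stacks the pivoted party A/B rows from
# honhyo together with the hojuhyo rows.

Because the file patterns, schemas, and column mapping all live in jpaccidents-config.yaml, adding a new file type or schema should not require changes to the R code itself.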