-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhomework #11.Rmd
180 lines (146 loc) · 4.97 KB
/
homework #11.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
---
title: 'homework #11'
author: "Maryam Nouri-Aiin"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include=FALSE}
# Default for every later chunk: echo = TRUE shows the chunk's source code in
# the knitted output (individual chunks can still override it in their header).
knitr::opts_chunk$set(echo = TRUE)
```
# Clear the environment
`rm(list = ls())`
# Packages
```{r include=FALSE}
# loading R packages
library(log4r)
library(TeachingDemos)
library(tidyverse)
library(pracma)
library(ggmosaic)
library(dplyr)
library(readr)
library(tidyr)
```
# Developing functions to batch process the BART dataset
```{r echo=TRUE}
# Directory that holds the cleaned data files. This is an absolute path on the
# author's machine; change it here when running the document elsewhere.
data_dir <- "/Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData"

# List all files recursively with full paths
files <- list.files(data_dir, full.names = TRUE, recursive = TRUE)

# Keep only CSV files with "countdata" in the file name. The pattern is tested
# against basename() so that a directory whose name contains "countdata" cannot
# pull in unrelated CSV files stored beneath it.
countdata_files <- files[grepl("countdata.*\\.csv$", basename(files))]

# Uncomment to inspect which files were collected
# print(countdata_files)

# Report an empty result once, otherwise preview the columns of the first file
# so the column names used by the functions below can be checked by eye.
if (length(countdata_files) == 0) {
  print("No countdata files found. Check the directory path and file pattern.")
} else {
  example_data <- readr::read_csv(countdata_files[[1]], show_col_types = FALSE)
  print(colnames(example_data))
}
# Define functions
# Remove incomplete observations from a count-data table.
#
# data: a data frame / tibble with `scientificName` and `clusterSize` columns.
# Returns: `data` restricted to the rows where neither of those two columns
#          is NA; all other columns are left untouched.
clean_data <- function(data) {
  # Separate conditions passed to filter() are combined with a logical AND
  dplyr::filter(
    data,
    !is.na(scientificName),
    !is.na(clusterSize)
  )
}
# clean_data <- function(data) {
# tidyr::drop_na(data) # using tidyr for drop_na()
# }
# Extract the year from a file name that embeds a "YYYY-MM" date.
#
# filename: a single file name or file path. Only the last path component is
#           searched, so date-like directory names are ignored.
# Returns: the year as an integer, or NA_integer_ when the file name contains
#          no four digits followed by "-MM".
extract_year <- function(filename) {
  # The batch loop passes full paths; restrict the search to the file name so a
  # directory such as "archive-1999-12/" cannot be matched before the real date.
  file_name <- basename(as.character(filename))
  # Four digits immediately followed by "-" and two digits. The lookahead keeps
  # the "-MM" part out of the match and requires perl = TRUE.
  matches <- regmatches(
    file_name,
    regexpr("\\d{4}(?=-\\d{2})", file_name, perl = TRUE)
  )
  if (length(matches) > 0 && !is.na(matches[1])) {
    as.integer(matches[1])
  } else {
    NA_integer_ # Return NA if no valid year is found
  }
}
# Example filename
example_file <- "NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv"
extract_year(example_file) # prints 2017
# Total abundance of a count-data table: the sum of its `clusterSize` column.
#
# data: a data frame; `clusterSize` may be stored as numeric or as text.
# Returns: the sum of `clusterSize` after conversion to numeric, ignoring NA
#          values, or 0 when the table has no `clusterSize` column.
calculate_abundance <- function(data) {
  # Nothing to add up without the column
  if (!("clusterSize" %in% names(data))) {
    return(0)
  }

  # Coerce first so that counts read in as text are still summed
  counts <- as.numeric(data$clusterSize)

  # Any NA present after the conversion is reported, then skipped by the sum
  if (anyNA(counts)) {
    print("NA introduced by coercion when converting clusterSize to numeric.")
  }

  sum(counts, na.rm = TRUE)
}
# Species richness of a count-data table: how many distinct values occur in
# its `scientificName` column.
#
# data: a data frame.
# Returns: the number of unique `scientificName` values (NA, if present, counts
#          as one value), or 0 when the table has no `scientificName` column.
calculate_species_richness <- function(data) {
  has_species_column <- "scientificName" %in% names(data)
  if (!has_species_column) {
    return(0)
  }
  distinct_species <- unique(data[["scientificName"]])
  length(distinct_species)
}
```
# List of the functions
``` {}
# source function files ----
# NOTE(review): source() reads R code from a file path or connection, but the
# arguments below are bare function names. The four functions are already
# defined in the chunk above, so these lines are not needed for the document to
# run. The fence header is `{}` with no engine name, so this block is presumably
# shown as a listing and never evaluated - confirm. If the functions are moved
# into their own scripts, pass quoted file paths instead.
source(clean_data)
source(extract_year)
source(calculate_species_richness)
source(calculate_abundance)
```
# Printing the results and visualizing the data
```{r echo=TRUE, message=FALSE, warning=FALSE}
# Summarise one count-data file as a one-row data frame.
#
# path: full path to a count-data CSV file.
# Returns: a data frame with columns FileName, Abundance, SpeciesRichness and
#          Year (Year is NA when the file name carries no "YYYY-MM" date).
summarise_countdata_file <- function(path) {
  print(paste("Processing:", path))
  raw_data <- readr::read_csv(path, show_col_types = FALSE)
  print("Data read successfully.")

  # Apply cleaning function
  cleaned_data <- clean_data(raw_data)
  print(paste("Data after cleaning:", nrow(cleaned_data), "rows remaining."))

  # Convert clusterSize to numeric if necessary; rows that were not NA before
  # can still become NA here when the column holds non-numeric text.
  cleaned_data$clusterSize <- as.numeric(cleaned_data$clusterSize)
  if (anyNA(cleaned_data$clusterSize)) {
    print("NA values found in clusterSize after conversion to numeric.")
  }

  # Extract year, calculate abundance and species richness
  data.frame(
    FileName = basename(path),
    Abundance = calculate_abundance(cleaned_data),
    SpeciesRichness = calculate_species_richness(cleaned_data),
    Year = extract_year(path)
  )
}

# Zero-row template: keeps the four columns in place when no files were found
empty_summary <- data.frame(FileName = character(),
                            Abundance = integer(),
                            SpeciesRichness = integer(),
                            Year = integer())

# Build every row first and bind them in a single call, rather than copying
# the growing data frame with rbind() on each iteration.
file_summaries <- lapply(countdata_files, summarise_countdata_file)
summary_df <- do.call(rbind, c(list(empty_summary), file_summaries))

# Print the final summary data frame
print(summary_df)
# Abundance of each file against the year parsed from its name
abundance_plot <- ggplot(summary_df, aes(x = Year, y = Abundance, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Abundance Over Years", x = "Year", y = "Abundance") +
  theme_minimal()
print(abundance_plot)

# Species richness of each file against year, drawn in blue
richness_plot <- ggplot(
  summary_df,
  aes(x = Year, y = SpeciesRichness, group = 1)
) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  labs(title = "Species Richness Over Years", x = "Year", y = "Species Richness") +
  theme_minimal()
print(richness_plot)

# Stack both metrics into name/value pairs so they can share one faceted plot
summary_df_long <- summary_df %>%
  tidyr::pivot_longer(cols = c("Abundance", "SpeciesRichness"))

# One panel per metric; free y scales because the two metrics differ in range
combined_plot <- ggplot(
  summary_df_long,
  aes(x = Year, y = value, group = name, color = name)
) +
  geom_line() +
  geom_point() +
  facet_wrap(~ name, scales = "free_y") +
  labs(title = "Abundance and Species Richness Over Years", x = "Year", y = "Value") +
  theme_minimal()
print(combined_plot)
```