-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
190 lines (135 loc) · 5.13 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# run_analysis.R -- Transformation script for Coursera Getting & Cleaning Data
# Course project.
#
# Lee Tibbert
# 2016-01-27
#
## Prologue
## Define and, if necessary, set up the world.
library(assertthat)
suppressPackageStartupMessages(library(dplyr))
library(magrittr) # for forward pipe %>% operator
library(readr)
# Directory that receives the output file written at the end of the script.
outputDir <- "./output"

# Make sure the output location is a usable directory before any of the
# data processing starts. dir.exists() is used rather than file.exists()
# so that a regular file which happens to carry the same name is not
# mistaken for the directory, and a failed dir.create() stops the script
# here instead of surfacing later as a failed write.
# In a course/production environment, may need to
# specify or fix-up directory protections.
if (!dir.exists(outputDir) &&
    !dir.create(outputDir, recursive = TRUE)) {
  stop("Unable to create output directory: ", outputDir, call. = FALSE)
}

# Where in the world are we? Leave tracks in log.
cat(sprintf("\nStarting course project at: %s\n\n", date()))
cat(sprintf("Current working directory:\n %s\n\n", getwd()))
step_1 <- function() {
  # Build and return a single data frame holding the observations of the
  # original train and test data sets. Every row carries a subject.id
  # and an activity.id column (read from the subject_* and y_* files)
  # followed by the measurement columns read from the X_* file.
  # All files are read from the current working directory.

  # The feature labels become the measurement column names. The leading
  # feature number is kept, joined to the feature name with a dot, to
  # create unique column names from the original names. Aids merging.
  featureDf <- read_table("features.txt", col_names = "feature.label")
  xColLabels <- sub("^([0-9]+)[ ]* ", "\\1.", featureDf[["feature.label"]])

  # Read the three files that make up one data set and combine them
  # column-wise in the order: subject, activity, measurements.
  readDataSet <- function(subjectFile, activityFile, measurementFile) {
    subjectDf <- read_table(subjectFile, col_names = "subject.id")
    activityDf <- read_table(activityFile, col_names = "activity.id")
    measurementDf <- read_table(measurementFile, col_names = xColLabels)
    bind_cols(subjectDf, activityDf, measurementDf)
  }

  testDf <- readDataSet("subject_test.txt", "y_test.txt", "X_test.txt")
  trainDf <- readDataSet("subject_train.txt", "y_train.txt", "X_train.txt")

  # Stack the two sets row-wise, training rows first.
  bind_rows(trainDf, testDf)
}
step_2 <- function(inputDf) {
  # Return inputDf reduced to the identifier columns subject.id and
  # activity.id plus the measurement columns whose names match the
  # pattern below: "mean()" preceded by at least one character, or
  # "std()" anywhere in the name.
  #
  # matches() is used rather than contains() to keep the original
  # left-to-right column ordering, which reduces opportunities for
  # confusion.
  select(
    inputDf,
    subject.id,
    activity.id,
    matches("(.+mean\\(\\).*)|(.*std\\(\\).*)")
  )
}
step_3 <- function(inputDf) {
  # Replace the activity.id column of inputDf with a factor column named
  # "activity" whose levels are the descriptive names read from
  # activity_labels.txt (in the current working directory).
  #
  # Each id is paired with its label through the id column of the labels
  # file rather than by position. The mapping therefore stays correct
  # when the labels file is not sorted by id or when some activity never
  # occurs in inputDf. Ids that are absent from the labels file become
  # NA.
  activityLabels <- read_table("activity_labels.txt",
                               col_names = c("activity.id", "activities"))
  activityIds <- activityLabels[["activity.id"]]
  activityNames <- activityLabels[["activities"]]

  inputDf %>%
    mutate(activity.id = factor(activity.id,
                                levels = activityIds,
                                labels = activityNames)) %>%
    rename(activity = activity.id)
}
step_4 <- function(inputDf) {
  # Convert column names to be descriptive: use full words, convert to
  # lower case, remove parentheses and the leading feature number, and
  # use single dots to separate words.
  #
  # Because of the column naming scheme in the original data, anomalies
  # such as foo.body.body remain in the result.
  #
  # Returns inputDf with only its column names changed. Names never end
  # in a dot, including those built from features such as
  # "tBodyAccMag-std()" that have no axis suffix.

  # Apply gsub() rewrites in order; each rule is c(pattern, replacement).
  applyRules <- function(x, rules) {
    for (rule in rules) {
      x <- gsub(rule[[1]], rule[[2]], x)
    }
    x
  }

  # These rules rely on the original capitalisation ("Acc", "Gyro",
  # "Mag"), so they have to run before the names are lower-cased.
  caseSensitiveRules <- list(
    c("-std\\(\\)", ".standard.deviation."),
    c("-", "."),
    c("\\(\\)", ""),
    c("^[0-9]+\\.", ""),
    c("Acc", ".accelerometer."),
    c("Gyro", ".gyroscope."),
    c("Mag", ".magnitude.")
  )

  # These rules are written against the lower-cased names. The last two
  # tidy up after the rules above: collapse any run of dots into one dot
  # and drop a dot left dangling at the end of a name.
  lowerCaseRules <- list(
    c("^tbody", "time.domain.body."),
    c("^fbody", "frequency.domain.body."),
    c("^tgravity", "time.gravity."),
    c("\\.+", "."),
    c("\\.$", "")
  )

  s4Names <- applyRules(names(inputDf), caseSensitiveRules)
  s4Names <- applyRules(tolower(s4Names), lowerCaseRules)

  outputDf <- inputDf
  colnames(outputDf) <- s4Names
  outputDf
}
step_5 <- function(inputDf) {
  # Calculate the mean of each variable for each activity and subject,
  # then rename the averaged columns to reflect their new contents.
  #
  # Returns one row per (activity, subject.id) combination present in
  # inputDf. The result is explicitly ungrouped so that later operations
  # on it are not silently applied per activity.
  #
  # NOTE(review): summarize_each()/funs() are kept to match the dplyr
  # release this script was written against; they are deprecated in
  # later dplyr releases -- confirm before upgrading dplyr.
  outputDf <- inputDf %>%
    group_by(activity, subject.id) %>%
    summarize_each(funs(mean)) %>%
    ungroup()

  # Update the column names to reflect the fact that they are not
  # direct observations but rather the mean of such observations.
  # Only names starting with "f" or "t" get the ".mean" suffix, which
  # leaves the "activity" and "subject.id" columns untouched. A doubled
  # dot produced by appending to a name that already ends in a dot is
  # collapsed afterwards.
  s5Names <- sub("^([ft].*$)", "\\1\\.mean", names(outputDf))
  s5Names <- gsub("\\.\\.", "\\.", s5Names)

  colnames(outputDf) <- s5Names
  outputDf
}
## ------------------------------------------------- ##
## Begin main program

# Run the five transformation steps in order. Every intermediate data
# frame is kept under its own name so it can be inspected afterwards.
step1Df <- step_1()
step2Df <- step_2(inputDf = step1Df)
step3Df <- step_3(inputDf = step2Df)
step4Df <- step_4(inputDf = step3Df)
step5Df <- step_5(inputDf = step4Df)

# Full path of the result file: the working directory, then the output
# directory with its leading "./" removed, then the file name.
outputFile <- file.path(
  getwd(),
  sub("\\./", "", outputDir),
  "courseProjectOutput.txt"
)

write.table(x = step5Df, file = outputFile, row.names = FALSE)

# Leave tracks in the log: where the result went and when we finished.
cat(sprintf("\nOutput data frame written to file: %s\n", outputFile))
cat(sprintf("\nEnding course project at: %s\n\n", date()))

## -30- ##