-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
125 lines (108 loc) · 5.17 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
## Converts the raw Samsung data into a tidy dataset for future analysis.
## Assumes the data folder has the same structure as the original data provided
## by the project.
## Core function that builds, writes out and returns the cleaned dataset.
##
## Arguments:
##   data_directory: path of the folder holding the raw data; it is handed
##     unchanged to load_dataframes(). Defaults to 'data'.
##
## Side effects:
##   Writes two CSV files into the current working directory:
##     'tidy_data_large.csv' - every observation, restricted to the kept columns
##     'tidy_data.csv'       - one row per subject/activity combination
##
## Returns:
##   The summary data frame that is written to 'tidy_data.csv'. Its first two
##   columns are 'subject' and 'activity'; every other column holds the
##   aggregated value of one kept feature for that subject/activity pair.
run_analysis <- function(data_directory = 'data') {
  # Swap the numeric activity ids of a y_* data frame for their descriptive
  # labels and rename the column to 'activities'.
  # match() is used instead of merge() because merge() sorts its result by the
  # join key, which would detach every label from the subject/measurement row
  # it belongs to. match() keeps the rows in their original order.
  label_activities <- function(y, activity_labels) {
    positions <- match(y[[1]], activity_labels[[1]])
    if (any(is.na(positions))) {
      stop('found activity ids that have no entry in the activity labels',
           call. = FALSE)
    }
    y[[1]] <- activity_labels[[2]][positions]
    colnames(y) <- 'activities'
    y
  }

  # Load the dataframes
  dataframes <- load_dataframes(data_directory)

  # Convert the y_* dataframes to friendly names and relabel
  dataframes$y_train <- label_activities(dataframes$y_train,
                                         dataframes$activity_labels)
  dataframes$y_test <- label_activities(dataframes$y_test,
                                        dataframes$activity_labels)
  # Remove activity_labels from memory
  dataframes$activity_labels <- NULL

  # Label the X_* dataframes
  colnames(dataframes$X_train) <- dataframes$features$V2
  colnames(dataframes$X_test) <- dataframes$features$V2
  # Remove features from memory
  dataframes$features <- NULL

  # Label the subject_* dataframes
  colnames(dataframes$subject_test) <- 'subject'
  colnames(dataframes$subject_train) <- 'subject'

  # Assemble the test and train dataframes: subject, activity, measurements
  test_dataframe <- cbind(dataframes$subject_test,
                          dataframes$y_test,
                          dataframes$X_test)
  train_dataframe <- cbind(dataframes$subject_train,
                           dataframes$y_train,
                           dataframes$X_train)

  # Combine the test and train dataframes (test rows first)
  tidy_data <- rbind(test_dataframe, train_dataframe)
  # Remove original dataframes from memory
  dataframes <- NULL

  # Keep the subject and activity columns plus every column whose name
  # contains '-mean' or '-std' (case-insensitive); all '-mean' matches come
  # before all '-std' matches. Angle features such as
  # angle(tBodyGyroJerkMean,gravityMean) contain no '-mean' and are dropped.
  # Note that the '-mean' pattern also matches names such as '-meanFreq()'.
  column_names <- names(tidy_data)
  features <- c(1, 2,
                grep('-mean', column_names, ignore.case = TRUE),
                grep('-std', column_names, ignore.case = TRUE))
  tidy_data <- tidy_data[, features]

  # Write out the cleaned data
  write.csv(tidy_data, file = 'tidy_data_large.csv', row.names = FALSE)

  # Summarize the data per subject/activity pair. drop = FALSE keeps a data
  # frame (and therefore the column name) even if only one feature was kept.
  # NOTE(review): FUN = sum produces per-group totals; if per-group averages
  # were intended this should be mean - confirm against the requirements.
  tidy_data <- aggregate(tidy_data[, -c(1, 2), drop = FALSE],
                         by = list(tidy_data$subject, tidy_data$activities),
                         FUN = sum)
  colnames(tidy_data)[1:2] <- c('subject', 'activity')

  # Write out our summary data
  write.csv(tidy_data, file = 'tidy_data.csv', row.names = FALSE)

  # Return the summary data for further analysis
  tidy_data
}
# Read every raw data file found under data_directory into memory.
#
# Arguments:
#   data_directory: folder containing activity_labels.txt and features.txt
#     together with the test/ and train/ sub-folders.
#
# Returns:
#   A named list of data frames, in this order: activity_labels, features,
#   subject_test, X_test, y_test, subject_train, X_train, y_train.
load_dataframes <- function(data_directory) {
  # Every file is read the same way (no quote characters, an upper bound on
  # the number of rows, explicit column classes), so the call is shared.
  # path_template is a sprintf() format whose '%s' receives data_directory.
  read_raw <- function(path_template, max_rows, column_classes) {
    read.table(sprintf(path_template, data_directory),
               quote = '',
               nrows = max_rows,
               colClasses = column_classes)
  }

  # Arguments of list() are evaluated left to right, so the files are read in
  # the order in which they are listed here.
  list(
    # activity id (integer) and activity name (character)
    activity_labels = read_raw('%s/activity_labels.txt', 6,
                               c('integer', 'character')),
    # the 'NULL' class skips the first column; only the second one is kept
    features = read_raw('%s/features.txt', 561,
                        c('NULL', 'character')),
    subject_test = read_raw('%s/test/subject_test.txt', 2947, c('integer')),
    X_test = read_raw('%s/test/X_test.txt', 2947, c('numeric')),
    y_test = read_raw('%s/test/y_test.txt', 2947, c('integer')),
    subject_train = read_raw('%s/train/subject_train.txt', 7352,
                             c('integer')),
    X_train = read_raw('%s/train/X_train.txt', 7352, c('numeric')),
    y_train = read_raw('%s/train/y_train.txt', 7352, c('integer'))
  )
}