-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
78 lines (57 loc) · 3.71 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Merges the training and the test sets to create one data set.
# Extracts only the measurements on the mean and standard deviation for each measurement.
# Uses descriptive activity names to name the activities in the data set
# Appropriately labels the data set with descriptive variable names.
# From the data set in step 4, creates a second, independent tidy data set with the average of each variable for each activity and each subject.
# Builds one labelled data set (e.g. test or train) from its three raw files.
#
# Args:
#   featuresPath:  path to the measurements file (one column per feature).
#   labelsPath:    path to the file holding one activity id per observation.
#   subjectsPath:  path to the file holding one subject id per observation.
#   featureNames:  column names applied to the measurements file. Defaults to
#                  the global `header`, which must exist by the time the
#                  function is called.
#   activityNames: activity labels, indexed by activity id. Defaults to the
#                  global `activitiesHeaders`, which must exist by the time
#                  the function is called.
#
# Returns:
#   A tbl_df with the columns Subject and Activity followed by the mean and
#   standard deviation measurement columns.
ExtractDataSet <- function(featuresPath, labelsPath, subjectsPath,
                           featureNames = header,
                           activityNames = activitiesHeaders) {
  labels <- tbl_df(read.table(labelsPath))
  features <- tbl_df(read.table(featuresPath, col.names = featureNames))
  subjects <- tbl_df(read.table(subjectsPath, col.names = c("Subject")))

  # The three files are combined column-wise, so they must describe the same
  # observations; fail loudly instead of letting cbind recycle or misalign.
  if (nrow(labels) != nrow(features) || nrow(subjects) != nrow(features)) {
    stop(
      "Row counts differ: features = ", nrow(features),
      ", labels = ", nrow(labels),
      ", subjects = ", nrow(subjects),
      call. = FALSE
    )
  }

  # Translate each numeric activity id into its descriptive label.
  activities <- transmute(labels, Activity = activityNames[V1])

  # read.table sanitises the column names by default, so punctuation such as
  # '-', '(' and ')' arrives here as '.'. Keep only the columns where "mean"
  # or "std" is delimited by non-word characters on both sides; names in which
  # "mean" runs straight into further letters are therefore not selected.
  features <- select(
    features,
    matches("\\W+(mean|std)\\W+", ignore.case = TRUE)
  )

  tbl_df(cbind(subjects, activities, features))
}
# The script uses dplyr throughout (tbl_df, mutate, select, group_by, %>%, ...),
# so it has to be attached before ExtractDataSet is called for the first time.
library(dplyr)

# root folder of the unzipped data set, relative to the working directory
dataDir <- file.path("getdata-projectfiles-UCI HAR Dataset", "UCI HAR Dataset")

# read the column headers from features.txt (second column holds the names)
header <- as.character(
  read.table(file.path(dataDir, "features.txt"), sep = " ")[[2]]
)

# read the activity labels from activity_labels.txt (second column holds the
# labels); converted to character so the result does not depend on whether
# read.table turned the column into a factor
activitiesHeaders <- as.character(
  read.table(file.path(dataDir, "activity_labels.txt"), sep = " ")[[2]]
)

# extract the test data from the test folder
# NOTE: the measurement files are spelled with a capital X in the data set
# (X_test.txt / X_train.txt); a lowercase name only resolves on
# case-insensitive file systems.
test <- ExtractDataSet(
  file.path(dataDir, "test", "X_test.txt"),
  file.path(dataDir, "test", "y_test.txt"),
  file.path(dataDir, "test", "subject_test.txt")
)

# extract the train data from the train folder
train <- ExtractDataSet(
  file.path(dataDir, "train", "X_train.txt"),
  file.path(dataDir, "train", "y_train.txt"),
  file.path(dataDir, "train", "subject_train.txt")
)

# merge the train and test data sets into one data set (test rows first)
merged <- tbl_df(rbind(test, train))
# Use descriptive names for the headers.
# Every abbreviation is expanded to a full word wrapped in '.', which keeps it
# separated from its neighbours; the surplus dots are tidied up by the last
# three rules. The rules are applied strictly top to bottom, and the order
# matters: "GyroJerk" has to be expanded before the shorter "Gyro".
names(merged) <- local({
  rules <- rbind(
    # expand abbreviations
    c("^t", ".Time."),
    c("^f", ".Frequency."),
    c("Acc", ".Acceleration."),
    c("GyroJerk", ".AngularAcceleration."),
    c("Gyro", ".AngularVelocity."),
    c("Mag", ".Magnitude."),
    c("Freq\\.", ".Frequency."),
    # reformat the mean and standard deviation markers
    c("\\.mean", ".Mean."),
    c("\\.std", ".StandardDeviation."),
    # collapse the doubled 'Body' word
    c("BodyBody", ".Body."),
    # drop dots at the beginning, then at the end, then squeeze runs of dots
    c("^(\\.)+", ""),
    c("(\\.)+$", ""),
    c("(\\.)+", ".")
  )
  Reduce(
    function(current, i) gsub(rules[i, 1], rules[i, 2], current),
    seq_len(nrow(rules)),
    names(merged)
  )
})
# Average every measurement column per Activity and Subject combination.
grouped <- merged %>%
  group_by(Activity, Subject) %>%
  summarise_each(funs(mean)) %>%
  ungroup()

# Write the grouped averages (not the raw merged observations) to a text file.
write.table(grouped, file = "tidyData.txt", row.names = FALSE, quote = FALSE)