")}return r}function n(t,w,r){var x=h(t,r);var v=a(t);var y,s;if(v){y=d(v,x)}else{return}var q=c(t);if(q.length){s=document.createElement("pre");s.innerHTML=y.value;y.value=k(q,c(s),x)}y.value=i(y.value,w,r);var u=t.className;if(!u.match("(\\s|^)(language-)?"+v+"(\\s|$)")){u=u?(u+" "+v):v}if(/MSIE [678]/.test(navigator.userAgent)&&t.tagName=="CODE"&&t.parentNode.tagName=="PRE"){s=t.parentNode;var p=document.createElement("div");p.innerHTML="
"+y.value+"
hljs.initHighlightingOnLoad(); </script>
Weight Lifting Class ("classe") Prediction
Yu Fang, 06/2014
- Initialization
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
- Read data from csv files
# Load the raw training and testing sets from CSV.
# NOTE: use the full argument name `header` -- the original `head=TRUE` is not
# a real read.csv argument and only worked via R's partial argument matching.
# (read.csv already returns a data.frame; the wrapper is redundant but harmless.)
training=data.frame(read.csv(file="/Volumes/WATERMELON/Study/practicalMachinLearning/pml-training.csv",header=TRUE,sep=","))
testing=data.frame(read.csv(file="/Volumes/WATERMELON/Study/practicalMachinLearning/pml-testing.csv",header=TRUE,sep=","))
- Data cleaning and preprocessing
# Treat empty strings as missing, then keep only the columns with no missing
# values at all. This leaves 60 columns in each data set.
training[training == ""] <- NA
keepTrainCols <- colSums(is.na(training)) == 0
training <- training[, keepTrainCols]
testing[testing == ""] <- NA
keepTestCols <- colSums(is.na(testing)) == 0
testing <- testing[, keepTestCols]
# Detect near-zero-variance covariates with caret's nearZeroVar.
nzvMetrics <- nearZeroVar(training, saveMetrics = TRUE)
# "new_window" is flagged nzv = TRUE: the training set has 19216 "no" vs 406
# "yes", while the testing set is all "no". Keep only new_window == "no" rows.
useTrain <- training[which(training$new_window == "no"), ]
# Drop identifiers and timestamps that carry no predictive signal, plus
# "new_window" from the step above. The same logical mask is reused for the
# testing set (columns are in the same order in both frames).
dropCols <- names(useTrain) %in% c("X", "cvtd_timestamp", "raw_timestamp_part_1", "raw_timestamp_part_2", "new_window")
useTrain <- useTrain[!dropCols]
useTest <- testing[!dropCols]
# There may be between-subject variance, so dummy-code "user_name" in both sets
# (the training formula targets "classe", the testing one "problem_id").
trainDummies <- dummyVars(classe ~ user_name, data = useTrain)
trainDummyCols <- predict(trainDummies, newdata = useTrain)
useTrain <- cbind(trainDummyCols, useTrain)
testDummies <- dummyVars(problem_id ~ user_name, data = useTest)
testDummyCols <- predict(testDummies, newdata = useTest)
useTest <- cbind(testDummyCols, useTest)
# The original factor column is now redundant; drop it from both sets.
dropUser <- names(useTrain) %in% c("user_name")
useTrain <- useTrain[!dropUser]
useTest <- useTest[!dropUser]
# Center and scale the predictors, then apply PCA keeping enough components to
# explain 95% of the variance. Column 60 is the outcome "classe" and is
# excluded from the predictor matrix.
preProc <- preProcess(useTrain[, -60], method = c("center", "scale", "pca"), thresh = 0.95)
trainPC <- predict(preProc, useTrain[, -60])
# Print the preprocessing summary -- 25 principal components are retained.
preProc
##
## Call:
## preProcess.default(x = useTrain[, -60], method = c("center",
## "scale", "pca"), thresh = 0.95)
##
## Created from 19216 samples and 59 variables
## Pre-processing: centered, scaled, principal component signal extraction
##
## PCA needed 25 components to capture 95 percent of the variance
- Fit the model and make predictions.
#Use 10-fold cross validation.
# NOTE: arguments are now fully named -- the original call
# trainControl("cv",10,savePred=T) relied on positional matching, on partial
# matching of `savePredictions`, and on `T` (which is reassignable in R)
# instead of the keyword TRUE.
tc=trainControl(method="cv",number=10,savePredictions=TRUE)
#This problem is to classify the quality of a weight-lifting execution from
#many measured parameters, so a tree-based classifier is more natural than a
#linear regression model, and high accuracy is desired. A random forest is
#therefore fitted on the principal components, with the outcome taken from
#useTrain$classe.
rfModelFit=train(useTrain$classe ~ .,method="rf",data=trainPC,trControl=tc)
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
#The fitted model results summary
rfModelFit
## Random Forest
##
## 19216 samples
## 24 predictors
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 17296, 17293, 17295, 17296, 17293, 17294, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 1 1 0.002 0.002
## 10 1 1 0.003 0.003
## 20 1 1 0.004 0.005
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
- Predict the testing data
# Apply the same centering/scaling/PCA transform to the testing predictors
# (column 60 is "problem_id", not a predictor), then classify with the fitted
# random forest.
testPC <- predict(preProc, useTest[, -60])
pred <- predict(rfModelFit, testPC)
# Show the predicted classes for the 20 testing cases.
pred
## [1] B A C A A E D B A A B C B A E E A B B B
## Levels: A B C D E