# Document setup: load knitr and make every figure 8 x 8 inches.
library(knitr)
opts_chunk$set(fig.width = 8, fig.height = 8)
This is the course project for the Coursera Practical Machine Learning class.
# Load the caret modelling framework and fix the RNG seed so the data
# partitions and model fits below are reproducible.
library(caret)
set.seed(32343)

# Read the Weight Lifting Exercise training data; empty strings, "NA",
# and spreadsheet division errors are all treated as missing values.
missing_tokens <- c("", "NA", "#DIV/0!")
wle_data <- read.csv("data/pml-training.csv", na.strings = missing_tokens)
# Keep only columns with no missing values, then drop the first seven
# bookkeeping columns (row id, user name, timestamps, window markers),
# which carry no sensor signal.
predictors <- colnames(wle_data)
predictors <- predictors[colSums(is.na(wle_data)) == 0]
predictors <- predictors[-(1:7)]
# nsv <- nearZeroVar(wle_data[, predictors])
# predictors <- predictors[-nsv]

# Distinct outcome labels and one plot colour per class.
classes <- unique(wle_data$classe)
# BUG FIX: since R 4.0 read.csv() returns `classe` as character, so
# as.integer(classes) produced NAs (with a coercion warning) and the
# confusion-matrix plots lost their class colours. Coercing through
# factor() yields stable integer codes whether the column is character
# or factor.
class_colors <- 1 + as.integer(factor(classes))

# Shared resampling scheme for all train() calls: 5-fold CV, one repeat.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 1,
                           verboseIter = FALSE)
# Split the data 70/30 into a build set and a held-out validation set,
# then split the build set again 70/30 into training and testing. The
# validation set is only touched for the final ensemble assessment.
inBuild <- createDataPartition(y = wle_data$classe, p = 0.7, list = FALSE)
validation <- wle_data[-inBuild, predictors]
buildData <- wle_data[inBuild, predictors]

inTrain <- createDataPartition(y = buildData$classe, p = 0.7, list = FALSE)
training <- buildData[inTrain, ]
testing <- buildData[-inTrain, ]

# Drop the full data set and intermediates, then quietly reclaim memory.
rm(buildData, wle_data, inBuild, inTrain)
clean <- gc(FALSE)
rm(clean)
# Fit a single CART decision tree on the training split and draw the
# resulting tree.
library(rattle)
modeltree <- train(classe ~ ., data = training, method = "rpart",
                   trControl = fitControl)
fancyRpartPlot(modeltree$finalModel)
# Score the decision tree on the testing split and report its
# confusion matrix, both graphically and as a per-class table.
predicttree <- predict(modeltree, newdata = testing)
cmtree <- confusionMatrix(predicttree, testing$classe)
tree_title <- paste("Decision Tree Confusion Matrix: Accuracy=",
                    round(cmtree$overall["Accuracy"], 2))
plot(cmtree$table, col = class_colors, main = tree_title)
kable(cmtree$byClass, digits = 2, caption = "Per Class Metrics")
 | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | Prevalence | Detection Rate | Detection Prevalence | Balanced Accuracy |
---|---|---|---|---|---|---|---|---|
Class: A | 0.62 | 0.96 | 0.87 | 0.86 | 0.28 | 0.18 | 0.20 | 0.79 |
Class: B | 0.46 | 0.87 | 0.46 | 0.87 | 0.19 | 0.09 | 0.19 | 0.66 |
Class: C | 0.77 | 0.84 | 0.51 | 0.94 | 0.17 | 0.13 | 0.26 | 0.80 |
Class: D | 0.49 | 0.78 | 0.31 | 0.89 | 0.16 | 0.08 | 0.26 | 0.63 |
Class: E | 0.43 | 1.00 | 0.98 | 0.89 | 0.18 | 0.08 | 0.08 | 0.71 |
# Fit a linear discriminant analysis model, score it on the testing
# split, and report its confusion matrix and per-class metrics.
modellda <- train(classe ~ ., data = training, method = "lda",
                  trControl = fitControl)
predictlda <- predict(modellda, newdata = testing)
cmlda <- confusionMatrix(predictlda, testing$classe)
lda_title <- paste("LDA Confusion Matrix: Accuracy=",
                   round(cmlda$overall["Accuracy"], 2))
plot(cmlda$table, col = class_colors, main = lda_title)
kable(cmlda$byClass, digits = 2, caption = "Per Class Metrics")
 | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | Prevalence | Detection Rate | Detection Prevalence | Balanced Accuracy |
---|---|---|---|---|---|---|---|---|
Class: A | 0.84 | 0.91 | 0.79 | 0.93 | 0.28 | 0.24 | 0.30 | 0.87 |
Class: B | 0.65 | 0.93 | 0.70 | 0.92 | 0.19 | 0.13 | 0.18 | 0.79 |
Class: C | 0.68 | 0.90 | 0.59 | 0.93 | 0.17 | 0.12 | 0.20 | 0.79 |
Class: D | 0.71 | 0.92 | 0.64 | 0.94 | 0.16 | 0.12 | 0.18 | 0.82 |
Class: E | 0.62 | 0.98 | 0.86 | 0.92 | 0.18 | 0.11 | 0.13 | 0.80 |
# Fit a gradient boosting machine (quietly), score it on the testing
# split, and report its confusion matrix and per-class metrics.
modelgbm <- train(classe ~ ., data = training, method = "gbm",
                  trControl = fitControl, verbose = FALSE)
predictgbm <- predict(modelgbm, newdata = testing)
cmgbm <- confusionMatrix(predictgbm, testing$classe)
gbm_title <- paste("GBM Confusion Matrix: Accuracy=",
                   round(cmgbm$overall["Accuracy"], 2))
plot(cmgbm$table, col = class_colors, main = gbm_title)
kable(cmgbm$byClass, digits = 2, caption = "Per Class Metrics")
 | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | Prevalence | Detection Rate | Detection Prevalence | Balanced Accuracy |
---|---|---|---|---|---|---|---|---|
Class: A | 0.98 | 0.99 | 0.98 | 0.99 | 0.28 | 0.28 | 0.29 | 0.99 |
Class: B | 0.94 | 0.99 | 0.95 | 0.99 | 0.19 | 0.18 | 0.19 | 0.97 |
Class: C | 0.97 | 0.98 | 0.93 | 0.99 | 0.17 | 0.17 | 0.18 | 0.98 |
Class: D | 0.95 | 0.99 | 0.96 | 0.99 | 0.16 | 0.16 | 0.16 | 0.97 |
Class: E | 0.95 | 1.00 | 0.98 | 0.99 | 0.18 | 0.18 | 0.18 | 0.97 |
# Stack the three base learners: train a random-forest combiner on their
# testing-split predictions, then score the stacked model on the
# untouched validation set.
predicttesting <- data.frame(predicttree, predictgbm, predictlda,
                             classe = testing$classe)
modelensemble <- train(classe ~ ., data = predicttesting, method = "rf")

# Column names must match the combiner's training frame exactly.
predictvalidation <- data.frame(
  predicttree = predict(modeltree, newdata = validation),
  predictgbm  = predict(modelgbm, newdata = validation),
  predictlda  = predict(modellda, newdata = validation),
  classe      = validation$classe
)
predictensemble <- predict(modelensemble, predictvalidation)
cmensemble <- confusionMatrix(predictensemble, validation$classe)
ens_title <- paste("Ensemble Confusion Matrix: Accuracy=",
                   round(cmensemble$overall["Accuracy"], 2))
plot(cmensemble$table, col = class_colors, main = ens_title)
kable(cmensemble$byClass, digits = 2, caption = "Per Class Metrics")
 | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | Prevalence | Detection Rate | Detection Prevalence | Balanced Accuracy |
---|---|---|---|---|---|---|---|---|
Class: A | 0.98 | 0.99 | 0.98 | 0.99 | 0.28 | 0.28 | 0.28 | 0.99 |
Class: B | 0.95 | 0.98 | 0.94 | 0.99 | 0.19 | 0.18 | 0.20 | 0.97 |
Class: C | 0.96 | 0.99 | 0.93 | 0.99 | 0.17 | 0.17 | 0.18 | 0.97 |
Class: D | 0.95 | 0.99 | 0.97 | 0.99 | 0.16 | 0.16 | 0.16 | 0.97 |
Class: E | 0.97 | 1.00 | 0.99 | 0.99 | 0.18 | 0.18 | 0.18 | 0.98 |
# Random forest restricted to the seven most informative sensor
# features, scored on the testing split.
# CONSISTENCY FIX: pass the shared fitControl (5-fold repeated CV) used
# by every other train() call in this document; the original omitted it
# and silently fell back to caret's default 25-repetition bootstrap,
# which is slower and makes its resampling estimates incomparable with
# the other models'.
modelrf <- train(classe ~ roll_belt + pitch_forearm + magnet_dumbbell_z +
                   yaw_belt + magnet_dumbbell_y + roll_forearm + pitch_belt,
                 data = training, method = "rf", ntree = 100,
                 trControl = fitControl)
predictrf <- predict(modelrf, newdata = testing)
cmrf <- confusionMatrix(predictrf, testing$classe)
plot(cmrf$table, col = class_colors,
     main = paste("Random Forest Confusion Matrix: Accuracy=",
                  round(cmrf$overall["Accuracy"], 2)))
kable(cmrf$byClass, digits = 2, caption = "Per Class Metrics")
 | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | Prevalence | Detection Rate | Detection Prevalence | Balanced Accuracy |
---|---|---|---|---|---|---|---|---|
Class: A | 0.99 | 1.00 | 0.99 | 0.99 | 0.28 | 0.28 | 0.28 | 0.99 |
Class: B | 0.96 | 0.99 | 0.98 | 0.99 | 0.19 | 0.19 | 0.19 | 0.98 |
Class: C | 0.99 | 0.99 | 0.96 | 1.00 | 0.17 | 0.17 | 0.18 | 0.99 |
Class: D | 0.99 | 1.00 | 0.98 | 1.00 | 0.16 | 0.16 | 0.17 | 0.99 |
Class: E | 0.98 | 1.00 | 0.99 | 1.00 | 0.18 | 0.18 | 0.18 | 0.99 |