LOOCV (leave-one-out cross-validation)
Download this excellent book: http://www-bcf.usc.edu/~gareth/ISL/ and search “LOOCV”, or just scroll to section 5.1.2 on p. 178.
Here’s the dataset we’re using: http://www-bcf.usc.edu/~gareth/ISL/Auto.csv
# LOOCV (leave one out cross validation)
install.packages("boot") # only needed once
library(boot)
?cv.glm
# cv.glm(data, glmfit, cost, K)
# this runs K-fold cross-validation. When K equals the number of observations in your dataset, that's LOOCV
# to run LOOCV, set K=n or just don't specify it (the default is K=n)
# cost specifies how to score the model's predictions; the default is the average squared error function (an example of a custom cost is below)
# it requires a glm object as input, so you need to run your model first and then validate it.
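# a note on the cost argument: it's a function of two arguments -- the observed responses
# and the cross-validated predictions -- and you only need to supply it when average
# squared error isn't the right yardstick. for instance, with a logistic regression you
# could score misclassification rate instead. this example is adapted from the ?cv.glm help page:
mis_cost <- function(r, pi) mean(abs(r - pi) > 0.5)
# you'd then pass it in like: cv.glm(data, glmfit, cost=mis_cost, K=10)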
Auto <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv")
str(Auto) # uh-oh. it interpreted horsepower as a factor, but it should be numeric
# (the culprit: missing values in that column are coded as "?")
# careful: as.numeric() on a factor returns the underlying level codes, not the values,
# so convert via character first, then drop the handful of rows that become NA
Auto$horsepower <- as.numeric(as.character(Auto$horsepower))
Auto <- na.omit(Auto)

# run a simple regression, predicting mpg from horsepower
model1 <- glm(mpg ~ horsepower, data=Auto)
summary(model1)

# okay, now let's cross-validate that model.
# note: this takes a long time! it's running nearly 400 models.
# if you want it to be faster, you can set K to something smaller than n
loocv1 <- cv.glm(data=Auto, glmfit=model1)

# the only thing you really need in this output is an object called delta, which has two items in it.
# the first component is the raw cross-validation estimate of prediction error.
# the second component is the adjusted cross-validation estimate; the adjustment is designed
# to compensate for the bias introduced by not using leave-one-out cross-validation.
# since we ran LOOCV, the two values should be super similar.
loocv1$delta

# okay great. we have an estimate of model fit for the simple linear regression.
# let's play around with adding polynomial terms and see if we can improve the fit.
# there's a handy way to add polynomial terms using poly()
model2 <- glm(mpg ~ poly(horsepower, 2), data=Auto)
model3 <- glm(mpg ~ poly(horsepower, 3), data=Auto)
model4 <- glm(mpg ~ poly(horsepower, 4), data=Auto)
model5 <- glm(mpg ~ poly(horsepower, 5), data=Auto)
model6 <- glm(mpg ~ poly(horsepower, 6), data=Auto)

# I'm going to switch from LOOCV to a 20-fold cv, to save a little processing time
squared_errs <- rep(0, 6)
loocv <- cv.glm(data=Auto, glmfit=model1, K=20)
squared_errs[1] <- loocv$delta[2]
loocv <- cv.glm(data=Auto, glmfit=model2, K=20)
squared_errs[2] <- loocv$delta[2]
loocv <- cv.glm(data=Auto, glmfit=model3, K=20)
squared_errs[3] <- loocv$delta[2]
loocv <- cv.glm(data=Auto, glmfit=model4, K=20)
squared_errs[4] <- loocv$delta[2]
loocv <- cv.glm(data=Auto, glmfit=model5, K=20)
squared_errs[5] <- loocv$delta[2]
loocv <- cv.glm(data=Auto, glmfit=model6, K=20)
squared_errs[6] <- loocv$delta[2]
# I'm sure there's a way better way to do that with a for loop -- one possible version is sketched just below
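# here's the for-loop version -- a sketch of my own, not from the book, but it does the
# same thing as the copy-and-paste block above (the numbers will wiggle a little between
# runs because the K=20 fold assignments are random):
squared_errs_loop <- rep(0, 6)
for (deg in 1:6) {
  fit <- glm(mpg ~ poly(horsepower, deg), data=Auto)
  squared_errs_loop[deg] <- cv.glm(data=Auto, glmfit=fit, K=20)$delta[2]
}
squared_errs_loop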
# this is a vector of the squared error for each model
squared_errs

# let's plot it to get a sense for how fit changes as a function of polynomial order
plot(squared_errs)
# looks like there's a big drop in error when we go from 2nd order to 3rd order (i.e. allowing a cubic function)
# that makes sense, given the data:
plot(Auto$mpg ~ Auto$horsepower) # looks cubic

# pretty plotting time :)
library(ggplot2)
library(RColorBrewer)
ggplot(data=Auto, aes(x=horsepower, y=mpg)) + geom_point()

# to plot the lines from each function, use predict() to get the predicted values from each regression equation
Auto$model1 <- predict(model1)
Auto$model2 <- predict(model2)
Auto$model3 <- predict(model3)
Auto$model4 <- predict(model4)
Auto$model5 <- predict(model5)
Auto$model6 <- predict(model6)

# pick a palette
display.brewer.all()
colors <- brewer.pal(9, "Spectral")
colors <- c(colors[1:3], colors[7:9]) # take just the 6 colors from the two ends (they're brighter)

# how to check out a color palette
n <- 6 # the number of colors to plot
pie(rep(1, n), col=colors)

# plot all of the 6 models on the data. notice that the model doesn't improve much after cubic.
ggplot(data=Auto, aes(x=horsepower, y=mpg)) +
  geom_point() +
  geom_line(aes(x=horsepower, y=model1, colour="linear"), size = 2, alpha = .5) +
  geom_line(aes(x=horsepower, y=model2, colour="quadratic"), size = 2, alpha = .5) +
  geom_line(aes(x=horsepower, y=model3, colour="cubic"), size = 2, alpha = .5) +
  geom_line(aes(x=horsepower, y=model4, colour="quartic"), size = 2, alpha = .5) +
  geom_line(aes(x=horsepower, y=model5, colour="fiveish"), size = 2, alpha = .5) +
  geom_line(aes(x=horsepower, y=model6, colour="sixtacular"), size = 2, alpha = .5) +
  guides(color=guide_legend(title="model"))
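# one last note: for a least-squares model like these, there's a shortcut that gets the
# LOOCV error from a single model fit, using the leverage (hat) values -- it's the formula
# in section 5.1.2 of the book. here's a sketch of my own (worth double-checking against
# the book's equation), which avoids refitting the model n times:
h <- hatvalues(model1) # leverage of each observation
mean(((Auto$mpg - fitted(model1)) / (1 - h))^2) # should match loocv1$delta[1], up to rounding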