# LOOCV

LOOCV (leave one out cross validation)

Download this excellent book: http://www-bcf.usc.edu/~gareth/ISL/ and search “LOOCV” or just scroll to section 5.1.2. on p. 178.

Here’s the dataset we’re using: http://www-bcf.usc.edu/~gareth/ISL/Auto.csv

`# LOOCV (leave one out cross validation)`

# Install boot only if it isn't already available -- re-running the script
# shouldn't re-download the package every time.
if (!requireNamespace("boot", quietly = TRUE)) {
  install.packages("boot")
}

# library() errors immediately if loading fails; require() only returns FALSE,
# which lets the script limp along and fail later with a confusing message.
library(boot)

?cv.glm  # help page for the cross-validation function used below

```
# cv.glm(data, glmfit, cost, K)

# this runs k-fold cross validation. When k = the number of observations in your dataset, then that's LOOCV

# to run LOOCV, set k=n or just don't specify (its default is k=n)

# cost specifies how to evaluate how good the model is; the default is the average squared error function.

# it requires a glm object as input, so you need to run your model first and then validate it.

```
# Auto.csv encodes missing horsepower as "?", which is why the column was
# being read in as a factor. Tell read.csv about it so horsepower comes in
# numeric from the start.
Auto <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv",
                 na.strings = "?")
str(Auto)
# BUG in the old version: as.numeric() on a factor returns the factor LEVEL
# CODES (1, 2, 3, ...), NOT the original horsepower values. If you ever do
# need to convert a factor, use as.numeric(as.character(x)).
# Drop the handful of rows with missing horsepower so glm() and cv.glm()
# both see the same complete dataset.
Auto <- Auto[!is.na(Auto$horsepower), ]
# run a simple regression, predicting mpg from horsepower
model1 <- glm(mpg ~ horsepower, data = Auto)
summary(model1)
# okay, now let's cross-validate that model.
# note: this takes a long time! it's running nearly 400 models (LOOCV fits
# one model per observation). if you want it to be faster, you can set K to
# something smaller than n.
loocv1 <- cv.glm(data = Auto, glmfit = model1)
# delta has two items: [1] is the raw cross-validation estimate of prediction
# error; [2] is the adjusted estimate, which compensates for the bias
# introduced by not using leave-one-out CV. Since we ran LOOCV here, the two
# values should be nearly identical.
loocv1$delta
# okay great. we have an estimate of model fit for the simple linear
# regression. let's play around with adding polynomial terms and see if we
# can improve the fit.
# poly() is a handy way to add polynomial terms. Instead of copy-pasting six
# nearly-identical glm() calls, loop over the degrees. Each formula is built
# as text so the degree is inlined literally (model3's formula really is
# mpg ~ poly(horsepower, 3), not a reference to a loop variable), and
# assign() stores each fit under the usual name (model2, ..., model6) so the
# plotting code below is unchanged.
for (degree in 2:6) {
  fml <- as.formula(paste0("mpg ~ poly(horsepower, ", degree, ")"))
  assign(paste0("model", degree), glm(fml, data = Auto))
}
# i'm going to switch from LOOCV to a 20-fold cv, to save a little processing
# time. K-fold CV partitions the data at random, so fix the RNG seed to make
# the error estimates reproducible from run to run.
set.seed(1)
# cross-validate all six models in one pass instead of copy-pasting the
# cv.glm call six times. delta[2] is the bias-adjusted CV estimate of the
# average squared prediction error (same quantity the repeated code pulled).
models <- list(model1, model2, model3, model4, model5, model6)
squared_errs <- vapply(
  models,
  function(m) cv.glm(data = Auto, glmfit = m, K = 20)$delta[2],
  numeric(1)
)
# this is a vector of the squared error for each model
squared_errs
# let's plot it to get a sense for how fit changes as a function of polynomial order
plot(squared_errs)
# looks like there's a big drop in error when we go from 2nd order to 3rd
# order (i.e. allowing a cubic function)
# that makes sense, given the data:
plot(Auto$mpg ~ Auto$horsepower)
# looks cubic
# pretty plotting time :)
library(ggplot2)
library(RColorBrewer)
p_scatter <- ggplot(data = Auto, aes(x = horsepower, y = mpg)) +
  geom_point()
p_scatter
# to plot the lines from each function, use predict() to get the predicted
# values from each regression equation. Each model's fitted values go into a
# column named after the model (Auto$model1 ... Auto$model6).
for (i in seq_len(6)) {
  Auto[[paste0("model", i)]] <- predict(get(paste0("model", i)))
}
# pick a palette
display.brewer.all()
# take just the 6 colors from the two ends of Spectral (they're brighter)
colors <- brewer.pal(9, "Spectral")[c(1:3, 7:9)]
# how to check out a color palette: draw it as a pie chart
n <- 6 # the number of colors to plot
pie(rep(1, n), col = colors)
# plot all of the 6 models on the data. notice that the model doesn't improve much after cubic.
# Build the plot incrementally: start from the scatter, layer on one fitted
# curve per model, then attach the legend title and print.
p <- ggplot(data = Auto, aes(x = horsepower, y = mpg)) +
  geom_point()
p <- p + geom_line(aes(x = horsepower, y = model1, colour = "linear"), size = 2, alpha = .5)
p <- p + geom_line(aes(x = horsepower, y = model2, colour = "quadratic"), size = 2, alpha = .5)
p <- p + geom_line(aes(x = horsepower, y = model3, colour = "cubic"), size = 2, alpha = .5)
p <- p + geom_line(aes(x = horsepower, y = model4, colour = "quartic"), size = 2, alpha = .5)
p <- p + geom_line(aes(x = horsepower, y = model5, colour = "fiveish"), size = 2, alpha = .5)
p <- p + geom_line(aes(x = horsepower, y = model6, colour = "sixtacular"), size = 2, alpha = .5)
p + guides(color = guide_legend(title = "model"))
```