We setyp the simulation with the number of observations, the number os simulations, the standard deviation of \(x\), and the number of folds.

library(splines)
set.seed(333)
N <- 5000
n.sim <- 100
SD.x <- 4
fold <- 5

We set up the true data generating process and plot the relationship between \(x\) and \(y\).

x <- runif(N,-3,3)
e <- rnorm(N, sd=SD.x)
y <-  x + 3* x^2 +.3*x^3 -.3*x^4 +.03*x^5 + 0.003*x^6+ e
plot(x,y, pch=19, cex=0.7, col=rgb(180,0,180,30, maxColorValue = 255))

We set up the range of degrees of freedom to loop over and a container that holds the MSE’s for each loop iteration.

freedom <- c(1:10)
mse.container <- array(NA,c(n.sim,length(freedom), fold))
startL <- Sys.time()

We loop over the different combinations of parameters we vary and simulations.

for (j in 1:length(freedom)){
  for (i in 1:n.sim){
    x <- runif(N,-3,4)
    e <- rnorm(N, sd=SD.x)
    y <-  x + 3* x^2 +.3*x^3 -.3*x^4 +.03*x^5 + 0.003*x^6+ e
    fold.id <- sample(c(1:fold),N, replace = TRUE)
    for (k in 1:fold){
      test.set <- which(fold.id==k)
      y.test <- y[test.set]
      y1 <- y[-test.set]
      x1 <- x[-test.set]
      mod1 <- lm(y1 ~ ns(x1,df=freedom[j]))  
      X1 <- (x[test.set])
      y.hat <- coef(mod1) %*% t(as.matrix(cbind(1,ns(X1,df = freedom[j])))) 
      mse.container[i,j,k] <- mean((y.test-y.hat)^2)
      }
    }
  print(j)  
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

we record how much time it took.

endL <- Sys.time()
endL - startL
## Time difference of 41.85669 secs

We get MSE and plot it with the degrees of freedom on the \(x\)-axis.

MSE <- apply(mse.container,c(1,2), mean)
MSE1 <- colMeans(MSE)
plot(freedom, MSE1, type="b")