Prediction Interval for Regression Models with Weights

by Alexey Burnakov   Last Updated November 14, 2017 18:19

Courtesy Prediction Interval for Neural Net With Hessian :: nnet in R

I would like to understand how to manually derive a point-wise prediction confidence interval applicable to neural models.

Following http://cdn.intechopen.com/pdfs-wm/14915.pdf Confidence Intervals for Neural Networks and Applications to Modeling Engineering Materials,

my first aim was to construct a CI based on the Jacobian matrix related to model weights, using a simple lm model in R. By doing so I hope to get a comparable figure and then generalize it to non-linear (multilayer) set.

Below is my attempt. I may be misunderstanding the very core of these calculations, because what I got did not match what I expected.

I am mostly concerned with my possible misunderstanding of the Jacobian-based interval formulas from the linked paper (the four equation images did not survive extraction here).

Question 1: does the Jacobian here relate to the partial derivatives of the neural network output a, rather than to a loss function of a?

Question 2: similarly, when it reads

the gradient of function a

does it relate to the neural network output rather than its loss?

rm(list=ls())

# load libraries ----------------------------------------------------------

list.of.packages <- c('data.table',
                      'numDeriv'
)
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

library(numDeriv)
library(data.table)


## create dummy data

nrows <- 1000

dat <- data.frame(x1 = rnorm(nrows, 0, 1)
                  , x2 = rnorm(nrows, 0, 1)
                  , x3 = rnorm(nrows, 0, 1)
                  )

dat$y = dat$x1 * 1 + 
     dat$x2 * 1 + 
     dat$x3 * 1 +
     rnorm(nrows, 0, 5)

lm_model <- lm(y ~ . -1, data = dat)

summary(lm_model)


## create function a (model output | W)

rm(fit_function)

fit_function <- function(x)
{

     ww <- x

     dat_copy <- as.data.table(copy(dat))

     dat_copy[, (paste0('w', seq(1, 3, 1))) := lapply(1:3, function(x) ww[[x]])]

     dat_copy[, y_hat := Reduce(`+`, Map(
                                        function(x, y) x * y, .SD[, paste0('x', seq(1, 3, 1)), with = F], .SD[, paste0('w', seq(1, 3, 1)), with = F]
                                         )
                                )
              ]

     # y <- mean(dat_copy[, (y - y_hat) ^ 2])

     # X <- as.matrix(dat_copy[, y - y_hat])
     # 
     # Xt <- t(X)
     # 
     # XtX <- X %*% Xt
     # 
     # sum_XtX <- sum(XtX) / 2

     y <- unlist(dat_copy[, y_hat])

     return(y)

}


## create function a (model output for one line | W)

rm(fit_function2)

fit_function2 <- function(x)
{

     ww <- x

     dat_copy <- as.data.table(copy(dat))

     dat_copy[, (paste0('w', seq(1, 3, 1))) := lapply(1:3, function(x) ww[[x]])]

     dat_copy[, y_hat := Reduce(`+`, Map(function(x, y)
          x * y, .SD[, paste0('x', seq(1, 3, 1)), with = F], .SD[, paste0('w', seq(1, 3, 1)), with = F]))]

     y <- unlist(dat_copy[1, y_hat])

     return(y)

}

Ww <- as.numeric(lm_model$coefficients) #I am using exactly fitted weights

Jac_val <- numDeriv::jacobian(
     func = fit_function
     , x = Ww
     , method = "Richardson"
     , side = NULL
     )

T_Jac <- t(Jac_val)

XtX <- T_Jac %*% Jac_val

inv_X <- solve(XtX) #inversed matrix


## lets try derive a CI

t_stats <- 3

dat_copy <- as.data.table(copy(dat))

dat_copy[, (paste0('w', seq(1, 3, 1))) := lapply(1:3, function(x) Ww[[x]])]

dat_copy[, y_hat := Reduce(`+`, Map(function(x, y) x * y, .SD[, paste0('x', seq(1, 3, 1)), with = F], .SD[, paste0('w', seq(1, 3, 1)), with = F]))]

y_hat <- unlist(dat_copy[1, y_hat])

S <- sqrt(sum(dat_copy[, (y - y_hat) ^ 2]) / (nrow(dat) - (ncol(dat) - 1)))

Grad_val <- numDeriv::grad(
          func = fit_function2
          , x = Ww
          , method = "Richardson"
          , side = NULL
          )

StErr <- S * as.numeric(sqrt(1 + t(Grad_val) %*% inv_X %*% Grad_val))

preds <- stats::predict.lm(
     lm_model
     , newdata = dat[, 1:3]
     , se.fit = T
     , interval = "prediction"
)

StErr_lm <- preds$se.fit[[1]]

plot(preds$se.fit, type = 'l'); lines(preds$se.fit, type = 'p', col = 'red')

There is a clear mismatch between what I have computed and what lm reports as the SE:

> StErr
[1] 4.962455
> StErr_lm
[1] 0.228429


Related Questions