Poisson and Negative Binomial Regression

 

# The purpose of this session is to show you how to use R's procedures for count models

# including Poisson and Negative Binomial Regression. We also show how to do various tests

# for overdispersion and for discriminating between models.

 

library(car)

library(stats)

library(MASS)

 

# Now let's read in the data. Note that there are some missing data defined by ".". R expects

# missing data to be coded NA. So we have changed the missing value code on input here.

 

rm(list = ls())

ShipAccidents <- read.table("C:/users/Wood/Documents/My Teaching/Maximum Likelihood/Data/ShipAccidents.txt",

    header=TRUE, na.strings=".")

attach(ShipAccidents)

 

# Let's go ahead and remove the missing values from the dataset.

 

detach(ShipAccidents)

ShipAccidents <- na.omit(ShipAccidents)

attach(ShipAccidents)

 

# First let's do some descriptive statistics on the data just to confirm what is

# obvious from looking at the data (there are a lot of zeros).

 

summary(ShipAccidents)

library(e1071)

skewness(Acc)

kurtosis(Acc)

 

qqPlot(Acc, dist= "norm", labels=FALSE)

 

# These are discrete data, so let's do a histogram of their distribution.

# And, just for fun, let's also superimpose non-parametric regression lines.

 

hist(Acc, probability=T, ylab='Density')

lines(density(Acc), lwd=2)

points(Acc, rep(0, length(Acc)), pch="|")

box()

lines(density(Acc, adjust=.5), lwd=1)

 

# Note that the discrete data are non-normal and cannot be approximated by a normal distribution.

# They are both skewed and kurtotic. This implies the need for some model other than a normal

# distribution. Which model? is the next question. We'll attempt to answer this question later.

 

# Let's begin by doing the estimation using the Poisson model. Again, we use the Zelig interface

# to R. Replicating McCullaugh and Nelder, we include months of service logged in the analysis

 

ShipAccidents$Logmonths <- log(Months)

detach(ShipAccidents)

attach(ShipAccidents)

 

library(Zelig)

poisson.out <- zelig(Acc ~ TB + TC + TD + TE + T6569+ T7074 + T7579 + O7579 + Logmonths,

    model = "poisson", data = ShipAccidents)

summary(poisson.out)

 

poissonsummary.out <- summary(poisson.out)

 

# McCullagh and Nelder assert that the coefficient on Logmonths is known to be 1. Let's do a

# Wald test for whether the coefficient on Logmonths is consistent with their assertion.

 

b <- poisson.out$coefficients

v <- poissonsummary.out$cov.unscaled

r <- rbind(c(0,0,0,0,0,0,0,0,0,1))

q <- 1

j <- nrow(r)

Wald <- t(r%*%b-q) %*% solve (r %*% v %*% t(r)) %*% (r%*%b-q)

Wald

Waldpvalue <- 1-pchisq(Wald, df=j)

Waldpvalue

 

# Based on the Wald statistic we are unable to reject the null that the coefficients on Logmonths

# is one.

 

# Note, McCullaugh and Nelder assert that the coefficient on Logmonths is known

# to be 1. In a sense, they cheat by not taking into account the sampling variability

# of this coefficient. However, to replicate their results we need to fix the coefficient

# on Logmonths at 1. This can be done using the offset command. Note that you can also

# restrict to other values than 1 by making a scalar transformation of the variable to

# be restricted. Note that R does not report a coefficient for the fixed parameter.

 

poissonmn.out <- zelig(Acc ~ TB + TC + TD + TE + T6569+ T7074 + T7579 + O7579 + offset(Logmonths),

    model = "poisson", data = ShipAccidents)

summary(poissonmn.out)

 

poissonmnsummary.out <- summary(poissonmn.out)

 

# Let's do some interpretation of the results. First, the hard way.

# Get the vector of coefficients and data matrix

 

x <- as.matrix(cbind(1,ShipAccidents[3:6],ShipAccidents[8:10],ShipAccidents[12:12],

    ShipAccidents[15:15]))

y <- as.vector(Acc)

beta <- c(poissonmn.out$coefficients,1)

beta

 

# Predicted accidents for each observation.

 

lambda <- exp(x %*% beta)

lambda

 

# Predicted accidents with all independent variables at their means.

 

xbar <- as.matrix(mean(cbind(1,ShipAccidents[3:6],ShipAccidents[8:10],ShipAccidents[12:12],

    ShipAccidents[15:15])))

Expectation <- exp(t(xbar) %*% beta)

Expectation

 

# We could use the preceding to do a first differences analysis. However, let's use Zelig

# to do the work for us. First, set all variables at their means.

 

means.out <- setx(poisson.out)

means.out

 

# Simulate expected value as above. Compare the results to the preceding.

 

fitted.out <- sim(poisson.out, x = means.out)

summary(fitted.out)

 

# Simulated first difference when shifting Logmonths one standard deviation above the mean

 

Logmonths.low <- setx(poisson.out, Logmonths = mean(Logmonths))

Logmonths.high <- setx(poisson.out, Logmonths = mean(Logmonths) + sd(Logmonths))

fd.Logmonth <- sim(poisson.out, x = Logmonths.low, x1 = Logmonths.high)

summary(fd.Logmonth)

 

# Now let's test the hypothesis that the period of manufacture has no effect on accidents.

# Let's use a Wald statistic.

 

r <- rbind(c(0,0,0,0,0,1,0,0,0),c(0,0,0,0,0,0,1,0,0),c(0,0,0,0,0,0,0,1,0))

q <- c(0,0,0)

b <- poissonmn.out$coefficients

v <- poissonmnsummary.out$cov.unscaled

j <- nrow(r)

Wald <- t(r%*%b-q) %*% solve (r %*% v %*% t(r)) %*% (r%*%b-q)

Wald

Waldpvalue <- 1-pchisq(Wald, df=j)

Waldpvalue

 

# We can reject the null hypothesis that the period of manufacture has no effect.

 

# We can also do this test using a likelihood ratio approach. Re-estimate the model leaving out

# the period effects.

 

poissonR.out <- zelig(Acc ~ TB + TC + TD + TE + O7579 + Logmonths,

    model = "poisson", data = ShipAccidents)

summary(poissonR.out)

names(poissonR.out)

 

# Now construct the likelihood ratio test

 

j <- poissonR.out$df.residual - poissonmn.out$df.residual

j

lu <- -.5*poissonmn.out$deviance

lr <- -.5*poissonR.out$deviance

lrtest <- -2*(lr - lu)

lrtest

lrpvalue <- 1-pchisq(lrtest, df=j)

lrpvalue

 

# We generated lambda above as the model predictions. Let's also generate the set of errors.

 

err <- Acc-lambda

err

 

# McCullagh and Nelder also assert that the Poisson model exhibits overdispersion. Let's test

# for overdispersion using a regression based approach as suggested by Cameron and Trivedi

 

z <- ((Acc-lambda)^2-Acc)/(sqrt(2)*lambda)

 

# First the regression on a constant

 

test1 <- lm(z ~ 1)

summary(test1)

 

# Now the regression without a constant

 

test2 <- lm(z ~ lambda -1)

summary(test2)

 

# Since neither t-statistic is significant, there is no evidence of overdispersion in the data

# using a regression based approach.

 

# We can also test for overdispersion using the more general approach suggested on pp. 743-44

# of Greene.

 

ei2 <- as.vector(err*err)

vi <- as.vector(ei2 - lambda)

vi2 <- as.vector(vi*vi)

eivi <- as.vector(err*vi)

zi <- cbind(TB,TC,TD,TE,T6569,T7074,T7579,O7579,Logmonths)

mm <- t(zi) %*% diag(vi2) %*% zi

dd <- t(x) %*% diag(ei2) %*% x

md <- t(zi) %*% diag(eivi) %*% x

q <- mm - md %*% solve(dd) %*% t(md)

r <- t(zi) %*% vi

cm <- t(r) %*% solve(q) %*% r

cm

 

# CM is a chi square statistic with k degrees of freedom, where k is the number of regressors.

# Here we reject the null that the variance is unrelated to the the regressors in a way that is not

# completely accounted for by the expected value of y. In other words, we have evidence

# of heterogeneity due to the regressors. We also reject the null that the mean equals the variance.

 

# Now let's test for overdispersion using a LaGrange Multiplier test as given by

# Green on p. 744.This is a test for poisson versus negative binomial.

 

mean(Acc)

 

ybar <- mean(Acc)

nybar <- nrow(x) * ybar

lmtest <- (t(err) %*% err-nybar)^2/(2*t(lambda) %*% lambda)

lmtest

 

# The LaGrange Multiplier statistic is Chi Squared with 1 degree of freedom.  The test is

# non-significant, so no evidence of overdispersion for this test.

 

# We can also test for overdispersion in the context of the Negative Binomial model. If

# the reported heterogeneity parameter is significant then this is evidence for overdispersion.

 

startvalues <- poissonmn.out$coefficients

negbinmn.out <- zelig(Acc ~ TB + TC + TD + TE + T6569+ T7074 + T7579 + O7579 + offset(Logmonths),

    model = "negbin", start = startvalues, maxit=100, data = ShipAccidents)

summary(negbinmn.out)

 

# In this case the estimation does not converge and no overdispersion parameter is reported.

# This does not bode well for McCullaugh and Nelder's analysis.

 

# All of the preceding interpretational tools using Zelig can also be applied to the

# negative binomial model in straight forward fashion. We leave that as an exercise.

 

# There are library packages in R that will also estimate other models of heterogeneity

# in count processes such as the zero inflated poisson or negative binomial.