dp.97 <- read.table("http://jgill.wustl.edu/data/dp.data",header=TRUE) dp.out <- glm(EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + log(VC100k96) + SOUTH + PROPDEGREE, family=poisson,data=dp.97) ci.summary <- function (in.object, alpha = 0.05,rd=4) { ci <- in.object$coef + qnorm(1-alpha/2) * t(c(-1,1) %o% summary(in.object)$coef[,2]) out.mat <- round(cbind(in.object$coefficient, summary(in.object)$coef[,2], ci),rd) dimnames(out.mat)[[2]] <- c("Coefficient","Std. Error", paste(1-alpha,"CI Lower"),paste(1-alpha,"CI Upper")) out.mat } ci.summary(dp.out) Coefficient Std. Error 0.95 CI Lower 0.95 CI Upper (Intercept) -6.3068 4.1823 -14.5040 1.8903 INCOME 0.0003 0.0001 0.0002 0.0004 PERPOVERTY 0.0690 0.0799 -0.0877 0.2257 PERBLACK -0.0950 0.0229 -0.1398 -0.0502 log(VC100k96) 0.2212 0.4427 -0.6465 1.0890 SOUTH 2.3099 0.4291 1.4689 3.1509 PROPDEGREE -19.7022 4.4679 -28.4592 -10.9452 library(faraway) bliss data(bliss) dead alive conc 1 2 28 0 2 8 22 1 3 15 15 2 4 23 7 3 5 27 3 4 modl <- glm(cbind(dead,alive) ~ conc, family=binomial, bliss); summary(modl) Deviance Residuals: 1 2 3 4 5 -0.4510 0.3597 0.0000 0.0643 -0.2045 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -2.3238 0.4179 -5.561 2.69e-08 conc 1.1619 0.1814 6.405 1.51e-10 (Dispersion parameter for binomial family taken to be 1) Null deviance: 64.76327 on 4 degrees of freedom Residual deviance: 0.37875 on 3 degrees of freedom df.residual(modl) 3 deviance(modl) 0.3787483 1-pchisq(deviance(modl),df.residual(modl)) 0.9445968 pchisq(deviance(modl),df.residual(modl),lower.tail=FALSE) 0.9445968 anova(modl,test="Chi") Analysis of Deviance Table Model: binomial, link: logit Response: cbind(dead, alive) Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev P(>|Chi|) NULL 4 64.763 conc 1 64.385 3 0.379 1.024e-15 modl2 <- glm(cbind(dead,alive) ~ conc+I(conc^2), family=binomial,bliss) summary(modl2) Deviance Residuals: 1 2 3 4 5 -0.19971 0.32415 -0.21850 0.01265 0.05132 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -2.49589 0.59869 -4.169 3.06e-05 conc 1.41018 0.61696 2.286 0.0223 I(conc^2) -0.06117 0.14319 -0.427 0.6692 (Dispersion parameter for binomial family taken to be 1) Null deviance: 64.76327 on 4 degrees of freedom Residual deviance: 0.19549 on 2 degrees of freedom anova(modl,modl2,test="Chi") Analysis of Deviance Table Model 1: cbind(dead, alive) ~ conc Model 2: cbind(dead, alive) ~ conc + I(conc^2) Resid. Df Resid. Dev Df Deviance P(>|Chi|) 1 3 0.37875 2 2 0.19549 1 0.18325 0.6686 anova(modl2,test="Chi") Analysis of Deviance Table Model: binomial, link: logit Response: cbind(dead, alive) Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev P(>|Chi|) NULL 4 64.763 conc 1 64.385 3 0.379 1.024e-15 I(conc^2) 1 0.183 2 0.195 0.6686 residuals(modl,"deviance") residuals(modl,"pearson") residuals(modl,"response") bliss$dead/30 - fitted(modl) residuals(modl,"working") par(mfrow=c(1,3),mar=c(4,4,2,2),oma=c(2,2,2,2)) plot(Species ~ Area, gala) plot(Species ~ log(Area), gala) mu <- predict(modp,type="response") z <- predict(modp)+(gala$Species-mu)/mu plot(z ~ log(Area), gala,ylab="Linearized Response") modpl <- glm(Species ~ log(Area) + log(Elevation) + log(Nearest) + log(Scruz+0.1) + log(Adjacent), family=poisson, gala) c(deviance(modp),deviance(modpl)) mu <- predict(modpl,type="response") u <- (gala$Species-mu)/mu + coef(modpl)[2]*log(gala$Area) par(mfrow=c(1,2),mar=c(4,4,2,2),oma=c(2,2,2,2)) plot(u ~ log(Area), gala,ylab="Partial Residual") abline(0,coef(modpl)[2]) z <- predict(modpl)+(gala$Species-mu)/mu plot(z ~ predict(modpl), xlab="Linear predictor", ylab="Linearized Response") par(mfrow=c(1,2),mar=c(4,4,2,2),oma=c(2,2,2,2)) halfnorm(rstudent(modpl)) gali <- influence(modpl) halfnorm(gali$hat) par(mfrow=c(1,2),mar=c(4,4,2,2),oma=c(2,2,2,2)) halfnorm(cooks.distance(modpl)) plot(gali$coef[,5],ylab="Change in Scruz coef",xlab="Case no.") modplr <- glm(Species ~ log(Area) + log(Elevation) + log(Nearest) + log(Scruz+0.1) + log(Adjacent), family=poisson, gala, subset=-25) cbind(coef(modpl),coef(modplr)) modpla <- glm(Species ~ log(Area)+log(Adjacent), family=poisson, gala) dp <- sum(residuals(modpla,type="pearson")^2)/modpla$df.res summary(modpla,dispersion=dp) scotland.df <- read.table("http://jgill.wustl.edu/data/scotvote.dat",header=TRUE) scottish.vote.glm <- glm((PerYesTax/100) ~ CouncilTax * PerClaimantFemale + StdMortalityRatio + Active + GDP + Percentage5to15, family=Gamma, data=scotland.df) \begin{Verbatim}[formatcom=\color{MyGreen}] committee.dat <- read.table("http://jgill.wustl.edu/data/committee.dat", header=TRUE,col.names=1) attach(committee.dat) committee.out <- glm.nb(BILLS104 ~ SIZE + SUBS * (log(STAFF)) + PRESTIGE + BILLS103) resp <- resid(committee.out,type="response") pears <- resid(committee.out,type="pearson") working <- resid(committee.out,type="working") devs <- resid(committee.out,type="deviance") copper.data <- read.table("http://jgill.wustl.edu/data/copper.dat", header=TRUE) copper.stage1 <- glm(COPPERPRICE ~ INCOMEINDEX + ALUMPRICE + INVENTORYINDEX + log(TIME), family=Gamma) copper.stage2 <- glm(WORLDCONSUMPTION ~ copper.stage1$fitted.values + INCOMEINDEX + ALUMPRICE, family=Gamma) ############################### SETUP ############################################### fluid.scan <- scan("http://jgill.wustl.edu/data/fluid.data",skip=1) fluid.mat <- matrix(fluid.scan,byrow=T,nrow=15459,ncol=21) fluid.mat <- fluid.mat[,1:13] dimnames(fluid.mat)[[2]] <- c("fluidvot","dist2","coalspec","mwcf","factor", "landmark", "freshyr1", "expert2","cj","cjstrat","dissent","unanimou","minority") fluid.factors <- data.frame(fluidvot=fluid.mat[,1],dist2=fluid.mat[,2], coalspec=fluid.mat[,3], mwcf=fluid.mat[,4],complexity=fluid.mat[,5], landmark=fluid.mat[,6],freshyr1=fluid.mat[,7], expert2=fluid.mat[,8], cj=fluid.mat[,9],cjstrat=fluid.mat[,10],dissent=fluid.mat[,11], unanimou=fluid.mat[,12],minority=fluid.mat[,13]) \end{Verbatim} \section*{Case Study: Analyzing Votes on the Supreme Court, code } \begin{Verbatim}[formatcom=\color{MyGreen}] ############################### RUN THE KITCHEN SINK MODEL ########################## kitchen.sink.fit <- glm(fluidvot ~ dist2 + coalspec + mwcf + complexity+ landmark + freshyr1 + expert2 + cj + cjstrat + dissent + unanimou + minority, family=binomial(link=logit),data=fluid.factors) summary.glm(kitchen.sink.fit) Deviance Residuals: Min 1Q Median 3Q Max -2.95255 -0.31468 -0.16436 -0.07645 3.76560 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -3.030356 0.066926 -45.279 < 2e-16 dist2 0.060781 0.001688 36.006 < 2e-16 coalspec -0.458590 0.053830 -8.519 < 2e-16 mwcf -0.336781 0.112812 -2.985 0.00283 complexity 0.112127 0.035784 3.133 0.00173 landmark 0.070948 0.154825 0.458 0.64677 freshyr1 0.266453 0.131643 2.024 0.04296 expert2 -0.152239 0.112687 -1.351 0.17670 cj -0.088581 0.153582 -0.577 0.56410 cjstrat 0.165715 0.275034 0.603 0.54683 dissent 0.382112 0.070001 5.459 4.80e-08 unanimou -0.142704 0.130013 -1.098 0.27238 minority 0.913227 0.184645 4.946 7.58e-07 (Dispersion parameter for binomial family taken to be 1) Null deviance: 8259.0 on 15458 degrees of freedom Residual deviance: 5271.5 on 15446 degrees of freedom AIC: 5297.5 Number of Fisher Scoring iterations: 7 8259.0 - 5271.5 = 2987.5 on df = 12 1-pchisq(2987.5,12) gives 0 \end{Verbatim} \section*{Case Study: Analyzing Votes on the Supreme Court, code } \begin{Verbatim}[formatcom=\color{MyGreen}] anova(kitchen.sink.fit) Analysis of Deviance Table Model: binomial, link: logit Response: fluidvot Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 15458 8259.0 dist2 1 2332.5 15457 5926.6 coalspec 1 50.1 15456 5876.5 mwcf 1 5.5 15455 5871.0 complexity 1 2.5 15454 5868.4 landmark 1 0.01139 15453 5868.4 freshyr1 1 3.8 15452 5864.6 expert2 1 0.3 15451 5864.2 cj 1 4.0 15450 5860.3 cjstrat 1 45.4 15449 5814.9 dissent 1 516.9 15448 5298.1 unanimou 1 1.9 15447 5296.2 minority 1 24.7 15446 5271.5 # BEWARE! CHANGING ORDER SLIGHTLY (dist2 MOVED TO LAST): anova(kitchen.sink.fit2) Df Deviance Resid. Df Resid. Dev NULL 15458 8259.0 coalspec 1 330.2 15457 7928.9 mwcf 1 3.0 15456 7925.9 complexity 1 6.7 15455 7919.2 landmark 1 0.2 15454 7919.0 freshyr1 1 3.2 15453 7915.8 expert2 1 0.1 15452 7915.7 cj 1 1.5 15451 7914.2 cjstrat 1 63.9 15450 7850.3 dissent 1 649.5 15449 7200.8 unanimou 1 27.7 15448 7173.1 minority 1 1.2 15447 7172.0 dist2 1 1900.5 15446 5271.5 correctly.fit.mat[,2][correctly.fit.mat[,2] < 0.5] <- 0 correctly.fit.mat[,2][correctly.fit.mat[,2] > 0.5] <- 1 table(correctly.fit.mat) kitchen.sink.fit.fitted.values fluid.factors.fluidvot 0 1 0 14184 111 1 766 398 (14184 + 398)/(14184 + 111 + 766 + 398) [1] 0.9432693 sum(diag(table(correctly.fit.mat))) / sum(table(correctly.fit.mat)) [1] 0.9432693 correctly.fit.mat <- data.frame(fluid.factors$fluidvot,kitchen.sink.fit$fitted.values) mean(correctly.fit.mat[,2]) [1] 0.07529594 correctly.fit.mat[,2][correctly.fit.mat[,2] < mean(correctly.fit.mat[,2])] <- 0 correctly.fit.mat[,2][correctly.fit.mat[,2] > mean(correctly.fit.mat[,2])] <- 1 table(correctly.fit.mat) kitchen.sink.fit.fitted.values fluid.factors.fluidvot 0 1 0 11549 2746 1 229 935 sum(diag(table(correctly.fit.mat))) / sum(table(correctly.fit.mat)) [1] 0.8075555 fluid.logit.fit1 <- glm(fluidvot ~ dist2 + coalspec + complexity+ freshyr1 + expert2 + dissent + minority + dist2:minority + complexity:minority + dissent:dist2, family=binomial(link=logit),data=fluid.factors) summary.glm(fluid.logit.fit1) Deviance Residuals: Min 1Q Median 3Q Max -4.05747 -0.32187 -0.17542 -0.08674 3.63272 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -3.029126 0.058573 -51.716 < 2e-16 dist2 0.053652 0.002125 25.245 < 2e-16 coalspec -0.508543 0.054160 -9.390 < 2e-16 complexity 0.203519 0.040830 4.985 6.21e-07 freshyr1 0.252072 0.132876 1.897 0.05782 expert2 -0.144012 0.114785 -1.255 0.20961 dissent 0.405871 0.066298 6.122 9.25e-10 minority 0.723628 0.159244 4.544 5.52e-06 dist2:minority -0.020005 0.007179 -2.786 0.00533 complexity:minority -0.254218 0.077374 -3.286 0.00102 dist2:dissent 0.017926 0.003265 5.490 4.01e-08 (Dispersion parameter for binomial family taken to be 1) Null deviance: 8259.0 on 15458 degrees of freedom Residual deviance: 5210.7 on 15448 degrees of freedom AIC: 5232.7 # COMPARE THE ABOVE TO THE KITCHEN SINK MODEL FIT: Null deviance: 8259.0 on 15458 degrees of freedom Residual deviance: 5271.5 on 15446 degrees of freedom AIC: 5297.5 # TEST MODEL IMPROVEMENT: 5271.5 - 5210.7 = 60.8 on df = 2 1-pchisq(60.8,2) gives 6.27276e-14 correctly.fit.mat <- data.frame(fluid.factors$fluidvot,fluid.logit.fit1$fitted.values) correctly.fit.mat[,2][correctly.fit.mat[,2] < 0.5] <- 0 correctly.fit.mat[,2][correctly.fit.mat[,2] > 0.5] <- 1 table(correctly.fit.mat) fluid.logit.fit1.fitted.values fluid.factors.fluidvot 0 1 0 14194 101 1 765 399 sum(diag(table(correctly.fit.mat))) / sum(table(correctly.fit.mat)) [1] 0.9439809 # FROM KITCHEN SINK MODEL: predicted correctly, naive: [1] 0.9432693 # CHI-SQUARE TEST OF PEARSON RESIDUALS: pchisq(sum(residuals(fluid.logit.fit1,type="pearson")^2),15448) [1] 1