pdf("g06.pdf");options(width=64) #setwd("C:\\Users\\kolassa\\Class553") setwd("~/Taught1/960-553/Data")
# Block 1 #**************************************************/ #* Multi-Way Analyses */ #**************************************************/ #*************************************************************/ #* Prostate data */ #* From Andrews and Herzberg (1985), Data: A Collection of */ #* Problems from Many Fields for the Student and Research */ #* Worker, Table 46. Observations represent subjects in a */ #* prostate cohort study, randomized to one of four dose */ #* levels of diethylstilbestrol. Rx records dose in four */ #* ordered categories, with 1 being placebo. Disease stage */ #* is 3 or 4. monfol is months of followup. surv is 0 if */ #* alive after 50 mo, and codes cause of death otherwise. */ #* http://lib.stat.cmu.edu/datasets/Andrews/T46.1 */ #* The on-line version of the data set adds 3 fields before */ #* the first field in the book. Variables of interest are */ #* stage, rx, monfol, and surv in fields 5, 6, 10, 11 of the */ #* online version, resp. Causes of death are given by var- */ #* ious positive integers in surv; I recode these to 1. The */ #* data file has more columns than we are interested in. Put*/ #* place-holding variables in for variables not of interest */ #* between variables of interest. Data were previously pub- */ #* lished by Byar and Corle (1977 Chronic Disease) and Byar */ #* and Green (1980 Bull. Cancer). Lower value of dichoto- */ #* mized dose begins with blank to make it alphabetized */ #* before high. */ #*************************************************************/ # If we omit the ones past where the data of interest stops # in the "what" list, and use flush=T, R will ignore them. prostate<-read.table("T46.1")[,c(5,6,10,11)] dimnames(prostate)[[2]]<-c("stage","rx","monfol","surv") prostate$alive<-(prostate$surv==0)+0 prostate$dose<-c(" low","high")[1+(prostate$rx>1)] stage4<-prostate[prostate$stage==4,] #*******************************************************/ #* Test whether survival is the same across treatments.*/ #*******************************************************/ # Run Poisson regression. We need to change this into a data # frame with a variable representing counts. # First fit additive model. newdata<-as.data.frame(xtabs(~rx+alive,data=stage4)) summary(m1<-glm(Freq~rx+alive,family=poisson, data=newdata)) # Now fit model with interactions. m2<-glm(Freq~rx*alive,family=poisson,data=newdata) # Do we need interactions? (anovaout<-anova(m1,m2)) # Calculate p value 1-pchisq(anovaout$Deviance[2],anovaout$Df[2]) # Compare to earlier Pearson test chisq.test(stage4$rx,stage4$alive) #In the above call to glm, rx was treated as categorical, #because, while it was originally numeric, it got changed #to a character string when it got used as the labels for #table rows, and this got carried over into the data frame #created from teh table. If we want this treated as #continuous, convert back to numeric. summary(m3<-glm(Freq~as.numeric(rx)+alive,family=poisson, data=newdata)) #**************************************************************/ #* Shipping Data: McCullagh and Nelder (1989) Generalized */ #* Linear Models provide data on losses by a shipping insurer.*/ #* Data are grouped into putatively homogeneous classes, based*/ #* on ship type, start of construction 5 year period, star of */ #* observation 5 year period, ship months at risk, and count */ #* of losses. Empty categories are removed. */ #**************************************************************/ ships<-read.table("ships.dat", col.names=c("type","built", "period","smar","cases")) #You can save the result of a complicated model in R just as #you can save any other object. Printing the result of a #complicated model in R is generally not informative. glmo<-glm(cases~type+built,family=poisson,data=ships) cat("Result of printing glm output\n") glmo #Instead, print the results of a summary. cat("Result of printing glm summary\n") summary(glmo) summary(glmo1<-glm(cases~type+factor(built)+ offset(log(smar)),family=poisson,data=ships)) summary(glmo2<-glm(cases~type+factor(built)+factor(period)+ offset(log(smar)),family=poisson,data=ships)) # Block 2 #**************************************************/ #* Models with Interaction */ #**************************************************/ summary(glmo3<-glm(cases~type*factor(built)+offset(log(smar)), family=poisson,data=ships)) #Do we need the interactions? (anovaout<-anova(glmo2,glmo3)) # Calculate p value 1-pchisq(anovaout$Deviance[2],anovaout$Df[2]) #**************************************************************/ #* Death Penalty Example */ #* Death penalty data from Radelet, M. (1981), Racial */ #* Characteristics and Imposition of the Death Penalty */ #* American Sociological Review, 46, 918-927. Data reflect */ #* outcomes of murder cases in which race of victim and */ #* principle suspect are known. 1st column is indicator of */ #* relationship between victim and suspect (P for close */ #* relationship, N for not), Race of victim and defendant */ #* (both either B or W; other races deleted), number of cases,*/ #* number of indictments, and number of death sentences. */ #* Referenced by Moore (2000) Basic Practice of Statistics, p.*/ #* Following Moore, we will only use the non-primary cases. */ #* http://stat.rutgers.edu/~kolassa/960-553/floridadeath.dat */ #**************************************************************/ death1<-read.table("floridadeath.dat", col.names=c("rela","rv","rd","nc","ni","nd")) death2<-death1; death1$we<-death1$nd;death1$lc<-"Death" death2$we<-death1$nc-death1$nd;death2$lc<-"Live" death<-rbind(death1,death2) death$rdc<-c("White defend.","Black defend.")[(death$rd=="B")+1] death$rvc<-c("White victim","Black victim")[(death$rv=="B")+1] nonprim<-death[death$rela=="N",] # Fit model ignoring race of victim. The * indicates # interactions, which forces main effects. summary(d1<-glm(we~rdc*lc,data=nonprim,family=poisson)) # Allow for different marginal probability for each race # of victim, but odds ratio must be the same. summary(d2<-glm(we~ rdc*lc+ rdc*rvc+ lc*rvc,data=nonprim, family=poisson)) #In mantelhaen.test last entry in model gives strata, and so #estimated OR measures association betw rdc and lc. Est- #imator is MH estimator. #***********************************************************/ #*Compare OR estimate to exp of rdc*lc interaction in */ #*Poisson regression d2 with all 2-way interactions above. */ #***********************************************************/ mantelhaen.test(mm<-xtabs(we~rdc+lc+rvc,data=nonprim)) # Allow different odds ratio for relationship between race of # the defendant and death penalty for each race of victim. summary(d3<-glm(we~ rdc*lc*rvc,data=nonprim, family=poisson)) #Do we need to let the odds ratio vary by race of victim? (anovaout<-anova(d2,d3)) # Calculate p value 1-pchisq(anovaout$Deviance[2],anovaout$Df[2]) # Do we need to let the odds ratio vary by race of victim? # Compare to deviance test between d2 and d3 above. library(DescTools)#For BreslowDayTest BreslowDayTest(mm) BreslowDayTest(mm,correct=TRUE) # Compare to the Wald test for equality of OR coef(summary(d3))["rdcWhite defend.:lcLive:rvcWhite victim",] # Block 3 #*******************************************************/ #* Proportional Mortality: Fit two Poisson regression */ #* models, with offset. */ #*******************************************************/ #**************************************************************/ #* Montana Nickel Smelter Example */ #* Causes of death for nickel smelters, from Breslow and Day */ #* (1987) Statistical Models in Cancer Research V. II, Inter- */ #* national Agency for Research on Cancer. The study is desc-*/ #* ribled on pp. 349ff. Variables are age group, calendar */ #* period, hire period, arsenic exposure, person-years at */ #* risk, deaths from all causes, deaths from respiratory can- */ #* cer, deaths from all cancers, and deaths from cirulatory */ #* disease. */ #* http://faculty.washington.edu/norm/BDdata/montana4.dat */ #**************************************************************/ montana<-as.data.frame(scan("montana4.dat", what=list(ageg=0,period=0,hire=0,arsenic=0,pyar=0,alld=0, rcd=0,acd=0,cd=0))) montana$lpyar<-log(montana$pyar) summary(glm(rcd~factor(period)+factor(hire)+factor(ageg)+ arsenic+offset(lpyar),family=poisson,data=montana))$coef summary(glm(cd~factor(period)+factor(hire)+factor(ageg)+arsenic+ +offset(lpyar),family=poisson,data=montana))$coef #**************************************************************/ #* Now add the two causes of death, and do prop. mortality. */ #**************************************************************/ summary(glm(cbind(rcd,cd)~factor(period)+factor(hire)+ factor(ageg)+arsenic,family=binomial,data=montana))$coef # Block 4 #*******************************************************/ #* Logistic regregression for 2xK table. */ #*******************************************************/ stage4$frx<-factor(stage4$rx) summary(glm(alive~frx,family=binomial,data=stage4))$coef #* Demonstraton of logistic regression with data grouped.*/ stage4a<-as.data.frame(xtabs(~rx+alive,data=stage4)) stage4a$frx<-factor(stage4a$rx) summary(glm(alive~frx,family=binomial,weight=Freq, data=stage4a))$coef # Block 5 #**************************************************************/ #* Stratified 2x2 tables using logistic regression */ #* Treat death penalty data as two 2x2 tables for lc vs rvc */ #* stratified on rdc, or as two 2x2 tables for lc vs rdc */ #* stratified on rvc. The model without interactions presumes*/ #* the same OR between lc and each of the stratifiers/cov- */ #* variates in both tables, and the coefficient of covariate */ #* is the log of the odds ratio. So the Wald test for each */ #* covariate is an alternative to the Mantel-Hanszel test. */ #* The test of the interaction is an alternative to the Bres- */ #* low-Day test for homogeity of odds ratios. Note infinite */ #* MLE for model with interactions. */ #**************************************************************/ nonprim$surv<-(nonprim$lc=="Live")+0 nonprim$rvf<-factor(nonprim$rvc,labels=c("White","Black")) nonprim$rdf<-factor(nonprim$rdc,labels=c("White","Black")) summary(glm(surv~rvf+rdf, family=binomial,data=nonprim,weight=we))$coef summary(glmout<-glm(surv~rvf*rdf, family=binomial,data=nonprim,weight=we))$coef aov(glmout)