pdf("g06.pdf");options(width=64) #setwd("C:\\Users\\kolassa\\Class553") setwd("~/Taught1/960-553/Data")
# Block 1
#**************************************************/
#* Multi-Way Analyses                             */
#**************************************************/
#*************************************************************/
#*  Prostate data                                            */
#* From Andrews and Herzberg (1985), Data: A Collection of   */
#* Problems from Many Fields for the Student and Research    */
#* Worker, Table 46.  Observations represent subjects in a   */
#* prostate cohort study, randomized to one of four dose     */
#* levels of diethylstilbestrol.  Rx records dose in four    */
#* ordered categories, with 1 being placebo.  Disease stage  */
#* is 3 or 4.  monfol is months of followup.  surv is 0 if   */
#* alive after 50 mo, and codes cause of death otherwise.    */
#* http://lib.stat.cmu.edu/datasets/Andrews/T46.1            */
#* The on-line version of the data set adds 3 fields before  */
#* the first field in the book.  Variables of interest are   */
#* stage, rx, monfol, and surv in fields 5, 6, 10, 11 of the */
#* online version, resp.  Causes of death are given by var-  */
#* ious positive integers in surv; I recode these to 1.  The */
#* data file has more columns than we are interested in.  Put*/
#* place-holding variables in for variables not of interest  */
#* between variables of interest.  Data were previously pub- */
#* lished by Byar and Corle (1977 Chronic Disease) and Byar  */
#* and Green (1980 Bull. Cancer).  Lower value of dichoto-   */
#* mized dose begins with blank to make it alphabetized      */
#* before high.                                              */
#*************************************************************/
# If we omit the ones past where the data of interest stops
# in the "what" list, and use flush=T, R will ignore them. 
prostate<-read.table("T46.1")[,c(5,6,10,11)]
dimnames(prostate)[[2]]<-c("stage","rx","monfol","surv")
prostate$alive<-(prostate$surv==0)+0                      
prostate$dose<-c(" low","high")[1+(prostate$rx>1)]         
stage4<-prostate[prostate$stage==4,]
#*******************************************************/
#* Test whether survival is the same across treatments.*/
#*******************************************************/
# Run Poisson regression.  We need to change this into a data
# frame with a variable representing counts.  
# First fit additive model.
newdata<-as.data.frame(xtabs(~rx+alive,data=stage4))
summary(m1<-glm(Freq~rx+alive,family=poisson, data=newdata))
# Now fit model with interactions.
m2<-glm(Freq~rx*alive,family=poisson,data=newdata)
# Do we need interactions?
(anovaout<-anova(m1,m2))
# Calculate p value
1-pchisq(anovaout$Deviance[2],anovaout$Df[2])
# Compare to earlier Pearson test
chisq.test(stage4$rx,stage4$alive)
#In the above call to glm, rx was treated as categorical, 
#because, while it was originally numeric, it got changed
#to a character string when it got used as the labels for
#table rows, and this got carried over into the data frame
#created from teh table.  If we want this treated as
#continuous, convert back to numeric.
summary(m3<-glm(Freq~as.numeric(rx)+alive,family=poisson, 
   data=newdata))
#**************************************************************/
#* Shipping Data: McCullagh and Nelder (1989) Generalized     */
#* Linear Models provide data on losses by a shipping insurer.*/
#* Data are grouped into putatively homogeneous classes, based*/
#* on ship type, start of construction 5 year period, star of */
#* observation 5 year period, ship months at risk, and count  */
#* of losses.  Empty categories are removed.                  */
#**************************************************************/
ships<-read.table("ships.dat",
   col.names=c("type","built", "period","smar","cases"))
#You can save the result of a complicated model in R just as
#you can save any other object.  Printing the result of a 
#complicated model in R is generally not informative.
glmo<-glm(cases~type+built,family=poisson,data=ships)
cat("Result of printing glm output\n")
glmo
#Instead, print the results of a summary.
cat("Result of printing glm summary\n")
summary(glmo)
summary(glmo1<-glm(cases~type+factor(built)+
   offset(log(smar)),family=poisson,data=ships))
summary(glmo2<-glm(cases~type+factor(built)+factor(period)+
   offset(log(smar)),family=poisson,data=ships))
# Block 2
#**************************************************/
#* Models with Interaction                        */
#**************************************************/
summary(glmo3<-glm(cases~type*factor(built)+offset(log(smar)),
   family=poisson,data=ships))
#Do we need the interactions? 
(anovaout<-anova(glmo2,glmo3))
# Calculate p value
1-pchisq(anovaout$Deviance[2],anovaout$Df[2])
#**************************************************************/
#*          Death Penalty Example                             */
#* Death penalty data from Radelet, M. (1981), Racial         */
#* Characteristics and Imposition of the Death Penalty        */
#* American Sociological Review, 46, 918-927. Data reflect    */
#* outcomes of murder cases in which race of victim and       */
#* principle suspect are known.  1st column is indicator of   */
#* relationship between victim and suspect (P for close       */
#* relationship, N for not), Race of victim and defendant     */
#* (both either B or W; other races deleted), number of cases,*/
#* number of indictments, and number of death sentences.      */
#* Referenced by Moore (2000) Basic Practice of Statistics, p.*/
#* Following Moore, we will only use the non-primary cases.   */
#* http://stat.rutgers.edu/~kolassa/960-553/floridadeath.dat  */
#**************************************************************/
death1<-read.table("floridadeath.dat",
   col.names=c("rela","rv","rd","nc","ni","nd"))
death2<-death1; death1$we<-death1$nd;death1$lc<-"Death" 
death2$we<-death1$nc-death1$nd;death2$lc<-"Live"
death<-rbind(death1,death2)
death$rdc<-c("White defend.","Black defend.")[(death$rd=="B")+1]
death$rvc<-c("White victim","Black victim")[(death$rv=="B")+1]
nonprim<-death[death$rela=="N",]                       
# Fit model ignoring race of victim.  The * indicates 
# interactions, which forces main effects.
summary(d1<-glm(we~rdc*lc,data=nonprim,family=poisson))
# Allow for different marginal probability for each race
# of victim, but odds ratio must be the same.
summary(d2<-glm(we~ rdc*lc+ rdc*rvc+ lc*rvc,data=nonprim,
   family=poisson))
#In mantelhaen.test last entry in model gives strata, and so
#estimated OR measures association betw rdc and lc.  Est-   
#imator is MH estimator.                                    
#***********************************************************/
#*Compare OR estimate to exp of rdc*lc interaction in      */
#*Poisson regression d2 with all 2-way interactions above. */
#***********************************************************/
mantelhaen.test(mm<-xtabs(we~rdc+lc+rvc,data=nonprim))
# Allow different odds ratio for relationship between race of
# the defendant and death penalty for each race of victim.
summary(d3<-glm(we~ rdc*lc*rvc,data=nonprim,
   family=poisson))
#Do we need to let the odds ratio vary by race of victim?
(anovaout<-anova(d2,d3))
# Calculate p value
1-pchisq(anovaout$Deviance[2],anovaout$Df[2])
# Do we need to let the odds ratio vary by race of victim?
# Compare to deviance test between d2 and d3 above.
library(DescTools)#For BreslowDayTest
BreslowDayTest(mm)
BreslowDayTest(mm,correct=TRUE)
# Compare to the Wald test for equality of OR
coef(summary(d3))["rdcWhite defend.:lcLive:rvcWhite victim",]
# Block 3
#*******************************************************/
#* Proportional Mortality: Fit two Poisson regression  */
#* models, with offset.                                */
#*******************************************************/
#**************************************************************/
#*          Montana Nickel Smelter Example                    */
#* Causes of death for nickel smelters, from Breslow and Day  */
#* (1987) Statistical Models in Cancer Research V. II, Inter- */
#* national Agency for Research on Cancer.  The study is desc-*/
#* ribled on pp. 349ff.  Variables are age group, calendar    */
#* period, hire period, arsenic exposure, person-years at     */
#* risk, deaths from all causes, deaths from respiratory can- */
#* cer, deaths from all cancers, and deaths from cirulatory   */
#* disease.                                                   */
#* http://faculty.washington.edu/norm/BDdata/montana4.dat     */
#**************************************************************/
montana<-as.data.frame(scan("montana4.dat",
   what=list(ageg=0,period=0,hire=0,arsenic=0,pyar=0,alld=0,
      rcd=0,acd=0,cd=0)))
montana$lpyar<-log(montana$pyar)
summary(glm(rcd~factor(period)+factor(hire)+factor(ageg)+
   arsenic+offset(lpyar),family=poisson,data=montana))$coef
summary(glm(cd~factor(period)+factor(hire)+factor(ageg)+arsenic+
   +offset(lpyar),family=poisson,data=montana))$coef
#**************************************************************/
#* Now add the two causes of death, and do prop. mortality.   */
#**************************************************************/
summary(glm(cbind(rcd,cd)~factor(period)+factor(hire)+
   factor(ageg)+arsenic,family=binomial,data=montana))$coef
# Block 4
#*******************************************************/
#* Logistic regregression for 2xK table.               */
#*******************************************************/
stage4$frx<-factor(stage4$rx)
summary(glm(alive~frx,family=binomial,data=stage4))$coef
#* Demonstraton of logistic regression with data grouped.*/
stage4a<-as.data.frame(xtabs(~rx+alive,data=stage4))
stage4a$frx<-factor(stage4a$rx)
summary(glm(alive~frx,family=binomial,weight=Freq,
   data=stage4a))$coef
# Block 5
#**************************************************************/
#* Stratified 2x2 tables using logistic regression            */
#* Treat death penalty data as two 2x2 tables for lc vs rvc   */
#* stratified on rdc, or as two 2x2 tables for lc vs rdc      */
#* stratified on rvc.  The model without interactions presumes*/
#* the same OR between lc and each of the stratifiers/cov-    */
#* variates in both tables, and the coefficient of covariate  */
#* is the log of the odds ratio.  So the Wald test for each   */
#* covariate is an alternative to the Mantel-Hanszel test.    */
#* The test of the interaction is an alternative to the Bres- */
#* low-Day test for homogeity of odds ratios.  Note infinite  */
#* MLE for model with interactions.                           */
#**************************************************************/
nonprim$surv<-(nonprim$lc=="Live")+0
nonprim$rvf<-factor(nonprim$rvc,labels=c("White","Black"))
nonprim$rdf<-factor(nonprim$rdc,labels=c("White","Black"))
summary(glm(surv~rvf+rdf,
   family=binomial,data=nonprim,weight=we))$coef
summary(glmout<-glm(surv~rvf*rdf,
   family=binomial,data=nonprim,weight=we))$coef
aov(glmout)