pdf("g07.pdf");options(width=64) #setwd("C:\\Users\\kolassa\\Class553") setwd("~/Taught1/960-553/Data")
# Block 1 #*************************************************************/ #* Prostate data */ #* From Andrews and Herzberg (1985), Data: A Collection of */ #* Problems from Many Fields for the Student and Research */ #* Worker, Table 46. Observations represent subjects in a */ #* prostate cohort study, randomized to one of four dose */ #* levels of diethylstilbestrol. Rx records dose in four */ #* ordered categories, with 1 being placebo. Disease stage */ #* is 3 or 4. monfol is months of followup. surv is 0 if */ #* alive after 50 mo, and codes cause of death otherwise. */ #* http://lib.stat.cmu.edu/datasets/Andrews/T46.1 */ #* The on-line version of the data set adds 3 fields before */ #* the first field in the book. Variables of interest are */ #* stage, rx, monfol, and surv in fields 5, 6, 10, 11 of the */ #* online version, resp. Causes of death are given by var- */ #* ious positive integers in surv; I recode these to 1. The */ #* data file has more columns than we are interested in. Put*/ #* place-holding variables in for variables not of interest */ #* between variables of interest. Data were previously pub- */ #* lished by Byar and Corle (1977 Chronic Disease) and Byar */ #* and Green (1980 Bull. Cancer). Lower value of dichoto- */ #* mized dose begins with blank to make it alphabetized */ #* before high. */ #*************************************************************/ # If we omit the ones past where the data of interest stops # in the "what" list, and use flush=T, R will ignore them. prostate<-read.table("T46.1")[,c(5,6,10,11)] dimnames(prostate)[[2]]<-c("stage","rx","monfol","surv") prostate$alive<-(prostate$surv==0)+0 prostate$dose<-c(" low","high")[1+(prostate$rx>1)] stage4<-prostate[prostate$stage==4,] newdata<-as.data.frame(xtabs(~rx+alive,data=stage4)) summary(m1<-glm(alive~factor(rx),family=binomial, data=stage4)) summary(m2<-glm(alive~rx,family=binomial, weight=Freq,data=newdata)) newestdata1<-newdata[newdata$alive==1,] newestdata0<-newdata[newdata$alive==0,] newestdata<-merge(newestdata0,newestdata1,by="rx") newestdata$prop<-newestdata$Freq.y/( newestdata$Freq.x+ newestdata$Freq.y) newestdata$n<-newestdata$Freq.x+ newestdata$Freq.y summary(m3<-glm(prop~rx,weight=n,family=binomial, data=newestdata)) # Block 2 #******************************************************/ # Link comparison */ #******************************************************/ uuu<-(0:90)/30; vvv<-cbind(pnorm(uuu),1/(1+exp(-1.702*uuu))) plot(range(uuu),range(vvv),type="n",ylab="Probability", xlab="Linear Predictor", main="Comparison of Probit and Logistic Link Functions") for(jj in 1:2) lines(uuu,vvv[,jj],lty=jj) legend(1.5,.8,lty=1:2,legend=c("Probit","Logit rescaled")) #*******************************************************/ #* Probit Regression */ #*******************************************************/ #*************************************************************/ #* Prostate data */ #* From Andrews and Herzberg (1985), Data: A Collection of */ #* Problems from Many Fields for the Student and Research */ #* Worker, Table 46. Observations represent subjects in a */ #* prostate cohort study, randomized to one of four dose */ #* levels of diethylstilbestrol. Rx records dose in four */ #* ordered categories, with 1 being placebo. Disease stage */ #* is 3 or 4. monfol is months of followup. surv is 0 if */ #* alive after 50 mo, and codes cause of death otherwise. */ #* http://lib.stat.cmu.edu/datasets/Andrews/T46.1 */ #* The on-line version of the data set adds 3 fields before */ #* the first field in the book. Variables of interest are */ #* stage, rx, monfol, and surv in fields 5, 6, 10, 11 of the */ #* online version, resp. Causes of death are given by var- */ #* ious positive integers in surv; I recode these to 1. The */ #* data file has more columns than we are interested in. Put*/ #* place-holding variables in for variables not of interest */ #* between variables of interest. Data were previously pub- */ #* lished by Byar and Corle (1977 Chronic Disease) and Byar */ #* and Green (1980 Bull. Cancer). Lower value of dichoto- */ #* mized dose begins with blank to make it alphabetized */ #* before high. */ #*************************************************************/ # If we omit the ones past where the data of interest stops # in the "what" list, and use flush=T, R will ignore them. prostate<-read.table("T46.1")[,c(5,6,10,11)] dimnames(prostate)[[2]]<-c("stage","rx","monfol","surv") prostate$alive<-(prostate$surv==0)+0 prostate$dose<-c(" low","high")[1+(prostate$rx>1)] stage4<-prostate[prostate$stage==4,] stage4a<-as.data.frame(xtabs(~rx+alive,data=stage4)) summary(glmout<-glm(alive~as.factor(rx), family=binomial(link=probit),data=stage4a,weight=Freq))$coef # Block 3 #************************************************************/ #Results for 2022 NBA season (except for the end of the */ #playoffs), from http://stats.nba.com/schedule/ . Use only */ #regular season games. */ #************************************************************/ nba<-read.table("nba2022.dat",head=F,stringsAsFactors=F) names(nba)<-c("type","v","vs","h","hs") nba<-nba[nba$type=="(REGULAR)",] #*************************************************************/ # The model calls for as many regressors as there are compet-*/ # itors compared. To automate coding of the regressors, give*/ # each team a number. The number will be the rank of each */ # alphabatized city name. */ #*************************************************************/ count<-table(c(nba$h,nba$v)) ngames<-dim(nba)[1] allteams<-sort(unique(c(nba$h,nba$v))) wl<-array(0,c(length(allteams),2)) dimnames(wl)<-list(allteams,c("w","l")) nba$xmat<-array(0, c(ngames,length(allteams))) dimnames(nba$xmat)<-list(NULL,allteams) for(jj in seq(ngames)){ nba$xmat[jj,nba$h[jj]]<-1 nba$xmat[jj,nba$v[jj]]<--1 if(nba$vs[jj]>nba$hs[jj]){ wl[nba$v[jj],"w"]<-wl[nba$v[jj],"w"]+1 wl[nba$h[jj],"l"]<-wl[nba$h[jj],"l"]+1 } if(nba$hs[jj]>nba$vs[jj]){ wl[nba$h[jj],"w"]<-wl[nba$h[jj],"w"]+1 wl[nba$v[jj],"l"]<-wl[nba$v[jj],"l"]+1 } } #Bradley-Terry model for basketball data. #*************************************************************/ # Now run the Bradley-Terry model. The first model is the */ # basic preference model. The team with the largest fitted */ # parameter is judged best by the model. The team with the */ # highest integer code is taken as baseline, and all other */ # team results are relative to that one. */ #*************************************************************/ (btout<-glm((hs>vs)~xmat-1,data=nba,family=binomial)) # Compare to raw results cbind(wl,rank(wl[,"w"]/(wl[,"l"]+wl[,"w"])),rank(btout$coef)) #************************************************************/ # The second model contains an intercept, allowing for home-*/ # field advantage. */ #************************************************************/ glm((hs>vs)~xmat,data=nba,family=binomial) # Block 4 ###################################################### # Arcsine transformaton # ###################################################### plot(atan(1)*c(-4,4),c(-1,1),type="n", xlab="x",ylab="\\sin(x)") theta<-(-25:25)/100*8*atan(1) lines(theta,sin(theta),lty=1, xlab="x",ylab="\\sin(x)") lines(theta<-(-50:(-25))/100*8*atan(1),sin(theta),type="l", xlab="x",ylab="\\sin(x)",lty=2) lines(theta<-(25:50)/100*8*atan(1),sin(theta),type="l", xlab="x",ylab="\\sin(x)",lty=2) axis(1,at=2*c(-2,-1,1,2)*atan(1), labels=c("-\\pi","-\\pi/2","\\pi/2","\\pi")) plot((0:100)/100, asin(sqrt((0:100)/100)), xlab = "p", ylim=c(0,asin(1)), ylab = "\\arcsin(\\sqrt{p})",type = "l",xaxs="i",yaxs="i") axis(4,at=c(30,60,90)/90*asin(1),labels=c(30,60,90)) # Block 5