pdf("g07.pdf");options(width=64) #setwd("C:\\Users\\kolassa\\Class553") setwd("~/Taught1/960-553/Data")
# Block 1
#*************************************************************/
#*  Prostate data                                            */
#* From Andrews and Herzberg (1985), Data: A Collection of   */
#* Problems from Many Fields for the Student and Research    */
#* Worker, Table 46.  Observations represent subjects in a   */
#* prostate cohort study, randomized to one of four dose     */
#* levels of diethylstilbestrol.  Rx records dose in four    */
#* ordered categories, with 1 being placebo.  Disease stage  */
#* is 3 or 4.  monfol is months of followup.  surv is 0 if   */
#* alive after 50 mo, and codes cause of death otherwise.    */
#* http://lib.stat.cmu.edu/datasets/Andrews/T46.1            */
#* The on-line version of the data set adds 3 fields before  */
#* the first field in the book.  Variables of interest are   */
#* stage, rx, monfol, and surv in fields 5, 6, 10, 11 of the */
#* online version, resp.  Causes of death are given by var-  */
#* ious positive integers in surv; I recode these to 1.  The */
#* data file has more columns than we are interested in.  Put*/
#* place-holding variables in for variables not of interest  */
#* between variables of interest.  Data were previously pub- */
#* lished by Byar and Corle (1977 Chronic Disease) and Byar  */
#* and Green (1980 Bull. Cancer).  Lower value of dichoto-   */
#* mized dose begins with blank to make it alphabetized      */
#* before high.                                              */
#*************************************************************/
# If we omit the ones past where the data of interest stops
# in the "what" list, and use flush=T, R will ignore them. 
prostate<-read.table("T46.1")[,c(5,6,10,11)]
dimnames(prostate)[[2]]<-c("stage","rx","monfol","surv")
prostate$alive<-(prostate$surv==0)+0                      
prostate$dose<-c(" low","high")[1+(prostate$rx>1)]         
stage4<-prostate[prostate$stage==4,]
newdata<-as.data.frame(xtabs(~rx+alive,data=stage4))
summary(m1<-glm(alive~factor(rx),family=binomial, 
   data=stage4))
summary(m2<-glm(alive~rx,family=binomial,             
  weight=Freq,data=newdata))
newestdata1<-newdata[newdata$alive==1,]
newestdata0<-newdata[newdata$alive==0,]
newestdata<-merge(newestdata0,newestdata1,by="rx")
newestdata$prop<-newestdata$Freq.y/( 
   newestdata$Freq.x+ newestdata$Freq.y)
newestdata$n<-newestdata$Freq.x+ newestdata$Freq.y
summary(m3<-glm(prop~rx,weight=n,family=binomial,
   data=newestdata))
# Block 2
#******************************************************/
# Link comparison                                     */
#******************************************************/
uuu<-(0:90)/30; vvv<-cbind(pnorm(uuu),1/(1+exp(-1.702*uuu)))
plot(range(uuu),range(vvv),type="n",ylab="Probability",
   xlab="Linear Predictor",
   main="Comparison of Probit and Logistic Link Functions")
for(jj in 1:2) lines(uuu,vvv[,jj],lty=jj)
legend(1.5,.8,lty=1:2,legend=c("Probit","Logit rescaled"))
#*******************************************************/
#* Probit Regression                                   */
#*******************************************************/
#*************************************************************/
#*  Prostate data                                            */
#* From Andrews and Herzberg (1985), Data: A Collection of   */
#* Problems from Many Fields for the Student and Research    */
#* Worker, Table 46.  Observations represent subjects in a   */
#* prostate cohort study, randomized to one of four dose     */
#* levels of diethylstilbestrol.  Rx records dose in four    */
#* ordered categories, with 1 being placebo.  Disease stage  */
#* is 3 or 4.  monfol is months of followup.  surv is 0 if   */
#* alive after 50 mo, and codes cause of death otherwise.    */
#* http://lib.stat.cmu.edu/datasets/Andrews/T46.1            */
#* The on-line version of the data set adds 3 fields before  */
#* the first field in the book.  Variables of interest are   */
#* stage, rx, monfol, and surv in fields 5, 6, 10, 11 of the */
#* online version, resp.  Causes of death are given by var-  */
#* ious positive integers in surv; I recode these to 1.  The */
#* data file has more columns than we are interested in.  Put*/
#* place-holding variables in for variables not of interest  */
#* between variables of interest.  Data were previously pub- */
#* lished by Byar and Corle (1977 Chronic Disease) and Byar  */
#* and Green (1980 Bull. Cancer).  Lower value of dichoto-   */
#* mized dose begins with blank to make it alphabetized      */
#* before high.                                              */
#*************************************************************/
# If we omit the ones past where the data of interest stops
# in the "what" list, and use flush=T, R will ignore them. 
prostate<-read.table("T46.1")[,c(5,6,10,11)]
dimnames(prostate)[[2]]<-c("stage","rx","monfol","surv")
prostate$alive<-(prostate$surv==0)+0                      
prostate$dose<-c(" low","high")[1+(prostate$rx>1)]         
stage4<-prostate[prostate$stage==4,]
stage4a<-as.data.frame(xtabs(~rx+alive,data=stage4))
summary(glmout<-glm(alive~as.factor(rx),
 family=binomial(link=probit),data=stage4a,weight=Freq))$coef
# Block 3
#************************************************************/
#Results for 2022 NBA season (except for the end of the     */
#playoffs), from http://stats.nba.com/schedule/ .  Use only */
#regular season games.                                      */
#************************************************************/
nba<-read.table("nba2022.dat",head=F,stringsAsFactors=F)
names(nba)<-c("type","v","vs","h","hs")
nba<-nba[nba$type=="(REGULAR)",]
#*************************************************************/
# The model calls for as many regressors as there are compet-*/
# itors compared.  To automate coding of the regressors, give*/
# each team a number. The number will be the rank of each    */
# alphabatized city name.                                    */
#*************************************************************/
count<-table(c(nba$h,nba$v))
ngames<-dim(nba)[1]
allteams<-sort(unique(c(nba$h,nba$v)))
wl<-array(0,c(length(allteams),2))
dimnames(wl)<-list(allteams,c("w","l"))
nba$xmat<-array(0, c(ngames,length(allteams)))
dimnames(nba$xmat)<-list(NULL,allteams)
for(jj in seq(ngames)){
  nba$xmat[jj,nba$h[jj]]<-1
  nba$xmat[jj,nba$v[jj]]<--1
  if(nba$vs[jj]>nba$hs[jj]){
     wl[nba$v[jj],"w"]<-wl[nba$v[jj],"w"]+1
     wl[nba$h[jj],"l"]<-wl[nba$h[jj],"l"]+1
  }
  if(nba$hs[jj]>nba$vs[jj]){
     wl[nba$h[jj],"w"]<-wl[nba$h[jj],"w"]+1
     wl[nba$v[jj],"l"]<-wl[nba$v[jj],"l"]+1
  }
}
#Bradley-Terry model for basketball data.
#*************************************************************/
# Now run the Bradley-Terry model.  The first model is the   */
# basic preference model.  The team with the largest fitted  */
# parameter is judged best by the model.  The team with the  */
# highest integer code is taken as baseline, and all other   */
# team results are relative to that one.                    */
#*************************************************************/
(btout<-glm((hs>vs)~xmat-1,data=nba,family=binomial))
# Compare to raw results
cbind(wl,rank(wl[,"w"]/(wl[,"l"]+wl[,"w"])),rank(btout$coef))
#************************************************************/
# The second model contains an intercept, allowing for home-*/
# field advantage.                                          */
#************************************************************/
glm((hs>vs)~xmat,data=nba,family=binomial)
# Block 4
######################################################
# Arcsine transformaton                              #
######################################################
plot(atan(1)*c(-4,4),c(-1,1),type="n",
   xlab="x",ylab="\\sin(x)")
theta<-(-25:25)/100*8*atan(1)
lines(theta,sin(theta),lty=1, xlab="x",ylab="\\sin(x)")
lines(theta<-(-50:(-25))/100*8*atan(1),sin(theta),type="l",
   xlab="x",ylab="\\sin(x)",lty=2)
lines(theta<-(25:50)/100*8*atan(1),sin(theta),type="l",
   xlab="x",ylab="\\sin(x)",lty=2)
axis(1,at=2*c(-2,-1,1,2)*atan(1),
   labels=c("-\\pi","-\\pi/2","\\pi/2","\\pi"))
plot((0:100)/100, asin(sqrt((0:100)/100)), xlab = "p",
   ylim=c(0,asin(1)),
   ylab = "\\arcsin(\\sqrt{p})",type = "l",xaxs="i",yaxs="i")
axis(4,at=c(30,60,90)/90*asin(1),labels=c(30,60,90))
# Block 5