R version 4.2.2 Patched (2022-11-10 r83330) -- "Innocent and Trusting"
Copyright (C) 2022 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> pdf("g07.pdf");options(width=64)
> #setwd("C:\\Users\\kolassa\\Class542")
> setwd("~/Taught1/960-542/Data")
> # Example of infinite estimates.  Breast cancer data is a canon-
> # ical example of a data set giving infinite parameter estimates.
> library(coxphf)#For breast data set
> data(breast)
> ?breast
breast                 package:coxphf                  R Documentation

_B_r_e_a_s_t _C_a_n_c_e_r _D_a_t_a _S_e_t

_D_e_s_c_r_i_p_t_i_o_n:

     Provides the breast cancer data set as used by Heinze & Schemper,
     2001.  The data sets contains information on 100 breast cancer
     patients, including: survival time, survival status, Tumor stage,
     Nodal status, Grading, Cathepsin-D tumorexpression

_U_s_a_g_e:

     breast
     
_F_o_r_m_a_t:

     A data frame with 100 observations on the following 6 variables.

     ‘T’ a numeric vector

     ‘N’ a numeric vector

     ‘G’ a numeric vector

     ‘CD’ a numeric vector

     ‘TIME’ a numeric vector

     ‘CENS’ a numeric vector

     ‘pat’ a numeric vector

_R_e_f_e_r_e_n_c_e_s:

     Heinze, G., and Schemper, M. 2001. A solution to the problem of
     monotone likelihood. Biometrics 57(1) pp. 114-119.


> # Package coxphf does not force loading survival.
> library(survival)#For coxph
> fit.breast<-coxph(Surv(TIME,CENS)~T+N+G+CD,data=breast)
> # The estimate for parameter G is very large.  The partial
> # likelihood increases as the value of this parameter
> # increases, and the number here is the value when the
> # Newton Raphson iterations stopped.  Note also the large
> # standard error.
> summary(fit.breast)
Call:
coxph(formula = Surv(TIME, CENS) ~ T + N + G + CD, data = breast)

  n= 100, number of events= 26 

        coef exp(coef)  se(coef)     z Pr(>|z|)  
T  1.279e+00 3.593e+00 5.022e-01 2.547   0.0109 *
N  9.463e-01 2.576e+00 4.251e-01 2.226   0.0260 *
G  1.842e+01 1.001e+08 4.317e+03 0.004   0.9966  
CD 4.001e-01 1.492e+00 4.435e-01 0.902   0.3670  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

   exp(coef) exp(-coef) lower .95 upper .95
T  3.593e+00  2.783e-01    1.3428     9.616
N  2.576e+00  3.882e-01    1.1198     5.926
G  1.001e+08  9.993e-09    0.0000       Inf
CD 1.492e+00  6.703e-01    0.6256     3.558

Concordance= 0.821  (se = 0.033 )
Likelihood ratio test= 39.36  on 4 df,   p=6e-08
Wald test            = 18.89  on 4 df,   p=8e-04
Score (logrank) test = 39.74  on 4 df,   p=5e-08

> # This happens when G is in the model by itself.
> coxph(Surv(TIME,CENS)~G,data=breast)
Call:
coxph(formula = Surv(TIME, CENS) ~ G, data = breast)

       coef exp(coef)  se(coef)     z     p
G 1.961e+01 3.278e+08 5.335e+03 0.004 0.997

Likelihood ratio test=19.01  on 1 df, p=1.299e-05
n= 100, number of events= 26 
> # This arises because all subjects with the event were graded 1.
> # These happened when some graded 0 were still at risk, and so
> # those graded at 1 are prima facie infinitely more at risk.
> plot(breast$TIME,breast$G,pch=breast$CENS)
> legend(0,.5,legend=c("Censored","Event"),pch=0:1)
> #Sketch the likelihood.  Coxph returns a list with one component loglik. 
> #The first component is for the empty model.  The second is for the fit model.
> out<-rep(NA,20)
> for(jj in seq(length(out))) 
+    out[jj]<-coxph( Surv(TIME,CENS)~T+N+CD+offset(G*jj),
+    data=breast)$loglik[2]
> plot(seq(20),out,type="l",main="Profile log likelihood",
+    sub="Breast Cancer Data",xlab="G parameter",ylab="Log Likelihood")
> #Focus on the higher vaules
> for(jj in seq(length(out))) 
+    out[jj]<-coxph( Surv(TIME,CENS)~T+N+CD+offset(G*(19+jj/20)),
+    data=breast)$loglik[2]
> plot(19+seq(20)/20,out,type="l",main="Profile log likelihood",
+    sub="Breast Cancer Data",xlab="G parameter",ylab="Log Likelihood")
> #As a second example, consider the larynx cancer data, this time with
> #age and diagnosis year added.
> #*******************************************************/
> # Larynx cancer data from Klelin and Moeschberger. See */
> # class web page for link.  Cancers are graded on an   */
> # integer scale, with increasing numbers indicating    */
> # more sever.  Variables are stage, time to death, year*/
> # of diagnosis, and status.                            */
> #*******************************************************/
> larynx<-read.table("larynx.txt",header=TRUE)
> # Add age
> coxph(Surv(time,delta)~age+factor(stage),data=larynx)
Call:
coxph(formula = Surv(time, delta) ~ age + factor(stage), data = larynx)

                  coef exp(coef) se(coef)     z        p
age            0.01903   1.01921  0.01426 1.335   0.1820
factor(stage)2 0.14004   1.15032  0.46249 0.303   0.7620
factor(stage)3 0.64238   1.90100  0.35611 1.804   0.0712
factor(stage)4 1.70598   5.50678  0.42191 4.043 5.27e-05

Likelihood ratio test=18.31  on 4 df, p=0.001072
n= 90, number of events= 50 
> # Age is non-signifiant, but close to significant.  Notably, the
> # coefficient of age is very small.  That's because the age coefficient
> # reflects the change log hazard as age changes by one year, which is
> # a small change.  Multiply by 10 to get the change in hazard as one 
> # ages a decade.
> # Now apply these techniques to a smaller data set, and investigating
> # a smaller data set.
> coxph(Surv(time,delta)~factor(stage),data=larynx,subset=age<60)
Call:
coxph(formula = Surv(time, delta) ~ factor(stage), data = larynx, 
    subset = age < 60)

                     coef  exp(coef)   se(coef)      z     p
factor(stage)2 -1.909e+01  5.112e-09  9.953e+03 -0.002 0.998
factor(stage)3  5.843e-01  1.794e+00  6.064e-01  0.964 0.335
factor(stage)4  5.372e-01  1.711e+00  1.102e+00  0.488 0.626

Likelihood ratio test=5.77  on 3 df, p=0.1235
n= 28, number of events= 12 
> # Stage 2 appears to have a strong protective effect relative to stages
> # 3 and 4, which seems reasonable.  It also appears to have a strong 
> # protective effect relative to stage 1, the baseline, which does not
> # seem reasonable.
> # coxphf maximizes the posterior under Jeffreys prior,
> # rather than the partial likelihood by itself.
> library(coxphf)#For coxphf
> data(breast)
> summary(coxphf(Surv(TIME,CENS)~T+N+G+CD, data=breast))
coxphf(formula = Surv(TIME, CENS) ~ T + N + G + CD, data = breast)

Model fitted by Penalized ML
Confidence intervals and p-values by Profile Likelihood 

        coef  se(coef) exp(coef) lower 0.95  upper 0.95
T  1.2244388 0.4916044  3.402256  1.3627461    9.472184
N  0.9188882 0.4225734  2.506502  1.1204552    5.832863
G  2.4244141 1.4735463 11.295610  1.4656675 1451.945938
CD 0.3971181 0.4418554  1.487532  0.6268672    3.511784
      Chisq           p
T  6.983773 0.008225204
N  5.004409 0.025282830
G  6.090654 0.013589876
CD 0.822321 0.364502440

Likelihood ratio test=35.95142 on 4 df, p=2.961054e-07, n=100
Wald test = 23.20007 on 4 df, p = 0.000115489

Covariance-Matrix:
             T           N           G          CD
T   0.24167491 -0.00079123 -0.06806805 -0.07374037
N  -0.00079123  0.17856824 -0.04416386 -0.05117879
G  -0.06806805 -0.04416386  2.17133881 -0.02606956
CD -0.07374037 -0.05117879 -0.02606956  0.19523619
> #*************************************************************/
> #Explore model selection techniques.                         */
> #*************************************************************/
> #*******************************************************/
> # Larynx cancer data from Klelin and Moeschberger. See */
> # class web page for link.  Cancers are graded on an   */
> # integer scale, with increasing numbers indicating    */
> # more sever.  Variables are stage, time to death, year*/
> # of diagnosis, and status.                            */
> #*******************************************************/
> larynx<-read.table("larynx.txt",header=TRUE)
> larynx$fstage<-factor(larynx$stage)
> (fit<-coxph(Surv(time,delta)~age+diagyr+fstage,data=larynx))
Call:
coxph(formula = Surv(time, delta) ~ age + diagyr + fstage, data = larynx)

            coef exp(coef) se(coef)      z        p
age      0.01869   1.01887  0.01433  1.304   0.1922
diagyr  -0.01819   0.98198  0.07646 -0.238   0.8120
fstage2  0.15164   1.16375  0.46481  0.326   0.7442
fstage3  0.64473   1.90546  0.35619  1.810   0.0703
fstage4  1.73211   5.65255  0.43596  3.973 7.09e-05

Likelihood ratio test=18.37  on 5 df, p=0.002518
n= 90, number of events= 50 
> #*************************************************************/
> #Find the best model among those with age, diagyr, and stage.*/
> #*************************************************************/
> cat('\n Stepwise model selection  \n')

 Stepwise model selection  
> cat('Stepwise model selection\n')
Stepwise model selection
> #Step applies backwards stepwise using AICR to model fit.
> #R throws a warning here when trying to evaluate AIC at  
> #the null model.                                         
> step(fit)
Start:  AIC=385.36
Surv(time, delta) ~ age + diagyr + fstage

         Df    AIC
- diagyr  1 383.41
- age     1 385.10
      385.36
- fstage  3 394.87

Step:  AIC=383.41
Surv(time, delta) ~ age + fstage

         Df    AIC
- age     1 383.24
      383.41
- fstage  3 393.10

Step:  AIC=383.24
Surv(time, delta) ~ fstage

         Df    AIC
      383.24
- fstage  3 393.73
Call:
coxph(formula = Surv(time, delta) ~ fstage, data = larynx)

           coef exp(coef) se(coef)     z        p
fstage2 0.06481   1.06696  0.45843 0.141   0.8876
fstage3 0.61481   1.84930  0.35519 1.731   0.0835
fstage4 1.73490   5.66838  0.41939 4.137 3.52e-05

Likelihood ratio test=16.49  on 3 df, p=0.0009016
n= 90, number of events= 50 
> #One could do this in a non-automated way by doing
> extractAIC(fit)
[1]   5.0000 385.3583
> extractAIC(coxph(Surv(time,delta)~diagyr+fstage, data=larynx))
[1]   4.0000 385.0995
> extractAIC(coxph(Surv(time,delta)~age+fstage, data=larynx))
[1]   4.0000 383.4147
> extractAIC(coxph(Surv(time,delta)~age+diagyr, data=larynx))
[1]   2.0000 394.8735
> extractAIC(coxph(Surv(time,delta)~fstage, data=larynx))
[1]   3.0000 383.2416
> extractAIC(coxph(Surv(time,delta)~age, data=larynx))
[1]   1.0000 393.0956
> extractAIC(coxph(Surv(time,delta)~diagyr, data=larynx))
[1]   1.0000 395.6327
> extractAIC(coxph(Surv(time,delta)~1, data=larynx))
[1]   0.000 393.727
> #**********************************************************/
> # "Baseline" hazard function.                             */
> # Calculating hazard function for a user-chosen covariate */
> #pattern chosen by user, for an individual 50 years old,  */
> #in 1979, stage 1.  Year is defined as years since 1974.  */
> #**********************************************************/
> cat('\n Using whole data set, with a particular cumulative H\n')

 Using whole data set, with a particular cumulative H
> friend<-data.frame(age=50,diagyr=5,
+   fstage=factor(1,levels= c("1","2","3","4")))
> fit<-coxph(Surv(time,delta)~age+diagyr+fstage,data=larynx)
> plot(survfit(fit,newdata=friend),xlab="Survival",ylab="Time",
+  main="Fitted Survival Curve for Stage 1 50 yo in 1979")
> #***********************************************************/
> # Late entry.  Correct and incorrect survival fits         */
> # are on the same plot.                                    */
> #***********************************************************/
> #**********************************************************/
> # Data excerpted from NIH Multi Center AIDS Cohort Study. */
> # Cohort consists of gay and bisexual men.  Variables are */
> # health: initial health report                           */
> # life: finer self-reported happiness measure,            */
> # happy: self-reported level of happiness at enrollment   */
> # status: 1 for death, 0 ow,                              */
> # entry: age at entry,                                    */
> # time: time to death or loss to followup                 */
> # aaids: age at development of aids.                      */
> #**********************************************************/
> mcac<-read.table("mcac.dat")
> names(mcac)<- c("health","life","happy","status","entry",
+    "time","aaids")
> plot(survfit(coxph(Surv(time,status)~1,data=mcac),conf.int=FALSE),
+   lty=1,main="MCAC Study Results",xlab="Survival Time (Months)")
> # 202 of the individuals in this study are lost to followup 
> # immediately after enrollment.  R requires deleting them.
> cleanmcac<-mcac[mcac$entry #Both require a plot area of the same size, so it doesn't
> #matter which we do first.  The calls to plot and lines below
> #actually call plot.survfit and lines.survfit.  In plot.survfit,
> #conf.int and mark.time defaults are true.  They are false in
> #lines.survfit.  To get both lines in the same format, turn
> #these features off in plot.
> lines(survfit(coxph(Surv(entry,time,status)~1,data=cleanmcac),
+  conf.int=FALSE),lty=2)
> legend(10,.2,legend=c("Ignoring delayed entry",
+   "Accounting for Delayed Entry"),lty=1:2)
> coxph(Surv(time,status)~happy,data=mcac)
Call:
coxph(formula = Surv(time, status) ~ happy, data = mcac)

          coef exp(coef) se(coef)      z      p
happy -0.08229   0.92101  0.04158 -1.979 0.0478

Likelihood ratio test=3.92  on 1 df, p=0.04767
n= 5622, number of events= 1647 
> coxph(Surv(entry,time,status)~happy,data=cleanmcac)
Call:
coxph(formula = Surv(entry, time, status) ~ happy, data = cleanmcac)

          coef exp(coef) se(coef)      z      p
happy -0.10547   0.89990  0.04124 -2.558 0.0105

Likelihood ratio test=6.55  on 1 df, p=0.01046
n= 5420, number of events= 1647 
> #Here is code showing that splitting lines
> #leaves the fit unchanged.  Split each line with time
> #greater than 5 at 5.
> firstlarynx<-larynx
> firstlarynx$start<-0
> firstlarynx[firstlarynx$time>5,"delta"]<-0
> firstlarynx[firstlarynx$time>5,"time"]<-5
> secondlarynx<-larynx[larynx$time>5,]
> secondlarynx$start<-5
> newlarynx<-rbind(firstlarynx,secondlarynx)
> (fit1<-coxph(Surv(time,delta)~stage,data=larynx))
Call:
coxph(formula = Surv(time, delta) ~ stage, data = larynx)

        coef exp(coef) se(coef)     z        p
stage 0.5088    1.6633   0.1412 3.604 0.000313

Likelihood ratio test=13.26  on 1 df, p=0.0002706
n= 90, number of events= 50 
> (fit2<-coxph(Surv(start,time,delta)~stage,data=newlarynx))
Call:
coxph(formula = Surv(start, time, delta) ~ stage, data = newlarynx)

        coef exp(coef) se(coef)     z        p
stage 0.5088    1.6633   0.1412 3.604 0.000313

Likelihood ratio test=13.26  on 1 df, p=0.0002706
n= 121, number of events= 50 
>