R version 4.2.2 Patched (2022-11-10 r83330) -- "Innocent and Trusting"
Copyright (C) 2022 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> pdf("g10.pdf");options(width=64)
> #setwd("C:\\Users\\kolassa\\Class542")
> setwd("~/Taught1/960-542/Data")
> #************************************************************/
> # Data from http://lib.stat.cmu.edu/datasets/csb/ch14.dat   */
> # Case Studies in Biometry, ch. 14, Interpretation of a     */
> # Leukemia Trial Stopped Early, Emerson, et al.  Consider   */
> # age, treatment arm (0 for D, 1 for I), remission indicator*/
> # and time of remission, transplant indicator and time      */
> # of transplant, end of study time and indicator of death,  */
> # fab (a measure of disease status, -1=>missing), gender    */
> # (1=M,0=F), white blood count (-1=>missing), kscore (a     */
> # measure of disease status), and chemotherapy rounds before*/
> # remission).                                               */
> # See http://lib.stat.cmu.edu/datasets/csb/ch14.txt for     */
> # original variable list.                                   */
> #************************************************************/
> emerson<-read.table("ch14.dat",na.strings=".",
+    stringsAsFactors=FALSE,col.names=c("id","startdate","ttt",
+    "sex","age","fab","kscore","wbc","x3","x4","b1","b2","d2",
+    "crdate","lastdate","statusc","transc","trandate","b6"),
+    colClasses=c(startdate="character",crdate="character",
+       trandate="character",lastdate="character"))
> #Build 0-1 status variable for remission, transplant, and
> #eventual death.
> emerson$crind<-as.numeric(emerson$b2=="Y")
> emerson$transs<-as.numeric(emerson$transc=="Y")
> emerson$status<-as.numeric(emerson$statusc=="D")
> # Replace missingness indicator by standard R NA.
> emerson$fab[emerson$fab==-1]<-NA
> emerson$wbc[emerson$wbc==-1]<-NA
> # Create categorical variable for wbc, kscore
> emerson$wbcl<-1*(emerson$wbc<13.5)
> emerson$ksh<- (emerson$kscore>55)+ (emerson$kscore>85)
> # Replace event times coded as dates to times
> for(j in c("startdate","crdate","lastdate","trandate")) 
+    emerson[[j]]<-as.numeric(
+       as.Date(as.character(emerson[[j]]),"%m%d%y"))
> emerson$crdate[emerson$crind==0]<-emerson$lastdate[
+    emerson$crind==0]
> # Replace times by elapsed times.
> emerson$crtime<-emerson$crdate-emerson$startdate;
> emerson$lasttime<-emerson$lastdate-emerson$startdate;
> emerson$ttime<-emerson$trandate-emerson$startdate;
> #*******************************************/
> # Accelerated life fit with Various errors */
> #*******************************************/
> library(survival)#For survreg
> # The next two models fit different error distributions.  
> # Model time until remission.  Negative products of 
> # parameters and covariates indicate faster progress to
> # remission, and is good.
> summary(f1<-survreg(Surv(crtime,crind)~ttt+kscore,
+   dist="weibull", data=emerson))

Call:
survreg(formula = Surv(crtime, crind) ~ ttt + kscore, data = emerson, 
    dist = "weibull")
               Value Std. Error     z       p
(Intercept)  2.77187    0.78778  3.52 0.00043
tttI        -0.74871    0.25737 -2.91 0.00362
kscore       0.03236    0.00959  3.37 0.00074
Log(scale)   0.17256    0.07980  2.16 0.03058

Scale= 1.19 

Weibull distribution
Loglik(model)= -522.9   Loglik(intercept only)= -532
	Chisq= 18.21 on 2 degrees of freedom, p= 0.00011 
Number of Newton-Raphson Iterations: 5 
n= 130 

> summary(f2<-survreg(Surv(crtime,crind)~ttt+kscore,
+   dist="lognormal", data=emerson))

Call:
survreg(formula = Surv(crtime, crind) ~ ttt + kscore, data = emerson, 
    dist = "lognormal")
               Value Std. Error     z       p
(Intercept)  3.16529    0.70719  4.48 7.6e-06
tttI        -0.63213    0.21081 -3.00  0.0027
kscore       0.01919    0.00872  2.20  0.0277
Log(scale)   0.12077    0.07966  1.52  0.1295

Scale= 1.13 

Log Normal distribution
Loglik(model)= -501.8   Loglik(intercept only)= -508.4
	Chisq= 13.14 on 2 degrees of freedom, p= 0.0014 
Number of Newton-Raphson Iterations: 4 
n= 130 

> # Compare proportional hazards regression.  Positive
> # products of parameters and covariates indicate higher
> # hazard of remission, and is good.
> summary(e1<-coxph(Surv(crtime,crind)~ttt+kscore, data=emerson))
Call:
coxph(formula = Surv(crtime, crind) ~ ttt + kscore, data = emerson)

  n= 130, number of events= 89 

            coef exp(coef)  se(coef)      z Pr(>|z|)   
tttI    0.586731  1.798101  0.216579  2.709  0.00675 **
kscore -0.018840  0.981337  0.009343 -2.016  0.04376 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

       exp(coef) exp(-coef) lower .95 upper .95
tttI      1.7981     0.5561    1.1761    2.7489
kscore    0.9813     1.0190    0.9635    0.9995

Concordance= 0.612  (se = 0.031 )
Likelihood ratio test= 12.56  on 2 df,   p=0.002
Wald test            = 12.82  on 2 df,   p=0.002
Score (logrank) test = 13.13  on 2 df,   p=0.001

> # Model time to death.  Positive products of
> # parameters and covariates indicate slower progress to
> # remission, and is good.
> summary(f3<-survreg(Surv(lasttime,status)~ttt+kscore,
+   dist="weibull", data=emerson))

Call:
survreg(formula = Surv(lasttime, status) ~ ttt + kscore, data = emerson, 
    dist = "weibull")
               Value Std. Error     z      p
(Intercept)  6.47342    0.67897  9.53 <2e-16
tttI         0.55811    0.21371  2.61  0.009
kscore      -0.00188    0.00837 -0.22  0.822
Log(scale)  -0.01764    0.08992 -0.20  0.844

Scale= 0.983 

Weibull distribution
Loglik(model)= -658.7   Loglik(intercept only)= -662.2
	Chisq= 6.94 on 2 degrees of freedom, p= 0.031 
Number of Newton-Raphson Iterations: 5 
n= 130 

> summary(f4<-survreg(Surv(lasttime,status)~ttt+kscore,
+   dist="lognormal", data=emerson))

Call:
survreg(formula = Surv(lasttime, status) ~ ttt + kscore, data = emerson, 
    dist = "lognormal")
              Value Std. Error    z       p
(Intercept) 5.29549    0.92975 5.70 1.2e-08
tttI        0.52440    0.27289 1.92   0.055
kscore      0.00776    0.01150 0.67   0.500
Log(scale)  0.37879    0.07957 4.76 1.9e-06

Scale= 1.46 

Log Normal distribution
Loglik(model)= -664.8   Loglik(intercept only)= -666.8
	Chisq= 4.12 on 2 degrees of freedom, p= 0.13 
Number of Newton-Raphson Iterations: 3 
n= 130 

> # The weibull model has scale near 1, and so the exponential
> # fits almost as well.
> summary(f5<-survreg(Surv(lasttime,status)~factor(ttt)+kscore,
+   dist="exponential", data=emerson))

Call:
survreg(formula = Surv(lasttime, status) ~ factor(ttt) + kscore, 
    data = emerson, dist = "exponential")
                Value Std. Error     z      p
(Intercept)   6.46495    0.68912  9.38 <2e-16
factor(ttt)I  0.56237    0.21640  2.60 0.0094
kscore       -0.00177    0.00849 -0.21 0.8352

Scale fixed at 1 

Exponential distribution
Loglik(model)= -658.7   Loglik(intercept only)= -662.2
	Chisq= 6.94 on 2 degrees of freedom, p= 0.031 
Number of Newton-Raphson Iterations: 4 
n= 130 

> #*******************************************/
> # Estimation of Derived Quantities         */
> #*******************************************/
> # Built-in derived quantities for survreg include time, 
> # quantile, and linear predictor.  See ?predict.survreg
> # for the full list.
> # Predict the median for treatment "D" and kscore 80.
> # Fit the exponential model with treatment and kscore.
> newdata<-data.frame(ttt=factor("D",levels=c("D","I")),kscore=80,
+    lasttime=500,status=1)
> # By analogy with proportional hazards regression, one
> # might expect this to work:
> # predict(f3,type="survival",p=.5,newdata=newdata,se.fit=TRUE)
> # but type="survival" is not supported.  Your choices are
> #    lp, linear or link, all of which give you alpha+beta z,
> #    response, which gives you the exponential of lp, 
> #    quantile, the quantile on the original scale,
> #    uquantile, the quantile on the log scale.
> # In order to calculate the fitted survival probability, 
> # https://stackoverflow.com/questions/65282390/how-to-use-use-predict-for-survreg-models
> # says to use predictSurvProb from ldatools.  This doesn't
> # give standard errors.
> library(PHInfiniteEstimates)#For survregpredict
> survregpredict(f3,newdata,500)
       fit         se 
0.40845117 0.05848334 
> # Now predict other quantities with less trouble.
> predict(f3,type="quantile",p=.5,newdata=newdata,se.fit=TRUE)
$fit
       1 
388.8042 

$se.fit
       1 
55.88913 

> #Get confidence intervals for quantile via est+/-1.96 SE
> f6<-survreg(Surv(lasttime,status)~factor(ttt)+kscore,data=emerson)
> pp<-predict(f6,newdata=newdata,type="lp",se.fit=TRUE)
> pp$acf<-exp(pp$fit); pp$stdacf=pp$acf*pp$se.fit
> pp
$fit
      1 
6.32318 

$se.fit
        1 
0.1404645 

$acf
       1 
557.3423 

$stdacf
       1 
78.28681 

> f2<-coxph(Surv(lasttime,status)~factor(ttt)+kscore,data=emerson)
> predict(f2,newdata=newdata,type="risk",se.fit=TRUE)
$fit
       1 
1.000671 

$se.fit
          1 
0.003829663 

> #*******************************************************/
> # Larynx cancer data from Klein and Moeschberger.  See */
> # class web page for link.  Cancers are graded on an   */
> # integer scale, with increasing numbers indicating    */
> # more severe. Variables are stage, time to death, year*/
> # of diagnosis, and status.                            */
> #*******************************************************/
> larynx<-read.table("larynx.txt",header=TRUE)
> #***********************************************#
> # One-sample parametric diagnostic plot.        #
> #***********************************************#
> bh<-basehaz(coxph(Surv(time,delta)~1,data=larynx,
+    subset=stage==1))
> plot(log(bh$time),log(bh$hazard),xlab="log time",
+    ylab="log hazard", sub="Relation should be straight line",
+    main="Empirical vs. Putative Cumulative Hazards",col=1)
> abline(lm(log(bh$hazard)~log(bh$time)),col=1)
> points(log(bh$time),-qnorm(exp(-bh$hazard)),col=2)
> abline(lm(-qnorm(exp(-bh$hazard))~log(bh$time)),col=2)
> points(log(bh$time),log(exp(bh$hazard)-1),col=3)
> abline(lm(log(exp(bh$hazard)-1)~log(bh$time)),col=3)
> legend(1,-2.5,legend=c("Weibull","Log Normal","Log Logistic"),
+    col=1:3,lty=1)
> #*******************************************************#
> # Counterpart to Anderson plots                         #
> #*******************************************************#
> sf<-survfit(coxph(Surv(time,delta)~strata(stage),data=larynx))
> plot( sf$time, sf$surv, xlab="Time",ylab="Survival",
+    pch=rep(seq(length(sf$strata)),sf$strata),
+    main="Inverse fitted survival curves", 
+    sub="Under AFT should be linear")
> legend(.6,max(sf$time),pch=1:4,legend=paste("Stage",1:4))
> #*************************************************************/
> # Simulations show that accelerated failure model recovers   */
> # the proper distribution.  Simulate random exponential time */
> # time1.  Then time2=time1^2 becomes Weibull with shape      */
> # parameter .5, and time3 becomes Weibull with shape parame- */
> # ter .5 and intercept 1.                                    */
> #*************************************************************/
> set.seed(4457674)
> fake<-list(time=rexp(100),cent=rexp(100),time4=exp(rnorm(100)))
> fake$time1<-apply(cbind(fake$time,fake$cent),1,min)
> fake$status<-fake$time<fake$cent
> fake$time2<-fake$time1^2
> fake$time3<-fake$time2*exp(1)
> fake$status4<-fake$time4>fake$cent
> fake<-as.data.frame(fake)
> cat('Simulated data with intercept 0, scale 1\n')
Simulated data with intercept 0, scale 1
> library(survival)#For predict.survreg
> summary(fit1<-survreg(Surv(time1,status)~1,data=fake))

Call:
survreg(formula = Surv(time1, status) ~ 1, data = fake)
             Value Std. Error     z     p
(Intercept) -0.139      0.122 -1.15 0.252
Log(scale)  -0.192      0.114 -1.69 0.092

Scale= 0.825 

Weibull distribution
Loglik(model)= -42.7   Loglik(intercept only)= -42.7
Number of Newton-Raphson Iterations: 7 
n= 100 

> # Cox and Snell residual for the PH regression.
> d1<-(log(fake$time1)-predict(fit1,type="lp"))/fit1$scale
> n1<-basehaz(coxph(Surv(exp(d1),fake$status)~1))
> plot(n1$time,n1$hazard,main="CS Residuals for Weibull Regression",
+    xlab="Time",ylab="Hazard", sub="Correct Exponential Data")
> abline(a=0,b=1)
> cat('Simulated data with intercept 0, scale 2\n')
Simulated data with intercept 0, scale 2
> summary(fit2<-survreg(Surv(time2,status)~1,data=fake))

Call:
survreg(formula = Surv(time2, status) ~ 1, data = fake)
             Value Std. Error     z       p
(Intercept) -0.279      0.243 -1.15    0.25
Log(scale)   0.501      0.114  4.40 1.1e-05

Scale= 1.65 

Weibull distribution
Loglik(model)= -19.2   Loglik(intercept only)= -19.2
Number of Newton-Raphson Iterations: 7 
n= 100 

> # Cox and Snell residual for the Weibull regression.
> d2<-(log(fake$time2)-predict(fit2,type="lp"))/fit2$scale
> n2<-basehaz(coxph(Surv(exp(d2),fake$status)~1))
> plot(n2$time,n2$hazard,main="CS Residuals for Weibull Regression",
+    xlab="Time",ylab="Hazard", sub="Correct Weibull Data")
> abline(a=0,b=1)
> # Cox and Snell residual for a different Weibull regression.
> cat('Simulated data with intercept 1, scale 2\n')
Simulated data with intercept 1, scale 2
> summary(fit3<-survreg(Surv(time3,status)~1,data=fake))

Call:
survreg(formula = Surv(time3, status) ~ 1, data = fake)
            Value Std. Error    z       p
(Intercept) 0.721      0.243 2.96   0.003
Log(scale)  0.501      0.114 4.40 1.1e-05

Scale= 1.65 

Weibull distribution
Loglik(model)= -67.2   Loglik(intercept only)= -67.2
Number of Newton-Raphson Iterations: 7 
n= 100 

> d3<-(log(fake$time3)-predict(fit3,type="lp"))/fit3$scale
> n3<-basehaz(coxph(Surv(exp(d3),fake$status)~1))
> plot(n3$time,n3$hazard,main="CS Residuals for Weibull Regression",
+    xlab="Time",ylab="Hazard",
+    sub="A different correct Exponential data set")
> abline(a=0,b=1)
> cat('Simulated log normal data\n')
Simulated log normal data
> summary(fit4<-survreg(Surv(time4,status4)~1,data=fake))

Call:
survreg(formula = Surv(time4, status4) ~ 1, data = fake)
              Value Std. Error    z      p
(Intercept)  1.0810     0.1065 10.2 <2e-16
Log(scale)  -0.1566     0.0824 -1.9  0.058

Scale= 0.855 

Weibull distribution
Loglik(model)= -135.8   Loglik(intercept only)= -135.8
Number of Newton-Raphson Iterations: 6 
n= 100 

> # Cox and Snell residual for a Weibull regression,
> # applied to log normal data, which should indicate the 
> # wrong model.
> d4<-(log(fake$time4)-predict(fit4,type="lp"))/fit4$scale
> n4<-basehaz(coxph(Surv(-log(pnorm(-d4)),fake$status4)~1))
> plot(n4$time,n4$hazard,main="CS Residuals for Weibull Regression",
+    xlab="Time",ylab="Hazard",
+    sub="An incorrect log normal data set")
> abline(a=0,b=1)
> cat("Generalized Gamma")
Generalized Gamma> # Library flexsurv has a lot of dependencies.
> library(flexsurv)#For models giving gengamma distribution
> # Fit the parametric survival model with the generalized gamma
> # distribution.  Q is estimated as less than one, and k
> # as something greater than one but not huge, so the best
> # fit is somewhere between log gamma and Weibull.
> flexsurvreg(Surv(time,delta)~age+factor(stage),data=larynx,
+    dist="gengamma")
Call:
flexsurvreg(formula = Surv(time, delta) ~ age + factor(stage), 
    data = larynx, dist = "gengamma")

Estimates: 
                data mean  est      L95%     U95%     se     
mu                   NA     3.4379   1.5925   5.2832   0.9415
sigma                NA     1.1034   0.6982   1.7438   0.2577
Q                    NA     0.4585  -0.6867   1.6037   0.5843
age             64.6111    -0.0176  -0.0441   0.0090   0.0135
factor(stage)2   0.1889    -0.1585  -1.0027   0.6856   0.4307
factor(stage)3   0.3000    -0.7579  -1.5305   0.0148   0.3942
factor(stage)4   0.1444    -1.7295  -2.6089  -0.8501   0.4487
                exp(est)  L95%     U95%   
mu                   NA        NA       NA
sigma                NA        NA       NA
Q                    NA        NA       NA
age              0.9826    0.9568   1.0090
factor(stage)2   0.8534    0.3669   1.9850
factor(stage)3   0.4687    0.2164   1.0149
factor(stage)4   0.1774    0.0736   0.4274

N = 90,  Events: 50,  Censored: 40
Total time at risk: 377.8
Log-likelihood = -141.0771, df = 7
AIC = 296.1541

> flexsurvreg(Surv(time,delta)~age+factor(stage),data=larynx,
+    dist="gengamma.orig")
Call:
flexsurvreg(formula = Surv(time, delta) ~ age + factor(stage), 
    data = larynx, dist = "gengamma.orig")

Estimates: 
                data mean  est        L95%       U95%     
shape                  NA   4.70e-01   3.37e-02   6.54e+00
scale                  NA   1.78e+00   5.95e-08   5.34e+07
k                      NA   3.86e+00   4.59e-02   3.25e+02
age              6.46e+01  -1.77e-02  -4.38e-02   8.50e-03
factor(stage)2   1.89e-01  -1.57e-01  -9.97e-01   6.83e-01
factor(stage)3   3.00e-01  -7.41e-01  -1.51e+00   2.56e-02
factor(stage)4   1.44e-01  -1.71e+00  -2.59e+00  -8.41e-01
                se         exp(est)   L95%       U95%     
shape            6.31e-01         NA         NA         NA
scale            1.57e+01         NA         NA         NA
k                8.74e+00         NA         NA         NA
age              1.34e-02   9.82e-01   9.57e-01   1.01e+00
factor(stage)2   4.29e-01   8.55e-01   3.69e-01   1.98e+00
factor(stage)3   3.91e-01   4.77e-01   2.21e-01   1.03e+00
factor(stage)4   4.46e-01   1.80e-01   7.52e-02   4.31e-01

N = 90,  Events: 50,  Censored: 40
Total time at risk: 377.8
Log-likelihood = -141.0807, df = 7
AIC = 296.1614

> #gengamma.orig, using Stacy parameterization, can give con-
> #vergence issues if the best fit is log normal.
> flexsurvreg(Surv(lasttime,status)~ttt+kscore,data=emerson,
+    dist="gengamma.orig")
Call:
flexsurvreg(formula = Surv(lasttime, status) ~ ttt + kscore, 
    data = emerson, dist = "gengamma.orig")

Estimates: 
        data mean  est        L95%       U95%       se       
shape          NA   9.75e-01   4.52e-01   2.10e+00   3.82e-01
scale          NA   5.71e+02   7.48e+01   4.35e+03   5.92e+02
k              NA   1.06e+00   3.54e-01   3.21e+00   5.99e-01
tttI     5.00e-01   5.56e-01   1.34e-01   9.79e-01   2.16e-01
kscore   7.95e+01  -1.23e-03  -1.78e-02   1.53e-02   8.43e-03
        exp(est)   L95%       U95%     
shape          NA         NA         NA
scale          NA         NA         NA
k              NA         NA         NA
tttI     1.74e+00   1.14e+00   2.66e+00
kscore   9.99e-01   9.82e-01   1.02e+00

N = 130,  Events: 87,  Censored: 43
Total time at risk: 64669
Log-likelihood = -658.6775, df = 5
AIC = 1327.355

> #gengamma uses the Prentice parameterization.
> flexsurvreg(Surv(lasttime,status)~ttt+kscore,data=emerson,
+    dist="gengamma")
Call:
flexsurvreg(formula = Surv(lasttime, status) ~ ttt + kscore, 
    data = emerson, dist = "gengamma")

Estimates: 
        data mean  est       L95%      U95%      se      
mu            NA    6.41103   5.03485   7.78722   0.70215
sigma         NA    0.99407   0.76328   1.29463   0.13399
Q             NA    0.96969   0.43508   1.50430   0.27276
tttI     0.50000    0.55716   0.13482   0.97949   0.21548
kscore  79.53846   -0.00124  -0.01775   0.01527   0.00843
        exp(est)  L95%      U95%    
mu            NA        NA        NA
sigma         NA        NA        NA
Q             NA        NA        NA
tttI     1.74570   1.14433   2.66310
kscore   0.99876   0.98241   1.01539

N = 130,  Events: 87,  Censored: 43
Total time at risk: 64669
Log-likelihood = -658.6774, df = 5
AIC = 1327.355

> flexsurvreg(Surv(time4,status4)~1,data=fake,
+    dist="gengamma.orig")
Call:
flexsurvreg(formula = Surv(time4, status4) ~ 1, data = fake, 
    dist = "gengamma.orig")

Estimates: 
       est       L95%      U95%      se      
shape  2.41e-01  3.82e-02  1.52e+00  2.26e-01
scale  7.80e-06  2.25e-22  2.70e+11  1.52e-04
k      2.01e+01  5.47e-01  7.39e+02  3.70e+01

N = 100,  Events: 67,  Censored: 33
Total time at risk: 191.7829
Log-likelihood = -131.8511, df = 3
AIC = 269.7021

> flexsurvreg(Surv(time4,status4)~1,data=fake,
+    dist="gengamma")
Call:
flexsurvreg(formula = Surv(time4, status4) ~ 1, data = fake, 
    dist = "gengamma")

Estimates: 
       est      L95%     U95%     se     
mu      0.6577   0.2941   1.0213   0.1855
sigma   0.9357   0.7895   1.1091   0.0811
Q       0.1195  -0.5065   0.7454   0.3194

N = 100,  Events: 67,  Censored: 33
Total time at risk: 191.7829
Log-likelihood = -131.7977, df = 3
AIC = 269.5955

>