R version 3.6.3 (2020-02-29) -- "Holding the Windsock"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> pdf("g05.pdf");options(width=64)
> #Examine the parent-child height data.  Data are from
> #https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1
> #Documentation claims that these are formatted for Stata; fortunately R reads 
> #them just fine.  Tell R that the first row contains column names.
> galton<-read.table("galton-stata11.tab",header=TRUE)
> #First examine daughters.
> daughters<-galton[galton$female==1,]
> #Families have multiple children.  Most of what we will do this semester will
> #require observations to be independent, so keep only the first daughter in
> #each family.
> first<-c(TRUE,daughters$family[-1]!=daughters$family[-length(daughters$family)])
> fd<-daughters[first,c("mother","father","height")]
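> # (Aside, not run:) base R's duplicated() offers a more compact way to keep
> # the first daughter in each family, and does not require the rows to be
> # sorted by family:
> #    fd<-daughters[!duplicated(daughters$family),c("mother","father","height")]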
> myfit<-lm(height~mother+father,data=fd)
> summary(myfit)

Call:
lm(formula = height ~ mother + father, data = fd)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.6346 -1.0986 -0.0422  1.3220  4.7772 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 17.10803    5.43661   3.147  0.00196 ** 
mother       0.32877    0.06415   5.125 8.24e-07 ***
father       0.38753    0.05490   7.059 4.43e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.916 on 165 degrees of freedom
Multiple R-squared:  0.3288,	Adjusted R-squared:  0.3207 
F-statistic: 40.41 on 2 and 165 DF,  p-value: 5.19e-15

> #Make a nonsense variable by adding mother and father height and keeping the
> #remainder after dividing by 4.
> biggerset<-fd
> biggerset$nonsense<-(biggerset$mother+biggerset$father)%%4
> biggerfit<-lm(height~mother+father+nonsense,data=biggerset)
> # Calculate R^2 and see that it goes up a tiny amount, while adjusted R^2
> # goes down.  Note that nonsense has a large p value.
> summary(biggerfit)

Call:
lm(formula = height ~ mother + father + nonsense, data = biggerset)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.6073 -1.0756 -0.0367  1.3495  4.8036 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 16.98437    5.50927   3.083  0.00241 ** 
mother       0.32960    0.06456   5.105 9.06e-07 ***
father       0.38900    0.05585   6.965 7.57e-11 ***
nonsense    -0.02044    0.13013  -0.157  0.87535    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.921 on 164 degrees of freedom
Multiple R-squared:  0.3289,	Adjusted R-squared:  0.3166 
F-statistic: 26.79 on 3 and 164 DF,  p-value: 3.728e-14
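
> # (Sketch, not run:) a formal test of whether nonsense improves the fit is
> # the partial F test comparing the two nested models; for a single added
> # variable its p value matches the t test on nonsense above.
> #    anova(myfit,biggerfit)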

> # Determine the region to examine for the confidence regions.  This needs to be
> # larger than the cross of the two separate one-dimensional intervals of the
> # same level, so choose a higher level, .99.
> myfit<-lm(height~mother+father,data=fd,x=TRUE)
> # The line above saves the design matrix as part of myfit.  The matrix
> # includes the column of 1s corresponding to the intercept.
> bnds<-confint(myfit,level=.99)[2:3,]
> # A simple but computationally intensive way to determine the region is to evaluate the
> # statistic on a grid of parameter values and plot level curves.  Set up a 101x101 grid.
> testout<-array(NA,c(101,101))
> Winv<-solve(solve(t(myfit$x)%*%myfit$x)[2:3,2:3])
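> # Winv is the inverse of the 2x2 block of (X'X)^{-1} corresponding to mother
> # and father; the loop below evaluates (gammahat-gamma)' Winv (gammahat-gamma)/2
> # at each grid point, with gammahat the vector of fitted slopes.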
> for(i in 0:100){ 
+    for(j in 0:100) {
+       gammad<-(bnds[,1]+c(i,j)/100*(bnds[,2]-bnds[,1]))-myfit$coef[2:3]
+       testout[i+1,j+1]<-gammad%*%Winv%*%gammad/2
+    }
+ }
> # Even though we calculated the statistic over 10k times, it was still quick.
> # The first two arguments of contour below are vectors of the grid points at
> # which the statistic was calculated.  The third argument is the matrix of
> # results.  The remaining arguments can be in any order.  The levels argument
> # determines where the contour will be drawn; the others give plot titles and
> # axis labels.  Note that the error variance is not divided by two here: the
> # division by the numerator degrees of freedom (2) was already applied when
> # testout was computed.
> contour( bnds[1,1]+(0:100)*diff(bnds[1,])/100, bnds[2,1]+(0:100)*diff(bnds[2,])/100,
+    testout,
+    xlab=names(myfit$coef)[2], ylab=names(myfit$coef)[3],
+    main="Confidence Intervals and Regions for Height Regression Parameters",
+    sub="95% Intervals and Region",
+    levels=summary(myfit)$sigma^2*qf(.95,2,dim(myfit$x)[1]-3))
> # Draw univariate intervals on the plot.
> bnds<-confint(myfit,level=.95)[2:3,]
> abline(v=bnds[1,1],lty=2); abline(v=bnds[1,2],lty=2)
> abline(h=bnds[2,1],lty=2); abline(h=bnds[2,2],lty=2)
> # Give the plot a legend.
> legend(myfit$coef[2],myfit$coef[3],xjust=.5,lty=c(1,2),legend=c(
+    "Simultaneous Region from F test inversion",
+    "Separate Intervals from t test inversion"))
> X<-cbind(1,fd$mother,fd$father)
> #Predict daughter height for a 5'10" (70 in.) mother and a 5'5" (65 in.) father.
> x<-c(1,70,65)
> # The matrix multiplications on the next two lines give 1x1 matrices.
> # We want to treat them as scalars; R treats a scalar as a vector with a
> # single component, so convert the results to vectors.
> myfitted<-as.vector(x%*%myfit$coef)
> sdfit<-sqrt(as.vector(x%*%solve(t(X)%*%X)%*%x))
> print(interval<-myfitted+c(-1,1)*qt(0.975,dim(X)[1]-3)*summary(myfit)$sigma*sdfit)
[1] 64.35238 66.27052
> cat("Confidence Interval for fit\n")
Confidence Interval for fit
> predict(myfit,list(mother=70,father=65),interval="confidence")
       fit      lwr      upr
1 65.31145 64.35238 66.27052
> cat("Prediction Interval for fit\n")
Prediction Interval for fit
> predict(myfit,list(mother=70,father=65),interval="prediction")
       fit     lwr      upr
1 65.31145 61.4094 69.21351
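> # (Sketch, not run:) the prediction interval can be reproduced by hand by
> # adding 1 inside the square root, the extra variance of a new observation
> # on top of the variance of the fitted value:
> #    myfitted+c(-1,1)*qt(0.975,dim(X)[1]-3)*summary(myfit)$sigma*sqrt(1+sdfit^2)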
> attach(fd)
> plot(mother,father)
> detach(fd)
> points(70,65,pch=2)
> legend(58,78,pch=c(1,2),legend=c("Sample Points","Point for Prediction"))
> # Note that the point above is within the observed range for mother's and father's
> # heights separately, but the combination is outside the pattern observed
> # in the data.  This is called hidden extrapolation.  The quantity below,
> # x'(X'X)^{-1}x, is the counterpart of a hat matrix diagonal: it comes from
> # the fitted-value variance calculation, and a large value indicates a point
> # far from the center of the data, relative to the structure of the data set.
> c(1,70,65)%*%solve(t(X)%*%X)%*%c(1,70,65)
           [,1]
[1,] 0.06429469
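> # (Sketch, not run:) a common check for hidden extrapolation is whether this
> # value exceeds the largest leverage among the observed points:
> #    max(hatvalues(myfit))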
> # Simulated example of Simpson's Paradox generalized to continuous variables.
> g<-((1:40)>20)*1;x<-rnorm(40)+g*3;y<-x+.4*rnorm(40)-g*6
> plot(x,y,col=2+g)
> abline(lm(y~x)); for(jj in 0:1) abline(lm(y[g==jj]~x[g==jj]),col=jj+2)
> # A real example of Simpson's Paradox generalized to continuous variables.
> #install.packages("ISLR")
> #I found this example in Ismay, C., and Kim, A.Y. (2019) Statistical Inference via Data Science: A moderndive into R and the tidyverse.
> library(ISLR)# For data set Credit
> lgrp<-.bincode(Credit$Limit,breaks=quantile(Credit$Limit,(0:4)/4))
> plot(Credit$Income,Credit$Balance,col=lgrp+1)
> legend(0,2000,col=2:5,
+    legend=paste(c("Low","Moderate Low","Moderate High","High"),"Limit"),pch=1)
> abline(lm(Balance~Income,data=Credit))
> for(ii in 1:4) abline(lm(Balance~Income,data=Credit[lgrp==ii,]),col=ii+1)
> #A related phenomenon: Lord's Paradox.
> #I found this example in Nickerson and Brown (2019) Emerg Themes Epidemiol 16:5.
> #install.packages("car")
> library(car)# For data set WeightLoss
> #Use these variables from WeightLoss: group, with values "Control", 
> #  "Diet", and "DietEx".  Ignore "Diet" group.
> #       se1 and se3: Self esteem at 1 and 3 mo.
> # Model how self esteem changes.
> summary(lm(se3~(group=="DietEx")+se1,
+    data=WeightLoss[WeightLoss$group!="Diet",]))

Call:
lm(formula = se3 ~ (group == "DietEx") + se1, data = WeightLoss[WeightLoss$group != 
    "Diet", ])

Residuals:
    Min      1Q  Median      3Q     Max 
-4.6440 -0.9588  0.2531  0.7788  2.9075 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)   
(Intercept)             8.4676     3.4875   2.428  0.02529 * 
group == "DietEx"TRUE   2.3148     0.7560   3.062  0.00642 **
se1                     0.4485     0.2340   1.917  0.07038 . 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.748 on 19 degrees of freedom
Multiple R-squared:  0.4408,	Adjusted R-squared:  0.3819 
F-statistic: 7.488 on 2 and 19 DF,  p-value: 0.003999

> summary(lm((se3-se1)~(group=="DietEx"),
+    data=WeightLoss[WeightLoss$group!="Diet",]))

Call:
lm(formula = (se3 - se1) ~ (group == "DietEx"), data = WeightLoss[WeightLoss$group != 
    "Diet", ])

Residuals:
    Min      1Q  Median      3Q     Max 
-5.3333 -1.3333  0.6000  0.6667  3.6667 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)  
(Intercept)             0.3333     0.5593   0.596   0.5578  
group == "DietEx"TRUE   2.0667     0.8295   2.491   0.0216 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.937 on 20 degrees of freedom
Multiple R-squared:  0.2368,	Adjusted R-squared:  0.1987 
F-statistic: 6.207 on 1 and 20 DF,  p-value: 0.02163

> # The model that adjusts for baseline gives an estimate of the group effect
> # that differs from the one obtained by analyzing the change score; this
> # disagreement is Lord's paradox.
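> # (Sketch, not run:) the change-score analysis is the baseline-adjusted
> # model with the se1 coefficient forced to equal 1, as an offset makes
> # explicit:
> #    lm(se3~(group=="DietEx")+offset(se1),data=WeightLoss[WeightLoss$group!="Diet",])
> # Return to the height regression and examine residual plots.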
> myfit<-lm(height~mother+father,data=fd)
> plot(myfit,which=1)
> plot(fd$mother,residuals(myfit))
> plot(fd$father,residuals(myfit))
> # Examine a data set involving prediction of tree volume from girth and height.
> library(datasets)# For data set trees
> pairs(trees)
> #R gives a variety of plots for linear model fits.  Select the plot to draw
> #using the which argument.  Options will be summarized later.
> plot(badfit<-lm(Volume~Girth+Height,data=trees),which=1)
> #The model ought to be Volume = C*Girth^2*Height.  Fit it on the log scale.
> logfit<-lm(log(Volume)~log(Girth)+log(Height),data=trees)
> # There are a variety of plots one could draw.  Start with the residual plot
> # with nonparametric fit superimposed (which=1).  We will learn the 
> # nonparametric fit later this semester, I hope.
> plot(logfit,which=1)
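> # (Sketch, not run:) under the theoretical model the log(Girth) and
> # log(Height) coefficients should be near 2 and 1; a quick check is
> #    confint(logfit)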
> 