R version 3.6.3 (2020-02-29) -- "Holding the Windsock"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> pdf("g11.pdf");options(width=64)
> #Examine the parent-child height data.  Data are from
> #https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1
> #Documentation claims that these are formatted for Stata; fortunately R reads 
> #them just fine.  Tell R that the first row contains column names.
> galton<-read.table("galton-stata11.tab",header=TRUE)
> #First examine daughters.
> daughters<-galton[galton$female==1,]
> #Families have multiple children.  Most of what we will do this semester will
> #require observations to be independent, and so keep only the first daughter.
> first<-c(TRUE,daughters$family[-1]!=daughters$family[-length(daughters$family)])
> fd<-daughters[first,c("mother","father","height")]
> fd$mcmround<-round(fd$mother*2.54)
> summary(colinfit<-lm(height~mother+mcmround,data=fd,x=TRUE))

Call:
lm(formula = height ~ mother + mcmround, data = fd, x = TRUE)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.7377 -1.2562  0.1006  1.3926  5.7623 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  43.3121     4.7518   9.115 2.59e-16 ***
mother       -1.4345     1.5073  -0.952    0.343    
mcmround      0.6988     0.5874   1.190    0.236    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.177 on 165 degrees of freedom
Multiple R-squared:  0.1335,	Adjusted R-squared:  0.123 
F-statistic: 12.71 on 2 and 165 DF,  p-value: 7.316e-06

> # Examine variance inflation factor.  As far as I can tell, VIF
> # from package regclass gives the same results as vif from
> # package car.
> # First install via install.packages("regclass")
> library(car)# For vif
> vif(colinfit)
  mother mcmround 
429.3064 429.3064 
> cor(colinfit$x[,-1])
            mother  mcmround
mother   1.0000000 0.9988347
mcmround 0.9988347 1.0000000
> # Exhibit residual standard deviation fit.
> # I will use squareds later as well.
> fd$m2<-fd$mother^2
> fd$f2<-fd$father^2
> summary(quadfit<-lm(height~mother+m2+father+f2,data=fd,x=TRUE))

Call:
lm(formula = height ~ mother + m2 + father + f2, data = fd, x = TRUE)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.6064 -1.0835 -0.0299  1.2899  4.8083 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  3.936e+01  1.022e+02   0.385    0.701
mother      -4.687e-01  2.521e+00  -0.186    0.853
m2           6.241e-03  1.972e-02   0.317    0.752
father       4.807e-01  1.863e+00   0.258    0.797
f2          -6.792e-04  1.341e-02  -0.051    0.960

Residual standard error: 1.927 on 163 degrees of freedom
Multiple R-squared:  0.3292,	Adjusted R-squared:  0.3128 
F-statistic:    20 on 4 and 163 DF,  p-value: 2.045e-13

> vif(quadfit)
  mother       m2   father       f2 
1532.425 1532.938 1142.701 1143.185 
> cor(quadfit$x[,-1])
           mother         m2     father         f2
mother 1.00000000 0.99967200 0.06125154 0.06372421
m2     0.99967200 1.00000000 0.06295853 0.06545140
father 0.06125154 0.06295853 1.00000000 0.99955902
f2     0.06372421 0.06545140 0.99955902 1.00000000
> # Columns of the vectors component are eigenvectors.
> eigen(t(colinfit$x)%*%colinfit$x)
eigen() decomposition
$values
[1] 5.150294e+06 1.923955e+00 2.083728e-01

$vectors
             [,1]        [,2]        [,3]
[1,] -0.005707583 -0.08717905  0.99617631
[2,] -0.366392019 -0.92673322 -0.08320107
[3,] -0.930443073  0.36546593  0.02665230

> eigen(t(quadfit$x)%*%quadfit$x)
eigen() decomposition
$values
[1] 6.790807e+09 1.720933e+07 1.107003e+03 7.871421e-01
[5] 3.549838e-04

$vectors
              [,1]          [,2]         [,3]         [,4]
[1,] -0.0001570435 -3.876817e-05 -0.021134452 -0.006459057
[2,] -0.0100800322 -7.201644e-03 -0.676300632 -0.736275000
[3,] -0.6478353977 -7.616720e-01  0.011477531  0.005772841
[4,] -0.0109283888  3.327806e-03 -0.736168384  0.676609439
[5,] -0.7616352382  6.479142e-01  0.009755372 -0.004872952
              [,5]
[1,]  9.997558e-01
[2,] -1.905540e-02
[3,]  1.486278e-04
[4,] -1.119258e-02
[5,]  8.022798e-05

> # Can often better address this if covariates are rescaled
> ss<-apply(quadfit$x^2,2,sum)
> quadfit$newx<-t(apply(quadfit$x,1,"/",sqrt(ss)))
> cor(quadfit$newx[,-1])
           mother         m2     father         f2
mother 1.00000000 0.99967200 0.06125154 0.06372421
m2     0.99967200 1.00000000 0.06295853 0.06545140
father 0.06125154 0.06295853 1.00000000 0.99955902
f2     0.06372421 0.06545140 0.99955902 1.00000000
> eigen(t(quadfit$newx)%*%quadfit$newx)
eigen() decomposition
$values
[1] 4.991422e+00 6.521035e-03 2.055264e-03 9.282862e-07
[5] 5.469335e-07

$vectors
           [,1]        [,2]        [,3]       [,4]        [,5]
[1,] -0.4473588  0.04587956  0.71587410  0.2549874 -0.46933021
[2,] -0.4474391  0.32492479  0.09729481  0.3016640  0.77055475
[3,] -0.4469496  0.59957381 -0.51715131 -0.1528469 -0.38722047
[4,] -0.4474488 -0.30941772  0.14002819 -0.8096665  0.16994850
[5,] -0.4468713 -0.66112848 -0.43704076  0.4062735 -0.08457154

> ss<-apply(colinfit$x^2,2,sum)
> colinfit$newx<-t(apply(colinfit$x,1,"/",sqrt(ss)))
> cor(colinfit$newx[,-1])
            mother  mcmround
mother   1.0000000 0.9988347
mcmround 0.9988347 1.0000000
> eigen(t(colinfit$newx)%*%colinfit$newx)
eigen() decomposition
$values
[1] 2.999126e+00 8.720159e-04 1.524414e-06

$vectors
           [,1]       [,2]         [,3]
[1,] -0.5772665  0.8165221  0.007416852
[2,] -0.5773935 -0.4017504 -0.710783625
[3,] -0.5773908 -0.4145940  0.703371615

> fd$f2o<-resid(lm(f2~father,data=fd))
> fd$m2o<-resid(lm(m2~mother,data=fd))
> vif(lm(height ~ mother + m2 + father + f2, data = fd))
  mother       m2   father       f2 
1532.425 1532.938 1142.701 1143.185 
> vif(lm(height ~ mother + m2o + father + f2o, data = fd))
  mother      m2o   father      f2o 
1.011001 1.005440 1.008420 1.008020 
> library(MASS)# For lm.ridge
> #lm.ridge does not penalize intercept.
> ridge1<-lm.ridge(height~mother+mcmround, lambda=(0:100)/100,data=fd)
> plot(ridge1)
> ridge2<-lm.ridge(height~mother+I(mother^2)+father+I(father^2),
+    lambda=(0:100)/100,data=fd)
> plot(ridge2)
> #More logical to apply to standardized covariates
> select(ridge4<-lm.ridge(height~quadfit$newx[,-1],lambda=(0:10)/10,data=fd))
modified HKB estimator is 1.176636 
modified L-W estimator is 4.199935 
smallest value of GCV  at 1 
> 
<\/pre>