# Send the graphics produced by this script to a PDF file, and limit printed
# output to 64 columns.
pdf("g03.pdf")
options(width = 64)
# Fisher's Z is in the package DescTools.  If not already installed, do
# install.packages("DescTools")
library(DescTools) # For CorCI
# Examine the parent-child height data.  Data are from
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1
# Documentation claims that these are formatted for Stata; fortunately R reads
# them just fine.  Tell R that the first row contains column names.
galton <- read.table("galton-stata11.tab", header = TRUE)
# First examine daughters.
daughters <- galton[galton$female == 1, ]
# Families have multiple children.  Most of what we will do this semester will
# require observations to be independent, and so keep only the first daughter
# listed for each family.  duplicated() marks every repeat of a family id, so
# negating it keeps exactly one row per family even if a family's rows are not
# adjacent in the file.
first <- !duplicated(daughters$family)
fd <- daughters[first, c("mother", "father", "height")]
cat("Confidence Interval for Mother Daughter Height Cor. via Fisher Z\n")
# CorCI takes the sample correlation and the sample size and returns the
# estimate together with the confidence limits.
CorCI(cor(fd$height, fd$mother), length(fd$mother),
  conf.level = 0.95, alternative = "two.sided")
# This confidence interval differs slightly from that in SAS,
# in that SAS includes a small bias correction but R does not.
library(datasets) # For data set anscombe
# Get documentation for the data set using ?anscombe
# Fit a simple linear regression to each of the four x/y pairs in anscombe
# and print the summary of each fit.
fit1 <- lm(y1 ~ x1, data = anscombe)
summary(fit1)
fit2 <- lm(y2 ~ x2, data = anscombe)
summary(fit2)
fit3 <- lm(y3 ~ x3, data = anscombe)
summary(fit3)
fit4 <- lm(y4 ~ x4, data = anscombe)
summary(fit4)
# Draw the four scatterplots, each with its fitted line, on a 2 x 2 grid.
# with() looks the columns up inside anscombe without attach(), so nothing is
# left on the search path if one of the plotting calls fails.
old_par <- par(mfrow = c(2, 2))
with(anscombe, plot(x1, y1, main = "Standard Case"))
abline(fit1)
with(anscombe, plot(x2, y2, main = "Linear Model Wrong"))
abline(fit2)
with(anscombe, plot(x3, y3, main = "Outlier"))
abline(fit3)
with(anscombe, plot(x4, y4, main = "Influential Observation"))
abline(fit4)
# Put the plotting layout back the way it was before this section.
par(old_par)
# Build the n x 2 design matrix with the first column all 1s and the second
# column the regressor of interest.  (The original note gives the size as
# 168 x 2; that depends on the data file and is not checked here.)
# Columns are taken from fd explicitly rather than via attach(), so the search
# path is left untouched.
X <- cbind(1, mother = fd$mother)
# Transposing is done by t().  Multiplication is done by %*%.  The matrix
# inverse is calculated using solve().  This gives the same parameter
# estimates that lm() reports below:
solve(t(X) %*% X) %*% t(X) %*% fd$height
# Here we're using the matrix inverse to solve a system of equations with two
# equations and two unknowns.  R does this more efficiently as
solve(t(X) %*% X, t(X) %*% fd$height)
# Compare with
print(onefit <- lm(height ~ mother, data = fd))
# We can now add father's height
X <- cbind(1, mother = fd$mother, father = fd$father)
solve(t(X) %*% X, t(X) %*% fd$height)
# Regress daughter's height on both parents' heights and show the result.
myfit <- lm(height ~ mother + father, data = fd)
print(myfit)
# Show the two fits one after the other so they can be compared.
# Fit with mother and father
print(myfit)
# Fit with just mother
print(onefit)
# Notice that mother's coefficient is smaller in the fit that also includes
# father.  That's because taller women tend to be married to taller men, and
# so some of the increased height for daughters of taller mothers
# gets ascribed to the fathers.
# Perform the standardized regression.  Standardize the response and the
# explanatory variables: subtract the mean and divide by the standard
# deviation, so that each variable is measured in sd units.
standardize <- function(x) {
  (x - mean(x)) / sd(x)
}
fd$stheight <- standardize(fd$height)
fd$stmother <- standardize(fd$mother)
fd$stfather <- standardize(fd$father)
lm(stheight ~ stmother + stfather, data = fd)
# Shows that changing mother's height by 1 sd changes daughter height by
# approx one third sd.
# Shows that changing father's height by 1 sd changes daughter height by
# approx one half sd.
# Close the graphics device opened at the top of the script so that the PDF
# is completely written.  The guard skips this when no device is open.
if (dev.cur() > 1) {
  invisible(dev.off())
}