pdf("g06.pdf");options(width=64)
# Refit the trees data set on the log scale:
library(datasets)# For data set trees
logfit<-lm(log(Volume)~log(Girth)+log(Height),data=trees)
# How do we test hypotheses for which null values are not zero?  The brute
# force approach is to subtract off the fits under the non-null but non-zero
# model.  This makes the the remainder have zero null values.
newtrees<-trees
newtrees$adjvolume<-newtrees$Volume/(newtrees$Height*newtrees$Girth^2)
summary(lm(log(adjvolume)~log(Height)+log(Girth),data=newtrees))
# This needs to be done frequently enough that there's a more regular
# way to do this:
summary(lm(log(Volume)~log(Height)+log(Girth),data=trees,
   offset=log(Height)+2*log(Girth)))
# Here's a way to do test non-zero null hypotheses without refitting the
# model.  To do this one needs to explore how lm feeds information to other 
# functions.  lm gives only intermediate results for the fit.  Fitting the 
# model but not saving the result, or putting the fit object at the command 
# prompt, like this
logfit
# prints the object.  You'll notice that it doesn't print much, and conjecture
# that there's a lot more information in this object than is immediately 
# printed.  This conjecture is right.  If you do
# print.default(logfit)
# you will see extensive output.  Generic R tasks, like summary, print, and
# plot, and some others, are tailored to the kind of object that are given.
# To find what kind of object we have, ask for its class:
class(logfit)
# See that the class is lm.  Then the specific form of the generic task is
# print.lm.  You will find that you can't apply, examine, or get documentation
# for this function directly.   R doesn't want you to think about
# the printing process, and so you have to work to see it.  R stores the
# various objects and functions accessible to you in various places.  This
# allows for there to be different objects with the same name.  Preinstalled
# objects are classified into name spaces.  print.lm is in the stats name
# space.  Many R objects are available to you just by typing the name at
# the command line, but some aren't.  To figure out what namespace print.lm
# sits inside, we'll guess that it's the same one that lm is in.  Documentation
# ?lm
# indicates the namespace as stats.  My failure to access this function 
# by typing its name at the prompt indicates that R hasn't made this easily
# accessible.  In R, to make an object accessible without explicit reference
# is called exporing it, and so print.lm is not exported from stats.
# Refer to an object in a specific namespace by typing the namespace, then
# :: and the object name, with no spaces.  If the object is not exported, type
# ::: instead of :: .
stats:::print.lm
# You will see that print.lm does very little calculation, and just formats.
# To get the useful information from the intermediate calculation stored in 
# log fit, call summary.
logfitsum<-summary(logfit)
# Again this generic task was fed to a specific function specified by the
# class of logfit.  In this case, the generic funcition is summary.lm.
# Because the summary was saved, it wasn't printed.  Print it now.
print(logfitsum)
# summary.lm tests the null hypotheses that the regresssion parameters are
# zero.  Interesting null hypothesis is that coefficient of log girth is 2,
#coefficient of log height is 1.  Explore this using confidence intervals:
confint(logfit)
# Confirm that the interesting values of the parameters sit inside the
# the confidence intervals.  If you want p-values, you need to calculate them
# yourself.  The function summary.lm is exported, and its documentation is
# viewable, via
# ?summary.lm
# Note that there is no option for a null hypothesis for regression coefficients
# other than zero.  See that the summary object has a component coef, which is 
# a matrix with one row for each regression coefficient, and columns for
# estimates, standard error, t statistic, and p-values.  Extract the right
# estimates and standard errors.
estse<-logfitsum$coef[c("log(Girth)","log(Height)"),c(1,2)]
# Calculate new t statistics using the new null hypotheses.
newtstat<-(estse[,1]-c(2,1))/estse[,2]
# The summary object has a component called df with degrees of freedom.
# ?summary.lm tells us that this vector has three components, the model,
# residual, and adjusted degrees of freedom.  We want the second.
2*pt(-abs(newtstat),logfitsum$df[2])
#Here rstandard gives studentized residuals (name not withstanding!)
#and type "sd.1" (the default) indicates standardize via residual 
#standard deviation.  The only alternative is cross-validation, to be
#discussed later.  A similar function rstudent estimates separate
#variances omitting the current variable.
badfit<-lm(Volume~Girth+Height,data=trees)
plot(fitted(badfit),rstandard(badfit,type="sd.1"),
   main="Residual plot for bad tree fit")
# Do a similar approach for height data.
#Examine the parent-child height data.  Data are from
#https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1
#Documentation claims that these are formatted for Stata; fortunately R reads 
#them just fine.  Tell R that the first row contains column names.
galton<-read.table("galton-stata11.tab",header=TRUE)
#First examine daughters.
daughters<-galton[galton$female==1,]
#Families have multiple children.  Most of what we will do this semester will
#require observations to be independent, and so keep only the first daughter.
first<-c(TRUE,daughters$family[-1]!=daughters$family[-length(daughters$family)])
fd<-daughters[first,c("mother","father","height")]
heightfit<-lm(height~mother+father,data=fd)
plot(fitted(heightfit),rstandard(heightfit),
   main="Residual plot for daughter height based on both parent heights")
# Caption makes it look like these are standardized, but documentation
# says they are studentized.  In general, if you care which is used,
# read documentation carefully.
# Argument which gives: 
#   1. residuals vs. fitted.
#   2. normal quantile plot of studentized residuals (caption notwithstanding)
#   3. Scale-Location plot
#   4. Cook's distance
#   5. Residuals vs. Leverage
#   6. Cook's distance vs. leverage.
#  Plots 4-6 will be discussed in a later lecture.
plot(logfit,which=3)
plot(heightfit,which=3)
#Are studentized residuals close to zero?  Do a simulation.
W<-cbind(1,0:8)
n<-dim(W)[1]
H<-W%*%solve(t(W)%*%W)%*%t(W)
#Build a structure to hold our simulated values.  These will be,
#for each random data set, a residual associated with W=0, and 
#one for W near its mean.
out<-array(NA,c(1000000,2))
for(j in seq(dim(out)[1])){
   y<-rnorm(n)
   rvec<-(y-H%*%y)
   s<-sqrt(sum(rvec^2)/(n-2))
   rstudvec<-rvec/(s*sqrt(1-diag(H)))
   out[j,]<-rstudvec[c(1,floor((n+1)/2))]
}
apply(out,2,var)
library(e1071)# For skewness, kurtosis
apply(out,2,skewness)
apply(out,2,kurtosis)
# Install if needed:
# install.packages("car")#Quite large to install
# The default quantile plot from R uses standardized residuals.
plot(logfit,which=2)
# A little work gives a plot with studentized residuals.
library(car)# For qqPlot
# The below compares to t distribution.
qqPlot(logfit,main="Normal Quantile Plot via qqPlot")
library(MASS)# For studres
qqnorm(studres(logfit),main="Normal Quantile Plot via studres and qqnorm")
abline(0,1)
# Perform a similar analysis for height data.
#Examine the parent-child height data.  Data are from
#https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1
#Documentation claims that these are formatted for Stata; fortunately R reads 
#them just fine.  Tell R that the first row contains column names.
galton<-read.table("galton-stata11.tab",header=TRUE)
#First examine daughters.
daughters<-galton[galton$female==1,]
#Families have multiple children.  Most of what we will do this semester will
#require observations to be independent, and so keep only the first daughter.
first<-c(TRUE,daughters$family[-1]!=daughters$family[-length(daughters$family)])
fd<-daughters[first,c("mother","father","height")]
myfit<-lm(height~mother+father,data=fd)
qqPlot(myfit)
#Plot partial residuals.  Plot with one argument puts component
#number on horizontal axis.
smallerfit<-lm(log(Volume)~log(Height),data=trees)
partfit<-lm(log(Girth)~log(Height),data=trees)
plot(resid(partfit),resid(smallerfit),main="Added Variable Plot",
   sub="Note convex relationship")
#Plot PRESS residuals.  Plot with one argument puts component
#number on horizontal axis.
plot(rstandard(logfit,type="pred"),
   main="PRESS residuals for tree data",sub="Log fit")
#Calculate PRESS statistic.  Plot with one argument puts component
#number on horizontal axis.
#install.packages("qpcR")
library(qpcR)# For PRESS
#PRESS applied to a model with internal transformations 
#doesn't seem to work.
summary(logfit)
PRESS(logfit)
#Let's try this:
newtrees<-trees
newtrees$logvolume<-log(newtrees$Volume)
newtrees$logheight<-log(newtrees$Height)
newtrees$loggirth<-log(newtrees$Girth)
PRESS(lm(logvolume~logheight+loggirth,data=newtrees))
<\/pre>