pdf("g08.pdf");options(width=64)
#install.packages("lme4")
library(lme4)# For lme
# Consider the balanced case, and convert to a data set with a constant
# explanatory variable on each cluster.  Note that the least squares fit
# of averages equals the ML fit.  It isn't the same if the explanatory
# variable in the reduced case needs to be averaged over cluster.
#**************************************************************/
#* IQ and Brain size, from Tramo MJ et al, Neurology 1998;50: */
#* 1246-1252. Description at statlib says that it is MSWord   */
#* format, but fortunately that isn't true.  The data file has*/
#* text descriptions in it, and tabs between columns.  Data   */
#* lines (and only data lines) start with numeral.  expand re-*/
#* moves tabs.                                                */
#* wget http://lib.stat.cmu.edu/datasets/IQ_Brain_Size        */
#* expand IQ_Brain_Size | grep '^[0-9]' > twinbrain.dat       */
#* Variables are                                              */
#* CCMIDSA: Corpus Collasum Surface Area (cm2)                */
#* FIQ: Full-Scale IQ                                         */
#* HC: Head Circumference (cm)                                */
#* ORDER: Birth Order                                         */
#* PAIR: Pair ID (Genotype)                                   */
#* SEX: Sex (1=Male 2=Female)                                 */
#* TOTSA: Total Surface Area (cm2)                            */
#* TOTVOL: Total Brain Volume (cm3)                           */
#* WEIGHT: Body Weight (kg)`                                  */
#**************************************************************/
twinbrain<-as.data.frame(scan("twinbrain.dat",
   what=list(CCMIDSA=0,FIQ=0,HC=0,ORDER=0,PAIR=0,SEX=0,TOTSA=0,
   TOTVOL=0,WEIGHT=0)))
fir<-twinbrain[twinbrain$ORDER==1,]
fir$v1<-fir$TOTVOL 
sec<-twinbrain[twinbrain$ORDER==2,]
sec$v2<-sec$TOTVOL 
both<-merge(fir,sec,by="PAIR")[,c("v1","v2")]
both$diff<-both$v2-both$v1
# Create reduced data set by averaging over cluster.
reduced<-NULL
for(i in unique(twinbrain$PAIR)){
   averaged<-apply(twinbrain[twinbrain$PAIR==i,],2,mean)
   reduced<-rbind(reduced,averaged)
}
reducedbrainfit<-lm(FIQ~TOTVOL,data=data.frame(reduced))
summary(reducedbrainfit)
fake<-merge(twinbrain[,c("PAIR","FIQ")],
   reduced[,c("PAIR","TOTVOL")],by="PAIR")
lmer(FIQ~TOTVOL+(1|PAIR),data=fake,REML=FALSE)
# REML estimates.
summary(lmer(FIQ~TOTVOL+(1|PAIR),data=twinbrain))
# ML estimates.
summary(lmer(FIQ~TOTVOL+(1|PAIR),data=twinbrain,REML=FALSE))
#Examine the parent-child height data.  Data are from
#https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T0HSJ1
#Documentation claims that these are formatted for Stata; fortunately R reads 
#them just fine.  Tell R that the first row contains column names.
galton<-read.table("galton-stata11.tab",header=TRUE)
#First examine daughters.
daughters<-galton[galton$female==1,]
#Families have multiple children.  Most of what we will do this semester will
#require observations to be independent, and so keep only the first daughter.
first<-c(TRUE,daughters$family[-1]!=daughters$family[-length(daughters$family)])
fd<-daughters[first,c("mother","father","height")]
summary(lmer(height~mother+father+(1|family),data=daughters))
library(carData)# For data set States
summary(States[,c("dollars","pay")])
twodfit<-lm(percent~dollars+pay,data=States,x=TRUE)
print(twodfit)
twodfitplot<-function(fit){
   X<-fit$x
   plot(X[,2],X[,3])
   xbar<-apply(X,2,mean)[-1]
   M<-chol(solve(t(X)%*%X)[2:3,2:3])
   theta<-(0:32)/5
   out<-sqrt(0.05)*solve(M)%*%rbind(cos(theta),sin(theta))
   lines(out[1,]+xbar[1],out[2,]+xbar[2],type="l")
   points(xbar[1],xbar[2],pch=2)
   hatdiag<-diag(X%*%solve(t(X)%*%X)%*%t(X))
   xxx<-X[hatdiag==max(hatdiag),2:3]
   text(xxx[1],xxx[2],round(max(hatdiag),2))
}
markleverageplot<-function(fit){
   X<-fit$x
   hat<-diag(X%*%solve(t(X)%*%X)%*%t(X))
# Plot vs. leave one out residuals.
   plot(predict(fit),rstandard(fit,type="predictive"),col=1+(rank(-hat)<4))
}
twodfitplot(twodfit)
markleverageplot(twodfit)
twodfitplot(logfit<-lm(log(Volume)~log(Girth)+log(Height),data=trees,x=TRUE))
markleverageplot(logfit)
twodfitplot(dfit<-lm(height~mother+father,data=fd,x=TRUE))
markleverageplot(dfit)
#The r function influence.measures calculates a 
#variety of influence measures for a fit.  These
#measures are dfbetas for model terms incl. the
#intercept, dffits, cov.r, Cook's distance, and h.
#Observations are marked as influential if any of these conditions happens:
#a dfbetas has absolute value greater than 1, 
#dffits has absolute value greater than 3 sqrt(k/(n-k)), 
#cov.r has absolute value greater than 1+3 k/(n-k), 
#Cook's distance is greater than the F median for k, n-k df,
#or the hat matrix diagonal exceeds 3 k/n.
influence.measures(twodfit)
<\/pre>