pdf("g07.pdf");options(width=64)
# Data set ships gives data on damage to ships from accidents,
# by ship class and time frame.  Variables include incidents,
# which counts accidents, and service, which counts the
# cumulative months of service.  There are also indicators for
# type of ship and indicators for time frame.
library(MASS)# For data set ships
data(ships)
# One might reasonably treat incidents as Poisson with mean
# proportional to time in service.  Use a square root
# transformation, which stabilizes the variance of Poisson
# counts.
plot(lm(sqrt(incidents)~service+year,data=ships))
# This shows a large outlier.  Perhaps service is incorrectly scaled.
plot(lm(sqrt(incidents)~sqrt(service)+year,data=ships))
# Aside from one outlier, variance stabilization doesn't seem too bad.
# Variability for low fitted values still seems too small.
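# A sketch of an alternative not pursued here: model the counts
# directly with a Poisson generalized linear model, using
# log(service) as an offset so that the mean is proportional to
# time in service.  Rows with zero service months must be
# dropped, since log(0) is undefined.
summary(glm(incidents~type+year+offset(log(service)),
   family=poisson,data=ships,subset=service>0))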
# Next, fit a weighted regression, using a different data set.
library(carData)# For data set States
# I'll dump this so I can read it into SAS.  Note that by
# default write.table includes the row names (here state
# abbreviations) as an unlabeled first column.
write.table(States,file="statesed.dat")
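# A variant that may be easier for SAS to ingest (a sketch; the
# file name statesed2.dat is illustrative): give the row names
# a column label and suppress quoting.
write.table(data.frame(state=rownames(States),States),
   file="statesed2.dat",row.names=FALSE,quote=FALSE)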
# Education data for states.  Variables include region of country, average dollars spent
# on education per student, average pay per teacher, and some test results.  My use of any
# of these variables as explanatory does not reflect any belief in a direction of
# causation.  All of these variables (except region) likely depend on each other or on
# some unmeasured variables.
statefit<-lm(dollars~pay,data=States)
plot(statefit)
summary(statefit)
# This fit should be weighted by population, since an average
# expenditure has a variance that decreases as the number of
# observations increases (the variance of a mean is sigma^2/n).
summary(statewfit<-lm(dollars~pay,data=States,weights=pop))
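# As a quick check (not part of the original analysis), draw
# the same diagnostic plots for the weighted fit.
plot(statewfit)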
#install.packages("EnvStats") if needed
library(EnvStats)# For boxcox
#This version maximizes the correlation in the normal
#probability plot.  It transforms only the response and not the
#explanatory variables.
library(datasets)# For data set trees
out<-boxcox(lm(Volume~Height+Girth,data=trees))
# Repeat with a finer grid of candidate lambda values.
out<-boxcox(lm(Volume~Height+Girth,data=trees),lambda=(-200:200)/100)
plot(out)
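# The fitted object carries the candidate lambda values and the
# objective (the probability plot correlation) evaluated at
# each, so the maximizer can be read off; a sketch:
out$lambda[which.max(out$objective)]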
# Unload package EnvStats because we will use a tool with exactly the same
# name from another package.
detach("package:EnvStats",unload=TRUE)
# The version of boxcox in MASS instead chooses the
# transformation via the profile log likelihood.
library(MASS)# For boxcox
boxcox(lm(Volume~Height+Girth,data=trees))
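# boxcox returns the grid of lambda values and the profile log
# likelihoods, so the maximizing lambda can be read off; a
# sketch (llgrid is an illustrative name):
llgrid<-boxcox(lm(Volume~Height+Girth,data=trees),plotit=FALSE)
llgrid$x[which.max(llgrid$y)]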
#**************************************************************/
#* IQ and Brain size, from Tramo MJ et al, Neurology 1998;50: */
#* 1246-1252. Description at statlib says that it is MSWord   */
#* format, but fortunately that isn't true.  The data file has*/
#* text descriptions in it, and tabs between columns.  Data   */
#* lines (and only data lines) start with a numeral.  expand  */
#* replaces tabs with spaces.                                 */
#* wget http://lib.stat.cmu.edu/datasets/IQ_Brain_Size        */
#* expand IQ_Brain_Size | grep '^[0-9]' > twinbrain.dat       */
#* Variables are                                              */
#* CCMIDSA: Corpus Callosum Surface Area (cm2)                */
#* FIQ: Full-Scale IQ                                         */
#* HC: Head Circumference (cm)                                */
#* ORDER: Birth Order                                         */
#* PAIR: Pair ID (Genotype)                                   */
#* SEX: Sex (1=Male 2=Female)                                 */
#* TOTSA: Total Surface Area (cm2)                            */
#* TOTVOL: Total Brain Volume (cm3)                           */
#* WEIGHT: Body Weight (kg)                                   */
#**************************************************************/
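# An R-native alternative to the shell pipeline above (a
# sketch): scan splits fields on any whitespace, so the tab
# expansion step is not needed here.
if(!file.exists("twinbrain.dat")){
   download.file("http://lib.stat.cmu.edu/datasets/IQ_Brain_Size",
      "IQ_Brain_Size")
   writeLines(grep("^[0-9]",readLines("IQ_Brain_Size"),value=TRUE),
      "twinbrain.dat")
}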
twinbrain<-as.data.frame(scan("twinbrain.dat",
   what=list(CCMIDSA=0,FIQ=0,HC=0,ORDER=0,PAIR=0,SEX=0,TOTSA=0,
   TOTVOL=0,WEIGHT=0)))
# Pair the brain volumes of first- and second-born twins within
# genotype pair.
fir<-twinbrain[twinbrain$ORDER==1,]
fir$v1<-fir$TOTVOL
sec<-twinbrain[twinbrain$ORDER==2,]
sec$v2<-sec$TOTVOL
both<-merge(fir,sec,by="PAIR")[,c("v1","v2")]
both$diff<-both$v2-both$v1
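# The within-pair differences invite a paired comparison; as a
# sketch beyond the original analysis, a one-sample t test on
# the differences:
t.test(both$diff)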
# Create reduced data set by averaging over cluster.
reduced<-NULL
for(i in unique(twinbrain$PAIR)){
   averaged<-apply(twinbrain[twinbrain$PAIR==i,],2,mean)
   reduced<-rbind(reduced,averaged)
}
# Here are the cluster averages.  Note that ORDER is constant
# (1.5) after averaging, and so we can't look for an effect due
# to birth order in the reduced data.
print(reduced)
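# A more compact way to build the same reduced data set (a
# sketch): aggregate returns a data frame directly, so no later
# conversion is needed.
aggregate(.~PAIR,data=twinbrain,FUN=mean)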
#Examine the impact of brain volume on full-scale IQ.
#The above calculations produced a matrix, and I need to
#convert it to a data frame before handing it to lm.
reducedbrainfit<-lm(FIQ~TOTVOL,data=data.frame(reduced))
plot(reducedbrainfit)
#The distribution of the fitted values is skewed.  This does
#not represent a failure of the model diagnostics, but it is
#somewhat distracting.
#Try a model with both variables logged.
reducedlogfit<-lm(log(FIQ)~log(TOTVOL),data=data.frame(reduced))
plot(reducedlogfit)
#This one looks as good as the raw scale, without the annoying
#high-leverage point.  It also makes more intuitive sense, in
#that both variables are constrained to be positive.
summary(reducedlogfit)
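# In the log-log model the slope is interpretable as an
# elasticity; a sketch of an interval estimate for it:
confint(reducedlogfit,"log(TOTVOL)")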
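# Close the PDF device opened on the first line (assuming the
# script ends here).
dev.off()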