R version 3.6.3 (2020-02-29) -- "Holding the Windsock"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> pdf("g07.pdf");options(width=64)
> # Data set ships gives the number of damage incidents to ships
> # from accidents, by ship class and time frame.  Variables include
> # incidents, counting accidents, and service, counting the
> # cumulative months of service.  There are also indicators for
> # type of ship and for time frame.
> library(MASS)# For data set ships
> data(ships)
> # One might reasonably treat incidents as Poisson with mean proportional
> # to time in service.  The square root is the variance-stabilizing
> # transformation for the Poisson, so use it here.
> plot(lm(sqrt(incidents)~service+year,data=ships))
> # This shows a large outlier.  Perhaps service is incorrectly scaled.
> plot(lm(sqrt(incidents)~sqrt(service)+year,data=ships))
> # Aside from one outlier, variance stabilization doesn't seem too bad.
> # Variability for low fitted values still seems too small.
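> # A more direct alternative (a sketch, not run in this session):
> # fit a Poisson regression with log(service) as an offset, so the
> # mean is proportional to months in service.  Rows with zero
> # service must be dropped before taking logs.
> # shipglm<-glm(incidents~type+year+offset(log(service)),
> #    family=poisson,data=ships,subset=service>0)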
> #  Fit a weighted regression
> library(carData)# For data set States
> # I'll dump this so I can read it into SAS.
> write.table(States,file="statesed.dat")
> # Education data for states.  Variables include region of country, average dollars spent
> # on education per student, average pay per teacher, and some test results.  My use of any
> # of these variables as explanatory does not reflect any belief in a direction of
> # causation.  All of these variables (except region) likely depend on each other or on
> # some unmeasured variables.
> statefit<-lm(dollars~pay,data=States)
> plot(statefit)
> summary(statefit)

Call:
lm(formula = dollars ~ pay, data = States)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.12053 -0.37368  0.05311  0.31273  2.43224 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.62426    0.61666  -2.634   0.0113 *  
pay          0.21976    0.01965  11.185  4.3e-15 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.7375 on 49 degrees of freedom
Multiple R-squared:  0.7186,	Adjusted R-squared:  0.7128 
F-statistic: 125.1 on 1 and 49 DF,  p-value: 4.3e-15

> # This should be weighted by population, since average expenditure should
> # have a variance that decreases as the number of observations increases.
> summary(statewfit<-lm(dollars~pay,data=States,weights=pop))

Call:
lm(formula = dollars ~ pay, data = States, weights = pop)

Weighted Residuals:
     Min       1Q   Median       3Q      Max 
-298.736  -12.172    7.656   24.821  246.012 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -1.1180     0.8840  -1.265    0.212    
pay           0.1968     0.0264   7.454 1.32e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 68.32 on 49 degrees of freedom
Multiple R-squared:  0.5314,	Adjusted R-squared:  0.5218 
F-statistic: 55.56 on 1 and 49 DF,  p-value: 1.319e-09
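> # Weighted least squares is equivalent to ordinary least squares
> # after scaling each observation by the square root of its weight;
> # a sketch (not run) that should reproduce the coefficients above:
> # sw<-sqrt(States$pop)
> # lm(I(sw*dollars)~0+sw+I(sw*pay),data=States)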

> #install.packages("EnvStats") if needed
> library(EnvStats)# For boxcox
> #Maximizes the correlation in the normal probability plot.
> #Transforms only the response, not the explanatory variables.
> library(datasets)# For data set trees
> out<-boxcox(lm(Volume~Height+Girth,data=trees))
> out<-boxcox(lm(Volume~Height+Girth,data=trees),lambda=(-200:200)/100)
> plot(out)
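> # The maximizing lambda can be recovered from the fitted object;
> # a sketch (not run), assuming the EnvStats object carries lambda
> # and objective components (check with str(out)):
> # out$lambda[which.max(out$objective)]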
> # Unload package EnvStats because we will use a tool with exactly the same
> # name from another package.
> detach("package:EnvStats",unload=TRUE)
> # Via log likelihood
> library(MASS)# For boxcox
> boxcox(lm(Volume~Height+Girth,data=trees))
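> # MASS's boxcox returns the lambda grid and the log likelihoods, so
> # the maximizing lambda can be recovered; a sketch (not run):
> # ll<-boxcox(lm(Volume~Height+Girth,data=trees),plotit=FALSE)
> # ll$x[which.max(ll$y)]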
> #**************************************************************/
> #* IQ and Brain size, from Tramo MJ et al, Neurology 1998;50: */
> #* 1246-1252. Description at statlib says that it is MSWord   */
> #* format, but fortunately that isn't true.  The data file has*/
> #* text descriptions in it, and tabs between columns.  Data   */
> #* lines (and only data lines) start with a numeral.  expand  */
> #* removes tabs.                                              */
> #* wget http://lib.stat.cmu.edu/datasets/IQ_Brain_Size        */
> #* expand IQ_Brain_Size | grep '^[0-9]' > twinbrain.dat       */
> #* Variables are                                              */
> #* CCMIDSA: Corpus Callosum Surface Area (cm2)                */
> #* FIQ: Full-Scale IQ                                         */
> #* HC: Head Circumference (cm)                                */
> #* ORDER: Birth Order                                         */
> #* PAIR: Pair ID (Genotype)                                   */
> #* SEX: Sex (1=Male 2=Female)                                 */
> #* TOTSA: Total Surface Area (cm2)                            */
> #* TOTVOL: Total Brain Volume (cm3)                           */
> #* WEIGHT: Body Weight (kg)                                   */
> #**************************************************************/
> twinbrain<-as.data.frame(scan("twinbrain.dat",
+    what=list(CCMIDSA=0,FIQ=0,HC=0,ORDER=0,PAIR=0,SEX=0,TOTSA=0,
+    TOTVOL=0,WEIGHT=0)))
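> # Since the cleaned file is whitespace delimited with no header,
> # read.table would also work; a sketch (not run):
> # twinbrain<-read.table("twinbrain.dat",
> #    col.names=c("CCMIDSA","FIQ","HC","ORDER","PAIR","SEX",
> #    "TOTSA","TOTVOL","WEIGHT"))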
> fir<-twinbrain[twinbrain$ORDER==1,]
> fir$v1<-fir$TOTVOL 
> sec<-twinbrain[twinbrain$ORDER==2,]
> sec$v2<-sec$TOTVOL 
> both<-merge(fir,sec,by="PAIR")[,c("v1","v2")]
> both$diff<-both$v2-both$v1
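> # The differences invite a paired test of a birth-order effect on
> # volume; a one-sample t test on diff is the paired t test
> # (a sketch, not run):
> # t.test(both$diff)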
> # Create reduced data set by averaging over cluster.
> reduced<-NULL
> for(i in unique(twinbrain$PAIR)){
+    averaged<-apply(twinbrain[twinbrain$PAIR==i,],2,mean)
+    reduced<-rbind(reduced,averaged)
+ }
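> # The same cluster means can be computed more idiomatically with
> # aggregate; a sketch (not run):
> # aggregate(.~PAIR,data=twinbrain,FUN=mean)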
> # Here are the cluster averages.  Note that ORDER is constant (1.5)
> # after averaging, and so we can't look for an effect due to order.
> print(reduced)
         CCMIDSA   FIQ    HC ORDER PAIR SEX    TOTSA TOTVOL
averaged   5.905  92.5 54.45   1.5    1   2 1799.385  984.0
averaged   6.010  87.0 52.95   1.5    2   2 1881.300 1031.0
averaged   8.205 102.0 57.35   1.5    3   2 2240.325 1276.5
averaged   7.140  99.5 55.95   1.5    4   2 1858.815 1065.0
averaged   6.455 126.5 53.95   1.5    5   2 1726.170 1052.0
averaged   8.375  98.5 57.20   1.5    6   1 1747.955 1126.0
averaged   6.320  90.5 57.20   1.5    7   1 2077.645 1085.5
averaged   7.610  89.5 56.50   1.5    8   1 2060.740 1393.0
averaged   6.310 105.5 56.85   1.5    9   1 1797.740 1064.5
averaged   7.595 118.5 58.85   1.5   10   1 1872.730 1182.0
           WEIGHT
averaged  58.2875
averaged  61.3490
averaged  62.8240
averaged 120.4305
averaged  72.5760
averaged  61.2360
averaged  81.6480
averaged  98.6580
averaged  85.0500
averaged  75.9780
> #Examine the impact of brain volume on full-scale IQ.
> #The above calculations made a matrix, and I need to
> #convert it to a data frame before feeding it to lm.
> reducedbrainfit<-lm(FIQ~TOTVOL,data=data.frame(reduced))
> plot(reducedbrainfit)
> #The scale of the fitted values is skewed.  This does not represent
> #a failure of model diagnostics, but is somewhat distracting.
> #Try a model with both variables logged.
> reducedlogfit<-lm(log(FIQ)~log(TOTVOL),data=data.frame(reduced))
> plot(reducedlogfit)
> #This one looks as good as the raw scale, without the annoying
> #high-leverage point.  It also makes more intuitive sense, in
> #that both variables are constrained to be positive.
> summary(reducedlogfit)

Call:
lm(formula = log(FIQ) ~ log(TOTVOL), data = data.frame(reduced))

Residuals:
     Min       1Q   Median       3Q      Max 
-0.14635 -0.09923 -0.01418  0.04183  0.22898 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  4.95425    2.86648   1.728    0.122
log(TOTVOL) -0.04929    0.40822  -0.121    0.907

Residual standard error: 0.1299 on 8 degrees of freedom
Multiple R-squared:  0.001819,	Adjusted R-squared:  -0.123 
F-statistic: 0.01458 on 1 and 8 DF,  p-value: 0.9069
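> # In the log-log model the slope is an elasticity: a 1% increase in
> # volume is associated with roughly a -0.05% change in FIQ, and a
> # confidence interval shows how little the data pin this down
> # (a sketch, not run):
> # confint(reducedlogfit)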

> 