library(alr3)
library(tidyverse)
# Load the `water` data set shipped with alr3 and inspect its structure.
data("water", package = "alr3")
str(water)
## 'data.frame':    43 obs. of  8 variables:
##  $ Year   : int  1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 ...
##  $ APMAM  : num  9.13 5.28 4.2 4.6 7.15 9.7 5.02 6.7 10.5 9.1 ...
##  $ APSAB  : num  3.58 4.82 3.77 4.46 4.99 5.65 1.45 7.44 5.85 6.13 ...
##  $ APSLAKE: num  3.91 5.2 3.67 3.93 4.88 4.91 1.77 6.51 3.38 4.08 ...
##  $ OPBPC  : num  4.1 7.55 9.52 11.14 16.34 ...
##  $ OPRC   : num  7.43 11.11 12.2 15.15 20.05 ...
##  $ OPSLAKE: num  6.47 10.26 11.35 11.13 22.81 ...
##  $ BSAAM  : int  54235 67567 66161 68094 107080 67594 65356 67909 92715 70024 ...
# Keep every column except Year (the first column in the listing above), so the
# working data frame holds only the six predictors and the BSAAM response.
socal.water <- water[, setdiff(names(water), "Year")]

## Correlations

# Pairwise Pearson correlations between all columns of socal.water
# (predictors and the BSAAM response alike).
water.cor <- cor(socal.water, method = "pearson")
print(water.cor)
##             APMAM      APSAB    APSLAKE      OPBPC      OPRC    OPSLAKE
## APMAM   1.0000000 0.82768637 0.81607595 0.12238567 0.1544155 0.10754212
## APSAB   0.8276864 1.00000000 0.90030474 0.03954211 0.1056396 0.02961175
## APSLAKE 0.8160760 0.90030474 1.00000000 0.09344773 0.1063836 0.10058669
## OPBPC   0.1223857 0.03954211 0.09344773 1.00000000 0.8647073 0.94334741
## OPRC    0.1544155 0.10563959 0.10638359 0.86470733 1.0000000 0.91914467
## OPSLAKE 0.1075421 0.02961175 0.10058669 0.94334741 0.9191447 1.00000000
## BSAAM   0.2385695 0.18329499 0.24934094 0.88574778 0.9196270 0.93843604
##             BSAAM
## APMAM   0.2385695
## APSAB   0.1832950
## APSLAKE 0.2493409
## OPBPC   0.8857478
## OPRC    0.9196270
## OPSLAKE 0.9384360
## BSAAM   1.0000000
# Scatterplot matrix of the same columns, as a visual companion to the table.
pairs(socal.water)

## Modeling and Evaluation

library(leaps)
# Full model: regress BSAAM on every other column of socal.water.
# socal.water is deliberately NOT attach()ed: each model call in this script
# passes `data = socal.water`, so nothing needs the columns on the search
# path, and an attached copy would go stale as soon as socal.water is modified.
fit <- lm(BSAAM ~ ., data = socal.water)
summary(fit)
## 
## Call:
## lm(formula = BSAAM ~ ., data = socal.water)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12690  -4936  -1424   4173  18542 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15944.67    4099.80   3.889 0.000416 ***
## APMAM         -12.77     708.89  -0.018 0.985725    
## APSAB        -664.41    1522.89  -0.436 0.665237    
## APSLAKE      2270.68    1341.29   1.693 0.099112 .  
## OPBPC          69.70     461.69   0.151 0.880839    
## OPRC         1916.45     641.36   2.988 0.005031 ** 
## OPSLAKE      2211.58     752.69   2.938 0.005729 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7557 on 36 degrees of freedom
## Multiple R-squared:  0.9248, Adjusted R-squared:  0.9123 
## F-statistic: 73.82 on 6 and 36 DF,  p-value: < 2.2e-16
# Best-subsets search over the predictors of BSAAM; the summary object carries
# the per-size selection statistics (cp, adjr2, ...) used below.
sub.fit <- regsubsets(BSAAM ~ ., data = socal.water)
best.summary <- summary(sub.fit)
# split the plotting window in a grid of one by two, keeping the previous
# graphics settings so they can be put back once both panels are drawn
old.par <- par(mfrow = c(1, 2))
# left panel: Cp for the best subset of each size
plot(best.summary$cp, xlab = "number of features", ylab = "cp")
# right panel: regsubsets' own plot, with models ordered by Cp
plot(sub.fit, scale = "Cp")
# restore the previous layout so later base-graphics plots are not squeezed
# into the 1x2 grid
par(old.par)

# Adjusted R-squared of the best subset at each size (one entry per number of
# features); in the values echoed below the maximum is the third entry.
print(best.summary[["adjr2"]])
## [1] 0.8777515 0.9001619 0.9185369 0.9168706 0.9146772 0.9123079
# Refit with two predictors only, APSLAKE and OPSLAKE, and report the fit.
fit.2 <- lm(
  BSAAM ~ APSLAKE + OPSLAKE,
  data = socal.water
)
print(summary(fit.2))
## 
## Call:
## lm(formula = BSAAM ~ APSLAKE + OPSLAKE, data = socal.water)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13335.8  -5893.2   -171.8   4219.5  19500.2 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19144.9     3812.0   5.022  1.1e-05 ***
## APSLAKE       1768.8      553.7   3.194  0.00273 ** 
## OPSLAKE       3689.5      196.0  18.829  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8063 on 40 degrees of freedom
## Multiple R-squared:  0.9049, Adjusted R-squared:  0.9002 
## F-statistic: 190.3 on 2 and 40 DF,  p-value: < 2.2e-16
# Put the observed response and the fitted values of fit.2 side by side in
# socal.water so they can be plotted against each other.
# NOTE(review): these two columns now live in the modelling data frame, so any
# later `BSAAM ~ .` fit on socal.water would pick them up as predictors.
socal.water$Actual <- water$BSAAM

socal.water[["Forecast"]] <- predict(fit.2)
# Fitted vs observed values, with a least-squares line through the points.
ggplot(socal.water, aes(x = Forecast, y = Actual)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Forecast vs Actual")

## Logistic Regression

library(MASS)
library(tidyverse)
# Load the `biopsy` data set from MASS and take a compact look at its columns.
data("biopsy", package = "MASS")
glimpse(biopsy)
## Observations: 699
## Variables: 11
## $ ID    <chr> "1000025", "1002945", "1015425", "1016277", "1017023", "...
## $ V1    <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4, 10...
## $ V2    <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, 1, 7...
## $ V3    <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, 1, 7...
## $ V4    <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, 1, 6...
## $ V5    <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2, 4,...
## $ V6    <int> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, 1,...
## $ V7    <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3, 4,...
## $ V8    <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, 1,...
## $ V9    <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2,...
## $ class <fctr> benign, benign, benign, benign, benign, malignant, beni...
# Remove the ID column, leaving V1-V9 and class.
biopsy[["ID"]] <- NULL