A sample reports corresponding to this analysis have been uploaded to the course Moodle.

1 Data

source: Automobile Dataset by Dr. Srinivasan Ramakrishnan

setwd("c://ws_stat")
auto_original <- read.csv("Automobile_data.csv")
# str(auto)
# summary(auto)

2 Organizing data

Missing value is denoted not by NA but by ? symbol, so it is replaced by NA, and the character data is converted to numeric data.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.1     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
auto <- auto_original %>%
  filter(price != "?", horsepower != "?", num.of.doors != "?") %>%
  mutate(price_mod = as.numeric(price),  # character --> numeric
         num_door_2 = ifelse(num.of.doors == "two", 1, 0),
         hp = as.numeric(horsepower),  # character --> numeric
         mpg = (city.mpg + highway.mpg) / 2)
dim(auto)  # sample size and the number of variables
## [1] 197  30

3 Descriptive statistics

The summary statistics for the attributes are shown. If the attribute is a categorical variable, the average price per category is also shown. In the case of quantitative variables, a scatter plot with price is also shown.

table(auto$make)
## 
##   alfa-romero          audi           bmw     chevrolet         dodge 
##             3             6             8             3             8 
##         honda         isuzu        jaguar         mazda mercedes-benz 
##            13             2             3            16             8 
##       mercury    mitsubishi        nissan        peugot      plymouth 
##             1            13            18            11             7 
##       porsche          saab        subaru        toyota    volkswagen 
##             4             6            12            32            12 
##         volvo 
##            11
table(auto$fuel.type)
## 
## diesel    gas 
##     19    178
auto %>% group_by(fuel.type) %>% summarise(price = mean(price_mod))
## # A tibble: 2 × 2
##   fuel.type  price
##   <chr>      <dbl>
## 1 diesel    16104.
## 2 gas       12978.
table(auto$num_door_2)
## 
##   0   1 
## 112  85
auto %>% group_by(num_door_2) %>% summarise(price = mean(price_mod))
## # A tibble: 2 × 2
##   num_door_2  price
##        <dbl>  <dbl>
## 1          0 13604.
## 2          1 12853.
table(auto$body.style)
## 
## convertible     hardtop   hatchback       sedan       wagon 
##           6           8          67          92          24
auto %>% group_by(body.style) %>% summarise(price = mean(price_mod))
## # A tibble: 5 × 2
##   body.style   price
##   <chr>        <dbl>
## 1 convertible 21890.
## 2 hardtop     22208.
## 3 hatchback    9958.
## 4 sedan       14564.
## 5 wagon       12500.
cor(auto$city.mpg, auto$highway.mpg)
## [1] 0.9724072
plot(auto$city.mpg, auto$highway.mpg, pch = 20, xlab = "MPG in city road", ylab = "MPG in highway")

summary(auto[, c("wheel.base", "length", "width", "height", "curb.weight")])
##    wheel.base         length          width           height     
##  Min.   : 86.60   Min.   :141.1   Min.   :60.30   Min.   :47.80  
##  1st Qu.: 94.50   1st Qu.:166.8   1st Qu.:64.10   1st Qu.:52.00  
##  Median : 97.00   Median :173.2   Median :65.50   Median :54.10  
##  Mean   : 98.85   Mean   :174.2   Mean   :65.89   Mean   :53.78  
##  3rd Qu.:102.40   3rd Qu.:183.5   3rd Qu.:66.90   3rd Qu.:55.60  
##  Max.   :120.90   Max.   :208.1   Max.   :72.00   Max.   :59.80  
##   curb.weight  
##  Min.   :1488  
##  1st Qu.:2145  
##  Median :2414  
##  Mean   :2558  
##  3rd Qu.:2935  
##  Max.   :4066
cor(auto[, c("wheel.base", "length", "width", "height", "curb.weight")])
##             wheel.base    length     width    height curb.weight
## wheel.base   1.0000000 0.8796894 0.8166003 0.5916237   0.7821089
## length       0.8796894 1.0000000 0.8564946 0.4899971   0.8827182
## width        0.8166003 0.8564946 1.0000000 0.3041986   0.8672891
## height       0.5916237 0.4899971 0.3041986 1.0000000   0.3061494
## curb.weight  0.7821089 0.8827182 0.8672891 0.3061494   1.0000000
plot(auto[, c("wheel.base", "length", "width", "height", "curb.weight")], 
     labels = c("Wheel base", "Length", "Width", "Height", "Curb weight"), pch = 20)

3.1 Price and other primary variables

hist(auto$price_mod, breaks = 30, xlab = "Price")  # histogram of price

summary(auto[, c("price_mod", "wheel.base", "curb.weight", "engine.size", "hp", "mpg")])
##    price_mod       wheel.base      curb.weight    engine.size        hp       
##  Min.   : 5118   Min.   : 86.60   Min.   :1488   Min.   : 61   Min.   : 48.0  
##  1st Qu.: 7775   1st Qu.: 94.50   1st Qu.:2145   1st Qu.: 97   1st Qu.: 70.0  
##  Median :10345   Median : 97.00   Median :2414   Median :119   Median : 95.0  
##  Mean   :13280   Mean   : 98.85   Mean   :2558   Mean   :127   Mean   :103.6  
##  3rd Qu.:16503   3rd Qu.:102.40   3rd Qu.:2935   3rd Qu.:145   3rd Qu.:116.0  
##  Max.   :45400   Max.   :120.90   Max.   :4066   Max.   :326   Max.   :262.0  
##       mpg       
##  Min.   :15.00  
##  1st Qu.:22.00  
##  Median :27.00  
##  Mean   :27.89  
##  3rd Qu.:32.00  
##  Max.   :51.50
cor(auto[, c("price_mod", "wheel.base", "curb.weight", "engine.size", "hp", "mpg")])
##              price_mod wheel.base curb.weight engine.size         hp        mpg
## price_mod    1.0000000  0.5829758   0.8347318   0.8737077  0.8119533 -0.7059215
## wheel.base   0.5829758  1.0000000   0.7821089   0.5719841  0.3731329 -0.5180041
## curb.weight  0.8347318  0.7821089   1.0000000   0.8489320  0.7599250 -0.7839263
## engine.size  0.8737077  0.5719841   0.8489320   1.0000000  0.8252863 -0.6753036
## hp           0.8119533  0.3731329   0.7599250   0.8252863  1.0000000 -0.8181929
## mpg         -0.7059215 -0.5180041  -0.7839263  -0.6753036 -0.8181929  1.0000000
plot(auto[, c("price_mod", "wheel.base", "curb.weight", "engine.size", "hp", "mpg")], 
     labels = c("Price", "Wheel base", "Curb weight", "Engine size", "Horsepower", "MPG"), pch = 20)

4 Regression

To include the vehicle body type as a dummy variable, the baseline category is set to sedan using the relevel function.

auto$body.style_mod <- relevel(as.factor(auto$body.style), ref = "sedan")
table(auto$body.style_mod)  # sedan is shown first
## 
##       sedan convertible     hardtop   hatchback       wagon 
##          92           6           8          67          24

Because the curb weight and engine size variables take large values, if they are included in the model unadjusted, the regression coefficients will be small (or rounded to zero because the values are too small) and difficult to understand when viewed in a table.

In such cases, the original variable can be divided by 100, 1000, or more, as in I(var / 100), where I is the function used to convert the variable in the lm function.

lm_auto_1 <- lm(log(price_mod) ~ fuel.type + num_door_2 + body.style_mod + I(wheel.base/1000) +
                length + width + height + I(curb.weight/1000) + I(engine.size/1000) + I(hp/1000) + 
                mpg, data = auto)
summary(lm_auto_1)
## 
## Call:
## lm(formula = log(price_mod) ~ fuel.type + num_door_2 + body.style_mod + 
##     I(wheel.base/1000) + length + width + height + I(curb.weight/1000) + 
##     I(engine.size/1000) + I(hp/1000) + mpg, data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44627 -0.09352 -0.01445  0.10077  0.45124 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.3297364  0.8631403   6.175 4.21e-09 ***
## fuel.typegas              -0.1875150  0.0617356  -3.037 0.002737 ** 
## num_door_2                 0.0046885  0.0400380   0.117 0.906908    
## body.style_modconvertible  0.3198312  0.0876242   3.650 0.000342 ***
## body.style_modhardtop      0.0742153  0.0736534   1.008 0.314970    
## body.style_modhatchback   -0.0745006  0.0423220  -1.760 0.080032 .  
## body.style_modwagon       -0.1183884  0.0458564  -2.582 0.010618 *  
## I(wheel.base/1000)         2.7945502  5.8190799   0.480 0.631634    
## length                     0.0009795  0.0032462   0.302 0.763207    
## width                      0.0349051  0.0142253   2.454 0.015079 *  
## height                     0.0100827  0.0080212   1.257 0.210364    
## I(curb.weight/1000)        0.2292044  0.1029966   2.225 0.027285 *  
## I(engine.size/1000)        0.0505511  0.7722329   0.065 0.947879    
## I(hp/1000)                 5.7262514  0.8918555   6.421 1.15e-09 ***
## mpg                       -0.0092034  0.0046821  -1.966 0.050861 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1724 on 182 degrees of freedom
## Multiple R-squared:  0.8925, Adjusted R-squared:  0.8842 
## F-statistic: 107.9 on 14 and 182 DF,  p-value: < 2.2e-16
lm_auto_2 <- lm(log(price_mod) ~ fuel.type + num_door_2 + body.style_mod + I(wheel.base/1000) +
                length + width + height + I(curb.weight/1000) + I(engine.size/1000) + I(hp/1000) + 
                city.mpg + highway.mpg, data = auto)
# summary(lm_auto_2)

lm_auto_3 <- lm(log(price_mod) ~ fuel.type + body.style_mod + I(wheel.base/1000) +
                I(curb.weight/1000) + I(hp/1000) + mpg, data = auto)
# summary(lm_auto_3)

lm_auto_4 <- lm(log(price_mod) ~ fuel.type + body.style_mod + I(wheel.base/1000) +
                I(curb.weight/1000) + I(hp/1000), data = auto)
# summary(lm_auto_4)

I consider make fixed effects and the correlation of the error term among car models of the same make.

library(fixest)
lm_auto_5 <- feols(log(price_mod) ~ fuel.type + body.style_mod + I(wheel.base/1000) +
                   I(curb.weight/1000) + I(hp/1000) + make, data = auto, cluster = ~ make)
# summary(lm_auto_5)

To display the estimation results of several regression models alongside each other, we use the package modelsummary.

# install.packages("modelsummary")  # run only the first time 
library(modelsummary)
## Warning: パッケージ 'modelsummary' はバージョン 4.2.3 の R の下で造られました
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'
msummary(list(lm_auto_1, lm_auto_2, lm_auto_3, lm_auto_4, lm_auto_5), 
         gof_omit = "AIC|BIC|Log.Lik.|F|RMSE", stars = TRUE)
 (1)   (2)   (3)   (4)   (5)
(Intercept) 5.330*** 5.127*** 7.362*** 6.706*** 7.041***
(0.863) (0.849) (0.436) (0.343) (0.384)
fuel.typegas -0.188** -0.208*** -0.202*** -0.133* -0.079
(0.062) (0.061) (0.060) (0.053) (0.046)
num_door_2 0.005 0.018
(0.040) (0.040)
body.style_modconvertible 0.320*** 0.316*** 0.317*** 0.329*** 0.228***
(0.088) (0.086) (0.083) (0.084) (0.043)
body.style_modhardtop 0.074 0.059 0.076 0.075 0.028
(0.074) (0.072) (0.067) (0.068) (0.108)
body.style_modhatchback -0.075+ -0.081+ -0.080* -0.078* -0.016
(0.042) (0.042) (0.031) (0.031) (0.027)
body.style_modwagon -0.118* -0.095* -0.120** -0.117** -0.084*
(0.046) (0.046) (0.042) (0.043) (0.036)
I(wheel.base/1000) 2.795 6.120 11.473* 12.150** 8.449+
(5.819) (5.814) (4.468) (4.515) (4.217)
length 0.001 -0.001
(0.003) (0.003)
width 0.035* 0.037**
(0.014) (0.014)
height 0.010 0.010
(0.008) (0.008)
I(curb.weight/1000) 0.229* 0.244* 0.293*** 0.366*** 0.492***
(0.103) (0.101) (0.084) (0.079) (0.077)
I(engine.size/1000) 0.051 0.357
(0.772) (0.764)
I(hp/1000) 5.726*** 5.228*** 5.809*** 6.366*** 3.750***
(0.892) (0.890) (0.790) (0.764) (0.518)
mpg -0.009+ -0.010*
(0.005) (0.004)
city.mpg -0.032**
(0.010)
highway.mpg 0.021*
(0.009)
makeaudi 0.172***
(0.029)
makebmw 0.320***
(0.039)
makechevrolet -0.138*
(0.053)
makedodge -0.192***
(0.030)
makehonda -0.095**
(0.032)
makeisuzu -0.255***
(0.020)
makejaguar -0.186+
(0.103)
makemazda -0.025
(0.023)
makemercedes-benz 0.074
(0.106)
makemercury -0.192***
(0.042)
makemitsubishi -0.236***
(0.017)
makenissan -0.152***
(0.012)
makepeugot -0.221**
(0.075)
makeplymouth -0.217***
(0.025)
makeporsche 0.458***
(0.054)
makesaab -0.003
(0.020)
makesubaru -0.179***
(0.020)
maketoyota -0.187***
(0.016)
makevolkswagen -0.084**
(0.023)
makevolvo -0.036
(0.049)
Num.Obs. 197 197 197 197 197
R2 0.892 0.897 0.888 0.884 0.953
R2 Adj. 0.884 0.889 0.882 0.880 0.946
Std.Errors by: make
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

4.1 Export results to CSV

lm_auto_bind <- msummary(list(lm_auto_1, lm_auto_2, lm_auto_3, lm_auto_4, lm_auto_5), 
                         output = "data.frame", stars = TRUE)
write.csv(lm_auto_bind, "lm_auto_bind.csv")  # export to CSV

4.2 Predicting price of a certain vehicle

Suppose the attributes of the concerned product are: Fuel type: gas, body style: hatchback, wheel base: 99.5, curb weight: 3053, horsepower: 160.

Applying this attribute to the estimation results of model (4), the predicted log price is computed as follows.

\[ \hat{\log y} = 6.706 - 0.133 - 0.078 + 12.15 \times (99.5/1000) + 0.366 \times (3053/1000) + 6.366 \times (160/1000) \approx 9.8 \]

It can also be computed using the predict function in R.

To convert this value (in log) back to the original price unit scale, use exp. Note that the error variance / 2 must be added.

\[ \hat{y} = \exp \left( \hat{\log y} + \frac{\sigma^2}{2} \right) = \exp \left( 9.8 + \frac{0.18^2}{2} \right) \approx 19036 \]

# car model in concern
vehicle_V <- tibble(fuel.type = "gas",
                    body.style_mod = "hatchback",
                    wheel.base = 99.5,
                    curb.weight = 3053,
                    hp = 160)
pred_new <- predict(lm_auto_4, new = vehicle_V)  # prediction (interpolation)
pred_new
##        1 
## 9.838625
summary(lm_auto_4)$sigma  # residual standard error (RMSE)
## [1] 0.1758342
sigma2 <- summary(lm_auto_4)$sigma^2  # residual variance
sigma2
## [1] 0.03091768
exp(pred_new + sigma2 / 2)  # predicted vehicle price
##        1 
## 19035.93

.