AE 05: Predicting hotel price (with numeric engineering!)
Suggested answers
Application exercise
Answers
Setup
reg_metrics <- metric_set(mae, rmse, rsq)
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
sample_n(5000) |>
arrange(arrival_date) |>
select(-arrival_date) |>
mutate(
company = factor(as.character(company)),
country = factor(as.character(country)),
agent = factor(as.character(agent))
)
set.seed(4028)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)
hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)
set.seed(472)
hotel_folds <- vfold_cv(hotel_train, strata = avg_price_per_room)
hotel_folds
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [3372/377]> Fold01
2 <split [3373/376]> Fold02
3 <split [3373/376]> Fold03
4 <split [3373/376]> Fold04
5 <split [3373/376]> Fold05
6 <split [3374/375]> Fold06
7 <split [3375/374]> Fold07
8 <split [3376/373]> Fold08
9 <split [3376/373]> Fold09
10 <split [3376/373]> Fold10
Adjust for skewness
Your turn: Examine hotel_train
and identify a numeric predictor that is skewed. Incorporate an appropriate transformation into the recipe below and estimate a linear regression model using 10-fold cross-validation. How does the model perform with and without the transformation?
hotel_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
step_dummy(all_nominal_predictors()) |>
step_zv(all_predictors())
Tip
Use GGally::ggpairs()
to generate bivariate comparisons for all variables in your dataset.
hotel_train |>
select(where(is.numeric)) |>
ggpairs()
# try Yeo-Johnson with all numeric predictors
hotel_yj_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
step_YeoJohnson(all_numeric_predictors()) |>
step_dummy(all_nominal_predictors()) |>
step_zv(all_predictors())
# check the recipe
hotel_yj_rec |>
prep() |>
tidy(number = 1)
# A tibble: 7 × 3
terms value id
<chr> <dbl> <chr>
1 lead_time 0.173 YeoJohnson_f7mCz
2 stays_in_weekend_nights 0.00547 YeoJohnson_f7mCz
3 stays_in_week_nights 0.121 YeoJohnson_f7mCz
4 booking_changes -4.16 YeoJohnson_f7mCz
5 total_of_special_requests -0.672 YeoJohnson_f7mCz
6 arrival_date_num -3.21 YeoJohnson_f7mCz
7 historical_adr -0.591 YeoJohnson_f7mCz
# original model
set.seed(9)
hotel_lm_wflow <- workflow() |>
add_recipe(hotel_rec) |>
add_model(linear_reg())
hotel_lm_res <- hotel_lm_wflow |>
fit_resamples(hotel_folds,
metrics = reg_metrics,
control = control_resamples(save_pred = TRUE)
)
collect_metrics(hotel_lm_res)
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 mae standard 17.1 10 0.176 Preprocessor1_Model1
2 rmse standard 22.8 10 0.296 Preprocessor1_Model1
3 rsq standard 0.881 10 0.00359 Preprocessor1_Model1
# model with transforms
set.seed(9)
hotel_lm_skew_wflow <- workflow() |>
add_recipe(hotel_yj_rec) |>
add_model(linear_reg())
hotel_lm_skew_res <- hotel_lm_skew_wflow |>
fit_resamples(hotel_folds,
metrics = reg_metrics,
control = control_resamples(save_pred = TRUE)
)
collect_metrics(hotel_lm_skew_res)
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 mae standard 18.9 10 0.261 Preprocessor1_Model1
2 rmse standard 25.3 10 0.374 Preprocessor1_Model1
3 rsq standard 0.853 10 0.00384 Preprocessor1_Model1
Spline functions
Your turn: Implement a natural spline for lead_time
and historical_adr
. Use grid tuning to determine the optimal value for deg_free
. Evaluate the model’s performance.
ns_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
step_ns(lead_time, historical_adr, deg_free = tune()) |>
step_novel(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors())
ns_wf <- workflow() |>
add_recipe(ns_rec) |>
add_model(linear_reg())
ns_res <- ns_wf |>
tune_grid(
hotel_folds,
grid = 10,
metrics = reg_metrics
)
autoplot(ns_res)
show_best(ns_res, metric = "mae")
# A tibble: 5 × 7
deg_free .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 13 mae standard 16.4 10 0.261 Preprocessor9_Model1
2 14 mae standard 16.4 10 0.253 Preprocessor5_Model1
3 11 mae standard 16.4 10 0.251 Preprocessor2_Model1
4 10 mae standard 16.5 10 0.239 Preprocessor6_Model1
5 9 mae standard 16.6 10 0.231 Preprocessor8_Model1
MARS model
Your turn: Implement a MARS model. Use grid tuning to determine the optimal value for num_terms
and prod_degree
. Evaluate the model’s performance.
# recipe
mars_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
step_novel(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors())
# model specification
mars_spec <- mars(mode = "regression", num_terms = tune(), prod_degree = tune())
# workflow
mars_wf <- workflow() |>
add_recipe(mars_rec) |>
add_model(mars_spec)
# space filling grid
set.seed(749)
mars_params <- extract_parameter_set_dials(mars_wf) |>
grid_space_filling(size = 10)
# tune the model
mars_res <- mars_wf |>
tune_grid(
hotel_folds,
grid = mars_params,
metrics = reg_metrics
)
autoplot(mars_res)
show_best(mars_res, metric = "mae")
# A tibble: 5 × 8
num_terms prod_degree .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 2 mae standard 24.8 10 0.294 Preprocessor1_Mo…
2 5 1 mae standard 25.8 10 0.289 Preprocessor1_Mo…
3 4 2 mae standard 26.7 10 0.387 Preprocessor1_Mo…
4 3 2 mae standard 28.1 10 0.377 Preprocessor1_Mo…
5 3 1 mae standard 28.8 10 0.349 Preprocessor1_Mo…
Session information
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.4.1 (2024-06-14)
os macOS Sonoma 14.6.1
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2024-09-18
pandoc 3.3 @ /usr/local/bin/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
backports 1.5.0 2024-05-23 [1] CRAN (R 4.4.0)
broom * 1.0.6 2024-05-17 [1] CRAN (R 4.4.0)
class 7.3-22 2023-05-03 [1] CRAN (R 4.4.0)
cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0)
codetools 0.2-20 2024-03-31 [1] CRAN (R 4.4.1)
colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.4.0)
data.table 1.15.4 2024-03-30 [1] CRAN (R 4.3.1)
dials * 1.3.0 2024-07-30 [1] CRAN (R 4.4.0)
DiceDesign 1.10 2023-12-07 [1] CRAN (R 4.3.1)
digest 0.6.35 2024-03-11 [1] CRAN (R 4.3.1)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
earth * 5.3.3 2024-02-26 [1] CRAN (R 4.4.0)
ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.3.0)
evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.4.0)
fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.0)
Formula * 1.2-5 2023-02-24 [1] CRAN (R 4.3.0)
furrr 0.3.1 2022-08-15 [1] CRAN (R 4.3.0)
future 1.33.2 2024-03-26 [1] CRAN (R 4.3.1)
future.apply 1.11.2 2024-03-28 [1] CRAN (R 4.3.1)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
GGally * 2.2.1 2024-02-14 [1] CRAN (R 4.4.0)
ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
ggstats 0.6.0 2024-04-05 [1] CRAN (R 4.4.0)
globals 0.16.3 2024-03-08 [1] CRAN (R 4.3.1)
glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1)
gower 1.0.1 2022-12-22 [1] CRAN (R 4.3.0)
GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.3.0)
gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1)
hardhat 1.4.0 2024-06-02 [1] CRAN (R 4.4.0)
here 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.1)
infer * 1.0.7 2024-03-25 [1] CRAN (R 4.3.1)
ipred 0.9-14 2023-03-09 [1] CRAN (R 4.3.0)
iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.0)
jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
knitr 1.47 2024-05-29 [1] CRAN (R 4.4.0)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0)
lattice 0.22-6 2024-03-20 [1] CRAN (R 4.4.0)
lava 1.8.0 2024-03-05 [1] CRAN (R 4.3.1)
lhs 1.1.6 2022-12-17 [1] CRAN (R 4.3.0)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
listenv 0.9.1 2024-01-29 [1] CRAN (R 4.3.1)
lubridate 1.9.3 2023-09-27 [1] CRAN (R 4.3.1)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
MASS 7.3-61 2024-06-13 [1] CRAN (R 4.4.0)
Matrix 1.7-0 2024-03-22 [1] CRAN (R 4.4.0)
modeldata * 1.4.0 2024-06-19 [1] CRAN (R 4.4.0)
munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1)
nnet 7.3-19 2023-05-03 [1] CRAN (R 4.4.0)
parallelly 1.37.1 2024-02-29 [1] CRAN (R 4.3.1)
parsnip * 1.2.1 2024-03-22 [1] CRAN (R 4.3.1)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
plotmo * 3.6.3 2024-02-26 [1] CRAN (R 4.4.0)
plotrix * 3.8-4 2023-11-10 [1] CRAN (R 4.3.1)
plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1)
prodlim 2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.0)
Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.1)
recipes * 1.0.10 2024-02-18 [1] CRAN (R 4.3.1)
rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.3)
rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.4.0)
rpart 4.1.23 2023-12-05 [1] CRAN (R 4.4.0)
rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.3.1)
rsample * 1.2.1 2024-03-25 [1] CRAN (R 4.3.1)
rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1)
scales * 1.3.0 2023-11-28 [1] CRAN (R 4.4.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
sfd 0.1.0 2024-01-08 [1] CRAN (R 4.4.0)
survival 3.7-0 2024-06-05 [1] CRAN (R 4.4.0)
textrecipes * 1.0.6 2023-11-15 [1] CRAN (R 4.3.1)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidymodels * 1.2.0 2024-03-25 [1] CRAN (R 4.3.1)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.1)
timeDate 4032.109 2023-12-14 [1] CRAN (R 4.3.1)
tune * 1.2.1 2024-04-18 [1] CRAN (R 4.3.1)
utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
withr 3.0.1 2024-07-31 [1] CRAN (R 4.4.0)
workflows * 1.1.4 2024-02-19 [1] CRAN (R 4.3.1)
workflowsets * 1.1.0 2024-03-21 [1] CRAN (R 4.3.1)
xfun 0.45 2024-06-16 [1] CRAN (R 4.4.0)
yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.1)
yardstick * 1.3.1 2024-03-21 [1] CRAN (R 4.3.1)
[1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
──────────────────────────────────────────────────────────────────────────────