AE 05: Predicting hotel price (with numeric engineering!)

Suggested answers

Application exercise
Answers
Modified

September 17, 2024

Setup

reg_metrics <- metric_set(mae, rmse, rsq)

data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |> 
  sample_n(5000) |> 
  arrange(arrival_date) |> 
  select(-arrival_date) |> 
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

set.seed(4028)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)
set.seed(472)
hotel_folds <- vfold_cv(hotel_train, strata = avg_price_per_room)
hotel_folds
#  10-fold cross-validation using stratification 
# A tibble: 10 × 2
   splits             id    
   <list>             <chr> 
 1 <split [3372/377]> Fold01
 2 <split [3373/376]> Fold02
 3 <split [3373/376]> Fold03
 4 <split [3373/376]> Fold04
 5 <split [3373/376]> Fold05
 6 <split [3374/375]> Fold06
 7 <split [3375/374]> Fold07
 8 <split [3376/373]> Fold08
 9 <split [3376/373]> Fold09
10 <split [3376/373]> Fold10

Adjust for skewness

Your turn: Examine hotel_train and identify a numeric predictor that is skewed. Incorporate an appropriate transformation into the recipe below and estimate a linear regression model using 10-fold cross-validation. How does the model perform with and without the transformation?

hotel_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())
Tip

Use GGally::ggpairs() to generate bivariate comparisons for all variables in your dataset.

hotel_train |>
  select(where(is.numeric)) |>
  ggpairs()

# try Yeo-Johnson with all numeric predictors
hotel_yj_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())

# check the recipe
hotel_yj_rec |>
  prep() |>
  tidy(number = 1)
# A tibble: 7 × 3
  terms                        value id              
  <chr>                        <dbl> <chr>           
1 lead_time                  0.173   YeoJohnson_f7mCz
2 stays_in_weekend_nights    0.00547 YeoJohnson_f7mCz
3 stays_in_week_nights       0.121   YeoJohnson_f7mCz
4 booking_changes           -4.16    YeoJohnson_f7mCz
5 total_of_special_requests -0.672   YeoJohnson_f7mCz
6 arrival_date_num          -3.21    YeoJohnson_f7mCz
7 historical_adr            -0.591   YeoJohnson_f7mCz
# original model
set.seed(9)

hotel_lm_wflow <- workflow() |>
  add_recipe(hotel_rec) |>
  add_model(linear_reg())

hotel_lm_res <- hotel_lm_wflow |>
  fit_resamples(hotel_folds,
    metrics = reg_metrics,
    control = control_resamples(save_pred = TRUE)
  )

collect_metrics(hotel_lm_res)
# A tibble: 3 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 mae     standard   17.1      10 0.176   Preprocessor1_Model1
2 rmse    standard   22.8      10 0.296   Preprocessor1_Model1
3 rsq     standard    0.881    10 0.00359 Preprocessor1_Model1
# model with transforms
set.seed(9)

hotel_lm_skew_wflow <- workflow() |>
  add_recipe(hotel_yj_rec) |>
  add_model(linear_reg())

hotel_lm_skew_res <- hotel_lm_skew_wflow |>
  fit_resamples(hotel_folds,
    metrics = reg_metrics,
    control = control_resamples(save_pred = TRUE)
  )

collect_metrics(hotel_lm_skew_res)
# A tibble: 3 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 mae     standard   18.9      10 0.261   Preprocessor1_Model1
2 rmse    standard   25.3      10 0.374   Preprocessor1_Model1
3 rsq     standard    0.853    10 0.00384 Preprocessor1_Model1

Spline functions

Your turn: Implement a natural spline for lead_time and historical_adr. Use grid tuning to determine the optimal value for deg_free. Evaluate the model’s performance.

ns_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_ns(lead_time, historical_adr, deg_free = tune()) |>
  step_novel(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors())

ns_wf <- workflow() |>
  add_recipe(ns_rec) |>
  add_model(linear_reg())

ns_res <- ns_wf |>
  tune_grid(
    hotel_folds,
    grid = 10,
    metrics = reg_metrics
  )
autoplot(ns_res)

show_best(ns_res, metric = "mae")
# A tibble: 5 × 7
  deg_free .metric .estimator  mean     n std_err .config             
     <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1       13 mae     standard    16.4    10   0.261 Preprocessor9_Model1
2       14 mae     standard    16.4    10   0.253 Preprocessor5_Model1
3       11 mae     standard    16.4    10   0.251 Preprocessor2_Model1
4       10 mae     standard    16.5    10   0.239 Preprocessor6_Model1
5        9 mae     standard    16.6    10   0.231 Preprocessor8_Model1

MARS model

Your turn: Implement a MARS model. Use grid tuning to determine the optimal value for num_terms and prod_degree. Evaluate the model’s performance.

# recipe
mars_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_novel(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors())

# model specification
mars_spec <- mars(mode = "regression", num_terms = tune(), prod_degree = tune())

# workflow
mars_wf <- workflow() |>
  add_recipe(mars_rec) |>
  add_model(mars_spec)

# space filling grid
set.seed(749)
mars_params <- extract_parameter_set_dials(mars_wf) |>
  grid_space_filling(size = 10)

# tune the model
mars_res <- mars_wf |>
  tune_grid(
    hotel_folds,
    grid = mars_params,
    metrics = reg_metrics
  )
autoplot(mars_res)

show_best(mars_res, metric = "mae")
# A tibble: 5 × 8
  num_terms prod_degree .metric .estimator  mean     n std_err .config          
      <int>       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>            
1         5           2 mae     standard    24.8    10   0.294 Preprocessor1_Mo…
2         5           1 mae     standard    25.8    10   0.289 Preprocessor1_Mo…
3         4           2 mae     standard    26.7    10   0.387 Preprocessor1_Mo…
4         3           2 mae     standard    28.1    10   0.377 Preprocessor1_Mo…
5         3           1 mae     standard    28.8    10   0.349 Preprocessor1_Mo…
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-09-18
 pandoc   3.3 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 backports      1.5.0      2024-05-23 [1] CRAN (R 4.4.0)
 broom        * 1.0.6      2024-05-17 [1] CRAN (R 4.4.0)
 class          7.3-22     2023-05-03 [1] CRAN (R 4.4.0)
 cli            3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 codetools      0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace     2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
 data.table     1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
 dials        * 1.3.0      2024-07-30 [1] CRAN (R 4.4.0)
 DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.3.1)
 digest         0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 earth        * 5.3.3      2024-02-26 [1] CRAN (R 4.4.0)
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.3.0)
 evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
 fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 Formula      * 1.2-5      2023-02-24 [1] CRAN (R 4.3.0)
 furrr          0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
 future         1.33.2     2024-03-26 [1] CRAN (R 4.3.1)
 future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.3.1)
 generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 GGally       * 2.2.1      2024-02-14 [1] CRAN (R 4.4.0)
 ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 ggstats        0.6.0      2024-04-05 [1] CRAN (R 4.4.0)
 globals        0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
 glue           1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
 gower          1.0.1      2022-12-22 [1] CRAN (R 4.3.0)
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.3.0)
 gtable         0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.4.0)
 here           1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 infer        * 1.0.7      2024-03-25 [1] CRAN (R 4.3.1)
 ipred          0.9-14     2023-03-09 [1] CRAN (R 4.3.0)
 iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
 knitr          1.47       2024-05-29 [1] CRAN (R 4.4.0)
 labeling       0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 lattice        0.22-6     2024-03-20 [1] CRAN (R 4.4.0)
 lava           1.8.0      2024-03-05 [1] CRAN (R 4.3.1)
 lhs            1.1.6      2022-12-17 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS           7.3-61     2024-06-13 [1] CRAN (R 4.4.0)
 Matrix         1.7-0      2024-03-22 [1] CRAN (R 4.4.0)
 modeldata    * 1.4.0      2024-06-19 [1] CRAN (R 4.4.0)
 munsell        0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 nnet           7.3-19     2023-05-03 [1] CRAN (R 4.4.0)
 parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.3.1)
 parsnip      * 1.2.1      2024-03-22 [1] CRAN (R 4.3.1)
 pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 plotmo       * 3.6.3      2024-02-26 [1] CRAN (R 4.4.0)
 plotrix      * 3.8-4      2023-11-10 [1] CRAN (R 4.3.1)
 plyr           1.8.9      2023-10-02 [1] CRAN (R 4.3.1)
 prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
 purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 RColorBrewer   1.1-3      2022-04-03 [1] CRAN (R 4.3.0)
 Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.3.1)
 recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
 rlang          1.1.4      2024-06-04 [1] CRAN (R 4.3.3)
 rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.4.0)
 rpart          4.1.23     2023-12-05 [1] CRAN (R 4.4.0)
 rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rsample      * 1.2.1      2024-03-25 [1] CRAN (R 4.3.1)
 rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 scales       * 1.3.0      2023-11-28 [1] CRAN (R 4.4.0)
 sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 sfd            0.1.0      2024-01-08 [1] CRAN (R 4.4.0)
 survival       3.7-0      2024-06-05 [1] CRAN (R 4.4.0)
 textrecipes  * 1.0.6      2023-11-15 [1] CRAN (R 4.3.1)
 tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidymodels   * 1.2.0      2024-03-25 [1] CRAN (R 4.3.1)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
 tune         * 1.2.1      2024-04-18 [1] CRAN (R 4.3.1)
 utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 withr          3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
 workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
 workflowsets * 1.1.0      2024-03-21 [1] CRAN (R 4.3.1)
 xfun           0.45       2024-06-16 [1] CRAN (R 4.4.0)
 yaml           2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
 yardstick    * 1.3.1      2024-03-21 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────