AE 06: Predicting hotel price with boosting models

Application exercise
Modified: September 19, 2024

Setup

library(tidymodels)

── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.6      ✔ recipes      1.0.10
✔ dials        1.3.0      ✔ rsample      1.2.1 
✔ dplyr        1.1.4      ✔ tibble       3.2.1 
✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
✔ infer        1.0.7      ✔ tune         1.2.1 
✔ modeldata    1.4.0      ✔ workflows    1.1.4 
✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
✔ purrr        1.0.2      ✔ yardstick    1.3.1 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::%||%()    masks base::%||%()
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.

library(textrecipes)  # provides step_dummy_hash()
library(bonsai)       # registers the lightgbm engine for boost_tree()
library(probably)     # provides cal_plot_regression()

Attaching package: 'probably'
The following objects are masked from 'package:base':

    as.factor, as.ordered
# metrics
reg_metrics <- metric_set(mae, rsq)

# import data
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
  sample_n(5000) |>
  arrange(arrival_date) |>
  select(-arrival_date) |>
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

# split into training/test sets
set.seed(421)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

# 10-fold CV
set.seed(531)
hotel_rs <- vfold_cv(hotel_train, strata = avg_price_per_room)

# feature engineering recipe
hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  # Defaults to 32 signed indicator columns
  step_dummy_hash(agent) |>
  step_dummy_hash(company) |>
  # Regular indicators for the others
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())
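
To see what the hashing steps produce, one optional check (a sketch using only the objects defined above) is to prep the recipe and bake the training set:

# optional sanity check: estimate the recipe and inspect the
# resulting feature columns, including the hashed indicators
hash_rec |>
  prep() |>
  bake(new_data = NULL) |>
  glimpse()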

Boosting model specification

Redefine the recipe so that the number of hashing terms for agent and company is tunable, then specify a lightgbm boosted tree model with trees and learn_rate marked for tuning.

hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  step_dummy_hash(agent, num_terms = tune("agent hash")) |>
  step_dummy_hash(company, num_terms = tune("company hash")) |>
  step_zv(all_predictors())
lgbm_spec <- boost_tree(trees = tune(), learn_rate = tune()) |>
  set_mode("regression") |>
  set_engine("lightgbm")

lgbm_wflow <- workflow(hash_rec, lgbm_spec)

Create a grid

Demonstration: Create a space-filling grid for the boosting workflow.

set.seed(12)
grid <- lgbm_wflow |>
  extract_parameter_set_dials() |>
  grid_space_filling(size = 25)

grid
# A tibble: 25 × 4
   trees learn_rate `agent hash` `company hash`
   <int>      <dbl>        <int>          <int>
 1     1   7.50e- 6          574            574
 2    84   1.78e- 5         2048           2298
 3   167   5.62e-10         1824            912
 4   250   4.22e- 5         3250            512
 5   334   1.78e- 8          512           2896
 6   417   1.33e- 3          322           1625
 7   500   1   e- 1         1448           1149
 8   584   1   e- 7         1290            256
 9   667   2.37e-10          456            724
10   750   1.78e- 2          645            322
# ℹ 15 more rows

Your turn: Try creating a regular grid for the boosting workflow.

# add code here
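
One possible approach (a sketch; reg_grid is a name chosen here for illustration, not from the original exercise):

# regular grid: crosses a fixed number of levels of each parameter
set.seed(12)
reg_grid <- lgbm_wflow |>
  extract_parameter_set_dials() |>
  grid_regular(levels = 3)

reg_grid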

Your turn: What advantage would a regular grid have?

Add response here.
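
One possible response: a regular grid crosses every combination of parameter values, which makes it easy to see the marginal effect of any single parameter on performance. For boosting specifically, tidymodels can use the submodel trick to evaluate many values of trees from a single fit, so a regular grid over trees is relatively cheap.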

Update parameter ranges

lgbm_param <- lgbm_wflow |>
  extract_parameter_set_dials() |>
  update(
    trees = trees(c(1L, 100L)),
    learn_rate = learn_rate(c(-5, -1))
  )

set.seed(712)
grid <- lgbm_param |>
  grid_space_filling(size = 25)

grid
# A tibble: 25 × 4
   trees learn_rate `agent hash` `company hash`
   <int>      <dbl>        <int>          <int>
 1     1  0.00147            574            574
 2     5  0.00215           2048           2298
 3     9  0.0000215         1824            912
 4    13  0.00316           3250            512
 5    17  0.0001             512           2896
 6    21  0.0147             322           1625
 7    25  0.1               1448           1149
 8    29  0.000215          1290            256
 9    34  0.0000147          456            724
10    38  0.0464             645            322
# ℹ 15 more rows

Choose a parameter combination
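
The tuning run that creates lgbm_res is not shown in this document. It was presumably produced by something like the sketch below; note that the results also report min_n, so the boosting spec used for this run presumably included min_n = tune() as well.

# sketch of the unshown tuning run that produces lgbm_res
lgbm_res <- lgbm_wflow |>
  tune_grid(
    resamples = hotel_rs,
    grid = grid,
    metrics = reg_metrics
  )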

show_best(lgbm_res, metric = "rsq")
# A tibble: 5 × 11
  trees min_n learn_rate `agent hash` `company hash` .metric .estimator  mean
  <int> <int>      <dbl>        <int>          <int> <chr>   <chr>      <dbl>
1  1890    10    0.0159           115            174 rsq     standard   0.950
2   774    12    0.0441            27             95 rsq     standard   0.949
3  1638    36    0.0409            15            120 rsq     standard   0.948
4   963    23    0.00556          157             13 rsq     standard   0.937
5   590     5    0.00320           85             73 rsq     standard   0.911
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>
show_best(lgbm_res, metric = "mae")
# A tibble: 5 × 11
  trees min_n learn_rate `agent hash` `company hash` .metric .estimator  mean
  <int> <int>      <dbl>        <int>          <int> <chr>   <chr>      <dbl>
1  1890    10    0.0159           115            174 mae     standard    9.80
2   774    12    0.0441            27             95 mae     standard    9.86
3  1638    36    0.0409            15            120 mae     standard   10.0 
4   963    23    0.00556          157             13 mae     standard   11.4 
5   590     5    0.00320           85             73 mae     standard   17.4 
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>
lgbm_best <- select_best(lgbm_res, metric = "mae")
lgbm_best
# A tibble: 1 × 6
  trees min_n learn_rate `agent hash` `company hash` .config              
  <int> <int>      <dbl>        <int>          <int> <chr>                
1  1890    10     0.0159          115            174 Preprocessor12_Model1
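
Once a combination is chosen, a common next step (a sketch, not part of the original output) is to splice it back into the workflow:

# finalize the workflow with the selected parameters
lgbm_final_wflow <- lgbm_wflow |>
  finalize_workflow(lgbm_best)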

Checking calibration

lgbm_res |>
  collect_predictions(
    parameters = lgbm_best
  ) |>
  cal_plot_regression(
    truth = avg_price_per_room,
    estimate = .pred
  )
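
For reference: cal_plot_regression() plots the observed outcome against the predictions, so a well-calibrated model's points should fall close to the diagonal line.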

Tune on stop_iter

Your turn: Try early stopping: set trees = 2000 and tune the stop_iter parameter.

Note that you will need to regenerate lgbm_param with your new workflow!

# add code here
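
One way to set this up (a sketch; lgbm_stop_spec, lgbm_stop_wflow, and the validation engine argument are names and assumptions for illustration, not code from the exercise):

# sketch: fix trees at 2000 and tune the early-stopping patience
lgbm_stop_spec <- boost_tree(
  trees = 2000,
  learn_rate = tune(),
  stop_iter = tune()
) |>
  set_mode("regression") |>
  # early stopping needs held-out data; bonsai's lightgbm engine can
  # set aside a fraction of the training set for this purpose
  set_engine("lightgbm", validation = 0.1)

lgbm_stop_wflow <- workflow(hash_rec, lgbm_stop_spec)

# regenerate the parameter set for the new workflow
lgbm_stop_param <- lgbm_stop_wflow |>
  extract_parameter_set_dials()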

Acknowledgments