AE 06: Predicting hotel price with boosting models

Application exercise
Modified: September 19, 2024

Setup

library(tidymodels)

── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.6      ✔ recipes      1.0.10
✔ dials        1.3.0      ✔ rsample      1.2.1 
✔ dplyr        1.1.4      ✔ tibble       3.2.1 
✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
✔ infer        1.0.7      ✔ tune         1.2.1 
✔ modeldata    1.4.0      ✔ workflows    1.1.4 
✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
✔ purrr        1.0.2      ✔ yardstick    1.3.1 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::%||%()    masks base::%||%()
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.

library(textrecipes)  # provides step_dummy_hash()
library(bonsai)       # registers the lightgbm engine for boost_tree()
library(probably)     # provides cal_plot_regression()

Attaching package: 'probably'
The following objects are masked from 'package:base':

    as.factor, as.ordered
# metrics
reg_metrics <- metric_set(mae, rsq)

# import data
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
  sample_n(5000) |>
  arrange(arrival_date) |>
  select(-arrival_date) |>
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

# split into training/test sets
set.seed(421)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

# 10-fold CV
set.seed(531)
hotel_rs <- vfold_cv(hotel_train, strata = avg_price_per_room)

# feature engineering recipe
hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  # Defaults to 32 signed indicator columns
  step_dummy_hash(agent) |>
  step_dummy_hash(company) |>
  # Regular indicators for the others
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())
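
To see what the hashing steps produce, one optional check (a sketch using only the objects defined above) is to prep the recipe and bake the training set:

# optional sanity check: estimate the recipe and inspect the
# resulting feature columns, including the hashed indicators
hash_rec |>
  prep() |>
  bake(new_data = NULL) |>
  glimpse()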

Boosting model specification

Redefine the recipe so that the number of hashing terms for agent and company is tunable, then specify a lightgbm boosted tree model with trees and learn_rate marked for tuning.

hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  step_dummy_hash(agent, num_terms = tune("agent hash")) |>
  step_dummy_hash(company, num_terms = tune("company hash")) |>
  step_zv(all_predictors())
lgbm_spec <- boost_tree(trees = tune(), learn_rate = tune()) |>
  set_mode("regression") |>
  set_engine("lightgbm")

lgbm_wflow <- workflow(hash_rec, lgbm_spec)

Create a grid

Demonstration: Create a space-filling grid for the boosting workflow.

set.seed(12)
grid <- lgbm_wflow |>
  extract_parameter_set_dials() |>
  grid_space_filling(size = 25)

grid
# A tibble: 25 × 4
   trees learn_rate `agent hash` `company hash`
   <int>      <dbl>        <int>          <int>
 1     1   7.50e- 6          574            574
 2    84   1.78e- 5         2048           2298
 3   167   5.62e-10         1824            912
 4   250   4.22e- 5         3250            512
 5   334   1.78e- 8          512           2896
 6   417   1.33e- 3          322           1625
 7   500   1   e- 1         1448           1149
 8   584   1   e- 7         1290            256
 9   667   2.37e-10          456            724
10   750   1.78e- 2          645            322
# ℹ 15 more rows

Your turn: Try creating a regular grid for the boosting workflow.

# add code here
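
One possible approach (a sketch; reg_grid is a name chosen here for illustration, not from the original exercise):

# regular grid: crosses a fixed number of levels of each parameter
set.seed(12)
reg_grid <- lgbm_wflow |>
  extract_parameter_set_dials() |>
  grid_regular(levels = 3)

reg_grid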

Your turn: What advantage would a regular grid have?

Add response here.
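
One possible response: a regular grid crosses every combination of parameter values, which makes it easy to see the marginal effect of any single parameter on performance. For boosting specifically, tidymodels can use the submodel trick to evaluate many values of trees from a single fit, so a regular grid over trees is relatively cheap.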

Update parameter ranges

lgbm_param <- lgbm_wflow |>
  extract_parameter_set_dials() |>
  update(
    trees = trees(c(1L, 100L)),
    learn_rate = learn_rate(c(-5, -1))
  )

set.seed(712)
grid <- lgbm_param |>
  grid_space_filling(size = 25)

grid
# A tibble: 25 × 4
   trees learn_rate `agent hash` `company hash`
   <int>      <dbl>        <int>          <int>
 1     1  0.00147            574            574
 2     5  0.00215           2048           2298
 3     9  0.0000215         1824            912
 4    13  0.00316           3250            512
 5    17  0.0001             512           2896
 6    21  0.0147             322           1625
 7    25  0.1               1448           1149
 8    29  0.000215          1290            256
 9    34  0.0000147          456            724
10    38  0.0464             645            322
# ℹ 15 more rows

Choose a parameter combination
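
The tuning run that creates lgbm_res is not shown in this document. It was presumably produced by something like the sketch below; note that the results also report min_n, so the boosting spec used for this run presumably included min_n = tune() as well.

# sketch of the unshown tuning run that produces lgbm_res
lgbm_res <- lgbm_wflow |>
  tune_grid(
    resamples = hotel_rs,
    grid = grid,
    metrics = reg_metrics
  )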

show_best(lgbm_res, metric = "rsq")
# A tibble: 5 × 11
  trees min_n learn_rate `agent hash` `company hash` .metric .estimator  mean
  <int> <int>      <dbl>        <int>          <int> <chr>   <chr>      <dbl>
1  1890    10    0.0159           115            174 rsq     standard   0.950
2   774    12    0.0441            27             95 rsq     standard   0.949
3  1638    36    0.0409            15            120 rsq     standard   0.948
4   963    23    0.00556          157             13 rsq     standard   0.937
5   590     5    0.00320           85             73 rsq     standard   0.911
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>
show_best(lgbm_res, metric = "mae")
# A tibble: 5 × 11
  trees min_n learn_rate `agent hash` `company hash` .metric .estimator  mean
  <int> <int>      <dbl>        <int>          <int> <chr>   <chr>      <dbl>
1  1890    10    0.0159           115            174 mae     standard    9.80
2   774    12    0.0441            27             95 mae     standard    9.86
3  1638    36    0.0409            15            120 mae     standard   10.0 
4   963    23    0.00556          157             13 mae     standard   11.4 
5   590     5    0.00320           85             73 mae     standard   17.4 
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>
lgbm_best <- select_best(lgbm_res, metric = "mae")
lgbm_best
# A tibble: 1 × 6
  trees min_n learn_rate `agent hash` `company hash` .config              
  <int> <int>      <dbl>        <int>          <int> <chr>                
1  1890    10     0.0159          115            174 Preprocessor12_Model1
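
Once a combination is chosen, a common next step (a sketch, not part of the original output) is to splice it back into the workflow:

# finalize the workflow with the selected parameters
lgbm_final_wflow <- lgbm_wflow |>
  finalize_workflow(lgbm_best)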

Checking calibration

lgbm_res |>
  collect_predictions(
    parameters = lgbm_best
  ) |>
  cal_plot_regression(
    truth = avg_price_per_room,
    estimate = .pred
  )
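
For reference: cal_plot_regression() plots the observed outcome against the predictions, so a well-calibrated model's points should fall close to the diagonal line.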

Tune on stop_iter

Your turn: Try early stopping: set trees = 2000 and tune the stop_iter parameter.

Note that you will need to regenerate lgbm_param with your new workflow!

# add code here
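
One way to set this up (a sketch; lgbm_stop_spec, lgbm_stop_wflow, and the validation engine argument are names and assumptions for illustration, not code from the exercise):

# sketch: fix trees at 2000 and tune the early-stopping patience
lgbm_stop_spec <- boost_tree(
  trees = 2000,
  learn_rate = tune(),
  stop_iter = tune()
) |>
  set_mode("regression") |>
  # early stopping needs held-out data; bonsai's lightgbm engine can
  # set aside a fraction of the training set for this purpose
  set_engine("lightgbm", validation = 0.1)

lgbm_stop_wflow <- workflow(hash_rec, lgbm_stop_spec)

# regenerate the parameter set for the new workflow
lgbm_stop_param <- lgbm_stop_wflow |>
  extract_parameter_set_dials()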

Acknowledgments