AE 04: Predicting hotel price

Suggested answers

Application exercise
Answers
Modified

September 13, 2024

Setup

── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.6      ✔ recipes      1.0.10
✔ dials        1.3.0      ✔ rsample      1.2.1 
✔ dplyr        1.1.4      ✔ tibble       3.2.1 
✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
✔ infer        1.0.7      ✔ tune         1.2.1 
✔ modeldata    1.4.0      ✔ workflows    1.1.4 
✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
✔ purrr        1.0.2      ✔ yardstick    1.3.1 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::%||%()    masks base::%||%()
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(textrecipes)

reg_metrics <- metric_set(mae, rsq)

data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |> 
  sample_n(5000) |> 
  arrange(arrival_date) |> 
  select(-arrival_date) |> 
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

set.seed(4028)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

Explore the data

Your turn: Investigate the training data. The outcome is avg_price_per_room. What trends or patterns do you see?

library(skimr)
skim(hotel_train)
Data summary
Name hotel_train
Number of rows 3749
Number of columns 27
_______________________
Column type frequency:
factor 9
numeric 18
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
meal 0 1 FALSE 4 bed: 2865, bre: 743, no_: 116, bre: 25
country 0 1 FALSE 66 prt: 1206, gbr: 814, esp: 353, irl: 216
market_segment 0 1 FALSE 5 onl: 1617, dir: 756, off: 736, gro: 416
distribution_channel 0 1 FALSE 3 ta_: 2616, dir: 812, cor: 321, und: 0
reserved_room_type 0 1 FALSE 7 a: 2034, d: 787, e: 500, g: 138
assigned_room_type 0 1 FALSE 9 a: 1395, d: 1066, e: 562, c: 250
agent 0 1 FALSE 98 dev: 1132, not: 834, ale: 360, cha: 208
company 0 1 FALSE 100 not: 3416, par: 83, lin: 24, ber: 14
customer_type 0 1 FALSE 4 tra: 2699, tra: 811, con: 207, gro: 32

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
avg_price_per_room 0 1 105.62 66.08 19.35 55.00 83.00 142.76 426.25 ▇▃▂▁▁
lead_time 0 1 88.65 101.77 0.00 7.00 43.00 152.00 542.00 ▇▂▁▁▁
stays_in_weekend_nights 0 1 1.17 1.14 0.00 0.00 1.00 2.00 13.00 ▇▁▁▁▁
stays_in_week_nights 0 1 3.08 2.41 0.00 1.00 3.00 5.00 32.00 ▇▁▁▁▁
adults 0 1 1.85 0.47 1.00 2.00 2.00 2.00 4.00 ▂▇▁▁▁
children 0 1 0.13 0.43 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
babies 0 1 0.01 0.12 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
is_repeated_guest 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
previous_cancellations 0 1 0.01 0.11 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.25 1.37 0.00 0.00 0.00 0.00 29.00 ▇▁▁▁▁
booking_changes 0 1 0.36 0.77 0.00 0.00 0.00 0.00 7.00 ▇▁▁▁▁
days_in_waiting_list 0 1 0.57 7.76 0.00 0.00 0.00 0.00 125.00 ▇▁▁▁▁
required_car_parking_spaces 0 1 0.20 0.42 0.00 0.00 0.00 0.00 8.00 ▇▁▁▁▁
total_of_special_requests 0 1 0.74 0.86 0.00 0.00 1.00 1.00 5.00 ▇▂▁▁▁
arrival_date_num 0 1 2017.08 0.33 2016.50 2016.80 2017.09 2017.36 2017.66 ▇▇▇▇▇
near_christmas 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
near_new_years 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
historical_adr 0 1 86.64 39.48 41.96 52.27 71.35 116.78 167.49 ▇▃▂▂▂
ggplot(data = hotel_train, mapping = aes(x = avg_price_per_room)) +
  geom_histogram(color = "white", binwidth = 20, boundary = 0)

ggplot(data = hotel_train, mapping = aes(x = lead_time)) +
  geom_histogram(color = "white")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = hotel_train, mapping = aes(x = lead_time)) +
  geom_histogram(color = "white") +
  scale_x_continuous(transform = boxcox_trans(p = 0.4))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = hotel_train, mapping = aes(x = historical_adr, y = avg_price_per_room)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

hotel_train |>
  pivot_longer(
    cols = starts_with("near"),
    names_to = "holiday",
    values_to = "value"
  ) |>
  ggplot(mapping = aes(
    x = factor(value),
    y = avg_price_per_room
  )) +
  geom_boxplot() +
  facet_wrap(facets = vars(holiday), ncol = 1)

ggplot(data = hotel_train, mapping = aes(x = company, y = avg_price_per_room)) +
  geom_boxplot()

  • The avg_price_per_room variable is right-skewed.
  • The lead_time variable is right-skewed.
  • The lead_time variable is right-skewed, but a Box-Cox transformation with \(\lambda = 0.4\) makes it more symmetric.
  • There is a positive relationship between historical_adr and avg_price_per_room.
  • The near variables show some differences in the distribution of avg_price_per_room.
  • The company variable shows some differences in the distribution of avg_price_per_room, but also a ton of possible values.

Resampling Strategy

set.seed(472)
hotel_rs <- vfold_cv(hotel_train, strata = avg_price_per_room)
hotel_rs
#  10-fold cross-validation using stratification 
# A tibble: 10 × 2
   splits             id    
   <list>             <chr> 
 1 <split [3372/377]> Fold01
 2 <split [3373/376]> Fold02
 3 <split [3373/376]> Fold03
 4 <split [3373/376]> Fold04
 5 <split [3373/376]> Fold05
 6 <split [3374/375]> Fold06
 7 <split [3375/374]> Fold07
 8 <split [3376/373]> Fold08
 9 <split [3376/373]> Fold09
10 <split [3376/373]> Fold10

A first recipe

hotel_rec <- recipe(avg_price_per_room ~ ., data = hotel_train)

summary(hotel_rec)
# A tibble: 27 × 4
   variable                type      role      source  
   <chr>                   <list>    <chr>     <chr>   
 1 lead_time               <chr [2]> predictor original
 2 stays_in_weekend_nights <chr [2]> predictor original
 3 stays_in_week_nights    <chr [2]> predictor original
 4 adults                  <chr [2]> predictor original
 5 children                <chr [2]> predictor original
 6 babies                  <chr [2]> predictor original
 7 meal                    <chr [3]> predictor original
 8 country                 <chr [3]> predictor original
 9 market_segment          <chr [3]> predictor original
10 distribution_channel    <chr [3]> predictor original
# ℹ 17 more rows

Your turn

What do you think are in the type vectors for the lead_time and country columns?

# add code here
summary(hotel_rec)$type[[1]]
[1] "double"  "numeric"
summary(hotel_rec)$type[[8]]
[1] "factor"    "unordered" "nominal"  

Add response here. Contains information on both the R data type as well as the substantive type of variable for {recipes} (e.g. numeric, nominal, ordinal).

Create a recipe

Your turn: Create a recipe() for the hotel data to:

  • use a Yeo-Johnson (YJ) transformation on lead_time
  • convert factors to indicator variables
  • remove zero-variance variables
  • add the spline technique shown previously
# add code here
hotel_indicators <- recipe(avg_price_per_room ~ ., data = hotel_train) |> 
  step_YeoJohnson(lead_time) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |> 
  step_spline_natural(arrival_date_num, deg_free = 10)

Measuring Performance

We’ll compute two measures, mean absolute error (MAE) and the coefficient of determination (a.k.a \(R^2\)), and focus on the MAE for parameter optimization.

reg_metrics <- metric_set(mae, rsq)

Your turn: Use fit_resamples() to fit your workflow with a recipe.

Collect the predictions from the results. How did you do?

set.seed(9)

# add code here
hotel_lm_wflow <- workflow() |>
  add_recipe(hotel_indicators) |>
  add_model(linear_reg())
 
ctrl <- control_resamples(save_pred = TRUE)
hotel_lm_res <- hotel_lm_wflow |>
  fit_resamples(hotel_rs, control = ctrl, metrics = reg_metrics)
→ A | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
There were issues with some computations   A: x1
There were issues with some computations   A: x6
There were issues with some computations   A: x9
collect_metrics(hotel_lm_res)
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 mae     standard   16.6      10 0.214   Preprocessor1_Model1
2 rsq     standard    0.884    10 0.00339 Preprocessor1_Model1

Fine. MAE is around €16. Not too bad in raw terms. High \(R^2\) value as well.

Holdout predictions

# Since we used `save_pred = TRUE`
lm_cv_pred <- collect_predictions(hotel_lm_res)
lm_cv_pred |> slice(1:7)
# A tibble: 7 × 5
  .pred id      .row avg_price_per_room .config             
  <dbl> <chr>  <int>              <dbl> <chr>               
1  75.1 Fold01    20                 40 Preprocessor1_Model1
2  49.3 Fold01    28                 54 Preprocessor1_Model1
3  64.9 Fold01    45                 50 Preprocessor1_Model1
4  52.8 Fold01    49                 42 Preprocessor1_Model1
5  48.6 Fold01    61                 49 Preprocessor1_Model1
6  29.8 Fold01    66                 40 Preprocessor1_Model1
7  36.9 Fold01    88                 49 Preprocessor1_Model1

Calibration Plot


Attaching package: 'probably'
The following objects are masked from 'package:base':

    as.factor, as.ordered
cal_plot_regression(hotel_lm_res)

Your turn: What does this plot tell us about the performance of our model?

Add response here. The model is mostly calibrated successfully against the true values, but does have a tendency to under-predict prices for true average rates above 200.

Acknowledgments

sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-09-13
 pandoc   3.3 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 backports      1.5.0      2024-05-23 [1] CRAN (R 4.4.0)
 base64enc      0.1-3      2015-07-28 [1] CRAN (R 4.3.0)
 broom        * 1.0.6      2024-05-17 [1] CRAN (R 4.4.0)
 class          7.3-22     2023-05-03 [1] CRAN (R 4.4.0)
 cli            3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 codetools      0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace     2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
 data.table     1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
 dials        * 1.3.0      2024-07-30 [1] CRAN (R 4.4.0)
 DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.3.1)
 digest         0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.3.0)
 evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
 fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 furrr          0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
 future         1.33.2     2024-03-26 [1] CRAN (R 4.3.1)
 future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.3.1)
 generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 globals        0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
 glue           1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
 gower          1.0.1      2022-12-22 [1] CRAN (R 4.3.0)
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.3.0)
 gtable         0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.4.0)
 here           1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 infer        * 1.0.7      2024-03-25 [1] CRAN (R 4.3.1)
 ipred          0.9-14     2023-03-09 [1] CRAN (R 4.3.0)
 iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
 knitr          1.47       2024-05-29 [1] CRAN (R 4.4.0)
 labeling       0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 lattice        0.22-6     2024-03-20 [1] CRAN (R 4.4.0)
 lava           1.8.0      2024-03-05 [1] CRAN (R 4.3.1)
 lhs            1.1.6      2022-12-17 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS           7.3-61     2024-06-13 [1] CRAN (R 4.4.0)
 Matrix         1.7-0      2024-03-22 [1] CRAN (R 4.4.0)
 mgcv           1.9-1      2023-12-21 [1] CRAN (R 4.4.0)
 modeldata    * 1.4.0      2024-06-19 [1] CRAN (R 4.4.0)
 munsell        0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 nlme           3.1-165    2024-06-06 [1] CRAN (R 4.4.0)
 nnet           7.3-19     2023-05-03 [1] CRAN (R 4.4.0)
 parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.3.1)
 parsnip      * 1.2.1      2024-03-22 [1] CRAN (R 4.3.1)
 pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 probably     * 1.0.3      2024-02-23 [1] CRAN (R 4.3.1)
 prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
 purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.3.1)
 recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
 repr           1.1.7      2024-03-22 [1] CRAN (R 4.4.0)
 rlang          1.1.4      2024-06-04 [1] CRAN (R 4.3.3)
 rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.4.0)
 rpart          4.1.23     2023-12-05 [1] CRAN (R 4.4.0)
 rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rsample      * 1.2.1      2024-03-25 [1] CRAN (R 4.3.1)
 rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 scales       * 1.3.0.9000 2024-05-07 [1] Github (r-lib/scales@c0f79d3)
 sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 skimr        * 2.1.5      2022-12-23 [1] CRAN (R 4.3.0)
 splines2     * 0.5.3      2024-07-08 [1] CRAN (R 4.4.0)
 stringi        1.8.4      2024-05-06 [1] CRAN (R 4.3.1)
 stringr        1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
 survival       3.7-0      2024-06-05 [1] CRAN (R 4.4.0)
 textrecipes  * 1.0.6      2023-11-15 [1] CRAN (R 4.3.1)
 tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidymodels   * 1.2.0      2024-03-25 [1] CRAN (R 4.3.1)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
 tune         * 1.2.1      2024-04-18 [1] CRAN (R 4.3.1)
 utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 withr          3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
 workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
 workflowsets * 1.1.0      2024-03-21 [1] CRAN (R 4.3.1)
 xfun           0.45       2024-06-16 [1] CRAN (R 4.4.0)
 yaml           2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
 yardstick    * 1.3.1      2024-03-21 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────