AE 04: Predicting hotel price

Setup

library(tidymodels)

── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

✔ broom        1.0.6      ✔ recipes      1.0.10
✔ dials        1.3.0      ✔ rsample      1.2.1 
✔ dplyr        1.1.4      ✔ tibble       3.2.1 
✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
✔ infer        1.0.7      ✔ tune         1.2.1 
✔ modeldata    1.4.0      ✔ workflows    1.1.4 
✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
✔ purrr        1.0.2      ✔ yardstick    1.3.1

── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::%||%()    masks base::%||%()
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.

library(textrecipes)

reg_metrics <- metric_set(mae, rsq)

data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |> 
  sample_n(5000) |> 
  arrange(arrival_date) |> 
  select(-arrival_date) |> 
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

set.seed(4028)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

Explore the data

Your turn: Investigate the training data. The outcome is avg_price_per_room. What trends or patterns do you see?

library(skimr)
skim(hotel_train)

Data summary
Name	hotel_train
Number of rows	3749
Number of columns	27
_______________________
Column type frequency:
factor	9
numeric	18
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
meal	1	FALSE	4	bed: 2865, bre: 743, no_: 116, bre: 25
country	1	FALSE	66	prt: 1206, gbr: 814, esp: 353, irl: 216
market_segment	1	FALSE	5	onl: 1617, dir: 756, off: 736, gro: 416
distribution_channel	1	FALSE	3	ta_: 2616, dir: 812, cor: 321, und: 0
reserved_room_type	1	FALSE	7	a: 2034, d: 787, e: 500, g: 138
assigned_room_type	1	FALSE	9	a: 1395, d: 1066, e: 562, c: 250
agent	1	FALSE	98	dev: 1132, not: 834, ale: 360, cha: 208
company	1	FALSE	100	not: 3416, par: 83, lin: 24, ber: 14
customer_type	1	FALSE	4	tra: 2699, tra: 811, con: 207, gro: 32

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
avg_price_per_room	1	105.62	66.08	19.35	55.00	83.00	142.76	426.25	▇▃▂▁▁
lead_time	1	88.65	101.77	0.00	7.00	43.00	152.00	542.00	▇▂▁▁▁
stays_in_weekend_nights	1	1.17	1.14	0.00	0.00	1.00	2.00	13.00	▇▁▁▁▁
stays_in_week_nights	1	3.08	2.41	0.00	1.00	3.00	5.00	32.00	▇▁▁▁▁
adults	1	1.85	0.47	1.00	2.00	2.00	2.00	4.00	▂▇▁▁▁
children	1	0.13	0.43	0.00	0.00	0.00	0.00	2.00	▇▁▁▁▁
babies	1	0.01	0.12	0.00	0.00	0.00	0.00	2.00	▇▁▁▁▁
is_repeated_guest	1	0.07	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
previous_cancellations	1	0.01	0.11	0.00	0.00	0.00	0.00	4.00	▇▁▁▁▁
previous_bookings_not_canceled	1	0.25	1.37	0.00	0.00	0.00	0.00	29.00	▇▁▁▁▁
booking_changes	1	0.36	0.77	0.00	0.00	0.00	0.00	7.00	▇▁▁▁▁
days_in_waiting_list	1	0.57	7.76	0.00	0.00	0.00	0.00	125.00	▇▁▁▁▁
required_car_parking_spaces	1	0.20	0.42	0.00	0.00	0.00	0.00	8.00	▇▁▁▁▁
total_of_special_requests	1	0.74	0.86	0.00	0.00	1.00	1.00	5.00	▇▂▁▁▁
arrival_date_num	1	2017.08	0.33	2016.50	2016.80	2017.09	2017.36	2017.66	▇▇▇▇▇
near_christmas	1	0.01	0.08	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
near_new_years	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
historical_adr	1	86.64	39.48	41.96	52.27	71.35	116.78	167.49	▇▃▂▂▂

ggplot(data = hotel_train, mapping = aes(x = avg_price_per_room)) +
  geom_histogram(color = "white", binwidth = 20, boundary = 0)

ggplot(data = hotel_train, mapping = aes(x = lead_time)) +
  geom_histogram(color = "white")

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = hotel_train, mapping = aes(x = lead_time)) +
  geom_histogram(color = "white") +
  scale_x_continuous(transform = boxcox_trans(p = 0.4))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = hotel_train, mapping = aes(x = historical_adr, y = avg_price_per_room)) +
  geom_point(alpha = 0.2) +
  geom_smooth()

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

hotel_train |>
  pivot_longer(
    cols = starts_with("near"),
    names_to = "holiday",
    values_to = "value"
  ) |>
  ggplot(mapping = aes(
    x = factor(value),
    y = avg_price_per_room
  )) +
  geom_boxplot() +
  facet_wrap(facets = vars(holiday), ncol = 1)

ggplot(data = hotel_train, mapping = aes(x = company, y = avg_price_per_room)) +
  geom_boxplot()

The avg_price_per_room variable is right-skewed.
The lead_time variable is right-skewed.
The lead_time variable is right-skewed, but a Box-Cox transformation with \(\lambda = 0.4\) makes it more symmetric.
There is a positive relationship between historical_adr and avg_price_per_room.
The near variables show some differences in the distribution of avg_price_per_room.
The company variable shows some differences in the distribution of avg_price_per_room, but also a ton of possible values.

Resampling Strategy

set.seed(472)
hotel_rs <- vfold_cv(hotel_train, strata = avg_price_per_room)
hotel_rs

#  10-fold cross-validation using stratification 
# A tibble: 10 × 2
   splits             id    
   <list>             <chr> 
 1 <split [3372/377]> Fold01
 2 <split [3373/376]> Fold02
 3 <split [3373/376]> Fold03
 4 <split [3373/376]> Fold04
 5 <split [3373/376]> Fold05
 6 <split [3374/375]> Fold06
 7 <split [3375/374]> Fold07
 8 <split [3376/373]> Fold08
 9 <split [3376/373]> Fold09
10 <split [3376/373]> Fold10

A first recipe

hotel_rec <- recipe(avg_price_per_room ~ ., data = hotel_train)

summary(hotel_rec)

# A tibble: 27 × 4
   variable                type      role      source  
   <chr>                   <list>    <chr>     <chr>   
 1 lead_time               <chr [2]> predictor original
 2 stays_in_weekend_nights <chr [2]> predictor original
 3 stays_in_week_nights    <chr [2]> predictor original
 4 adults                  <chr [2]> predictor original
 5 children                <chr [2]> predictor original
 6 babies                  <chr [2]> predictor original
 7 meal                    <chr [3]> predictor original
 8 country                 <chr [3]> predictor original
 9 market_segment          <chr [3]> predictor original
10 distribution_channel    <chr [3]> predictor original
# ℹ 17 more rows

Your turn

What do you think are in the type vectors for the lead_time and country columns?

# add code here
summary(hotel_rec)$type[[1]]

[1] "double"  "numeric"

summary(hotel_rec)$type[[8]]

[1] "factor"    "unordered" "nominal"

Add response here. Contains information on both the R data type as well as the substantive type of variable for {recipes} (e.g. numeric, nominal, ordinal).

Create a recipe

Your turn: Create a recipe() for the hotel data to:

use a Yeo-Johnson (YJ) transformation on lead_time
convert factors to indicator variables
remove zero-variance variables
add the spline technique shown previously

# add code here
hotel_indicators <- recipe(avg_price_per_room ~ ., data = hotel_train) |> 
  step_YeoJohnson(lead_time) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |> 
  step_spline_natural(arrival_date_num, deg_free = 10)

Measuring Performance

We’ll compute two measures, mean absolute error (MAE) and the coefficient of determination (a.k.a \(R^2\)), and focus on the MAE for parameter optimization.

reg_metrics <- metric_set(mae, rsq)

Your turn: Use fit_resamples() to fit your workflow with a recipe.

Collect the predictions from the results. How did you do?

set.seed(9)

# add code here
hotel_lm_wflow <- workflow() |>
  add_recipe(hotel_indicators) |>
  add_model(linear_reg())
 
ctrl <- control_resamples(save_pred = TRUE)
hotel_lm_res <- hotel_lm_wflow |>
  fit_resamples(hotel_rs, control = ctrl, metrics = reg_metrics)

→ A | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")

There were issues with some computations   A: x1

There were issues with some computations   A: x6

There were issues with some computations   A: x9

collect_metrics(hotel_lm_res)

# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 mae     standard   16.6      10 0.214   Preprocessor1_Model1
2 rsq     standard    0.884    10 0.00339 Preprocessor1_Model1

Fine. MAE is around €16. Not too bad in raw terms. High \(R^2\) value as well.

Holdout predictions

# Since we used `save_pred = TRUE`
lm_cv_pred <- collect_predictions(hotel_lm_res)
lm_cv_pred |> slice(1:7)

# A tibble: 7 × 5
  .pred id      .row avg_price_per_room .config             
  <dbl> <chr>  <int>              <dbl> <chr>               
1  75.1 Fold01    20                 40 Preprocessor1_Model1
2  49.3 Fold01    28                 54 Preprocessor1_Model1
3  64.9 Fold01    45                 50 Preprocessor1_Model1
4  52.8 Fold01    49                 42 Preprocessor1_Model1
5  48.6 Fold01    61                 49 Preprocessor1_Model1
6  29.8 Fold01    66                 40 Preprocessor1_Model1
7  36.9 Fold01    88                 49 Preprocessor1_Model1

Calibration Plot

library(probably)


Attaching package: 'probably'

The following objects are masked from 'package:base':

    as.factor, as.ordered

cal_plot_regression(hotel_lm_res)

Your turn: What does this plot tell us about the performance of our model?

Add response here. The model is mostly calibrated successfully against the true values, but does have a tendency to under-predict prices for true average rates above 200.

Acknowledgments

Materials derived in part from Machine learning with {tidymodels} and licensed under a Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA) License.

Session information

sessioninfo::session_info()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-09-13
 pandoc   3.3 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 backports      1.5.0      2024-05-23 [1] CRAN (R 4.4.0)
 base64enc      0.1-3      2015-07-28 [1] CRAN (R 4.3.0)
 broom        * 1.0.6      2024-05-17 [1] CRAN (R 4.4.0)
 class          7.3-22     2023-05-03 [1] CRAN (R 4.4.0)
 cli            3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 codetools      0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace     2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
 data.table     1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
 dials        * 1.3.0      2024-07-30 [1] CRAN (R 4.4.0)
 DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.3.1)
 digest         0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.3.0)
 evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
 fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 furrr          0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
 future         1.33.2     2024-03-26 [1] CRAN (R 4.3.1)
 future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.3.1)
 generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 globals        0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
 glue           1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
 gower          1.0.1      2022-12-22 [1] CRAN (R 4.3.0)
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.3.0)
 gtable         0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.4.0)
 here           1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 infer        * 1.0.7      2024-03-25 [1] CRAN (R 4.3.1)
 ipred          0.9-14     2023-03-09 [1] CRAN (R 4.3.0)
 iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
 knitr          1.47       2024-05-29 [1] CRAN (R 4.4.0)
 labeling       0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 lattice        0.22-6     2024-03-20 [1] CRAN (R 4.4.0)
 lava           1.8.0      2024-03-05 [1] CRAN (R 4.3.1)
 lhs            1.1.6      2022-12-17 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS           7.3-61     2024-06-13 [1] CRAN (R 4.4.0)
 Matrix         1.7-0      2024-03-22 [1] CRAN (R 4.4.0)
 mgcv           1.9-1      2023-12-21 [1] CRAN (R 4.4.0)
 modeldata    * 1.4.0      2024-06-19 [1] CRAN (R 4.4.0)
 munsell        0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 nlme           3.1-165    2024-06-06 [1] CRAN (R 4.4.0)
 nnet           7.3-19     2023-05-03 [1] CRAN (R 4.4.0)
 parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.3.1)
 parsnip      * 1.2.1      2024-03-22 [1] CRAN (R 4.3.1)
 pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 probably     * 1.0.3      2024-02-23 [1] CRAN (R 4.3.1)
 prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
 purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.3.1)
 recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
 repr           1.1.7      2024-03-22 [1] CRAN (R 4.4.0)
 rlang          1.1.4      2024-06-04 [1] CRAN (R 4.3.3)
 rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.4.0)
 rpart          4.1.23     2023-12-05 [1] CRAN (R 4.4.0)
 rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rsample      * 1.2.1      2024-03-25 [1] CRAN (R 4.3.1)
 rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 scales       * 1.3.0.9000 2024-05-07 [1] Github (r-lib/scales@c0f79d3)
 sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 skimr        * 2.1.5      2022-12-23 [1] CRAN (R 4.3.0)
 splines2     * 0.5.3      2024-07-08 [1] CRAN (R 4.4.0)
 stringi        1.8.4      2024-05-06 [1] CRAN (R 4.3.1)
 stringr        1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
 survival       3.7-0      2024-06-05 [1] CRAN (R 4.4.0)
 textrecipes  * 1.0.6      2023-11-15 [1] CRAN (R 4.3.1)
 tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidymodels   * 1.2.0      2024-03-25 [1] CRAN (R 4.3.1)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
 tune         * 1.2.1      2024-04-18 [1] CRAN (R 4.3.1)
 utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 withr          3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
 workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
 workflowsets * 1.1.0      2024-03-21 [1] CRAN (R 4.3.1)
 xfun           0.45       2024-06-16 [1] CRAN (R 4.4.0)
 yaml           2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
 yardstick    * 1.3.1      2024-03-21 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────