```{r}
#| label: packages
#| cache: false
library(tidymodels)
library(textrecipes)
library(ranger)
library(probably)
library(future)
plan(multisession)
```
AE 09: Code caching and data preparation
Suggested answers
Application exercise
Answers
Execute options set in YAML header
Below are the execute
options defined in the YAML header.
execute:
cache: true
warning: false
Part 1 - code caching in Quarto
Setup
```{r}
#| label: setup
# metrics
reg_metrics <- metric_set(mae, rsq)
# import data
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
sample_n(5000) |>
arrange(arrival_date) |>
select(-arrival_date) |>
mutate(
company = factor(as.character(company)),
country = factor(as.character(country)),
agent = factor(as.character(agent))
)
# split into training/test sets
set.seed(421)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)
hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)
# 10-fold CV
set.seed(531)
hotel_rs <- vfold_cv(hotel_train, v = 5, strata = avg_price_per_room)
```
Create workflow
```{r}
#| label: hash-rec
#| dependson: setup
hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
step_YeoJohnson(lead_time) |>
step_dummy_hash(agent, num_terms = tune("agent hash")) |>
step_dummy_hash(company, num_terms = tune("company hash")) |>
step_dummy(all_nominal_predictors()) |>
step_zv(all_predictors())
```
```{r}
#| label: model-wf
#| dependson: hash-rec
lm_spec <- linear_reg() |>
set_mode("regression") |>
set_engine("lm")
lm_wflow <- workflow(hash_rec, lm_spec)
```
Tune the model
```{r}
#| label: tune-wf
#| dependson: model-wf
set.seed(9)
lm_res <- lm_wflow |>
tune_grid(
resamples = hotel_rs,
grid = 10,
metrics = reg_metrics
)
```
Inspect results:
```{r}
#| label: inspect-results
#| dependson: tune-wf
autoplot(lm_res)
show_best(lm_res, metric = "mae")
```
# A tibble: 5 × 8
`agent hash` `company hash` .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1493 3265 mae standard 17.4 5 0.212 Preprocess…
2 783 1167 mae standard 17.4 5 0.235 Preprocess…
3 2006 1605 mae standard 17.4 5 0.228 Preprocess…
4 658 440 mae standard 17.4 5 0.200 Preprocess…
5 3842 863 mae standard 17.4 5 0.214 Preprocess…
Part 2 - prepare the coffee survey
Your turn: Implement data cleaning/preparation for the coffee survey data set.
library(tidyverse)
library(here)
# load data
coffee_survey <- read_csv(file = here("data/coffee_survey.csv"))
# only keep relevant columns that will be used for modeling
coffee_clean <- coffee_survey |>
select(
-submission_id,
-ends_with("other"),
-ends_with("specify"),
-starts_with("coffee"),
-prefer_abc, -prefer_ad
) |>
# convert categorical variables with single answer to factors
mutate(
# convert all initially with factor()
across(
.cols = c(
age, cups, favorite, style, strength, roast_level,
caffeine, prefer_overall, wfh, total_spend, taste:political_affiliation
),
.fns = factor
),
# adjust order of levels for ordinal variables
age = fct_relevel(
.f = age,
"<18 years old",
"18-24 years old",
"25-34 years old",
"35-44 years old",
"45-54 years old",
"55-64 years old",
">65 years old"
),
cups = fct_relevel(
.f = cups,
"Less than 1", "1", "2", "3", "4", "More than 4"
),
strength = fct_relevel(
.f = strength,
"Weak", "Somewhat light", "Medium", "Somewhat strong", "Very strong"
),
caffeine = fct_relevel(
.f = caffeine,
"Decaf", "Half caff", "Full caffeine"
),
wfh = fct_relevel(
.f = wfh,
"I primarily work from home", "I do a mix of both",
"I primarily work in person"
),
total_spend = fct_relevel(
.f = total_spend,
"<$20"
) |>
fct_relevel(
">$100",
after = 5L
),
across(
.cols = c(most_paid, most_willing),
.fns = \(x) fct_relevel(
.f = x,
"Less than $2", "$2-$4", "$4-$6", "$6-$8", "$8-$10", "$10-$15",
"$15-$20", "More than $20"
)
),
spent_equipment = fct_relevel(
.f = spent_equipment,
"Less than $20", "$20-$50", "$50-$100", "$100-$300", "$300-$500",
"$500-$1000", "More than $1,000"
),
education_level = fct_relevel(
.f = education_level,
"Less than high school", "High school graduate",
"Some college or associate's degree", "Bachelor's degree",
"Master's degree", "Doctorate or professional degree"
),
number_children = fct_relevel(
.f = number_children,
"None", "1", "2", "3", "More than 3"
),
) |>
# fix additions and purchase columns to ensure commas are only used to separate values
mutate(
additions = str_replace_all(
string = additions,
pattern = "Milk, dairy alternative, or coffee creamer",
replacement = "Milk dairy alternative or coffee creamer"
),
purchase = str_replace_all(
string = purchase,
pattern = "National chain \\(e.g. Starbucks, Dunkin\\)",
replacement = "National chain \\(e.g. Starbucks Dunkin\\)"
)
) |>
# separate columns with multiple answers into one column per answer
separate_wider_delim(
cols = c(
where_drink, brew, purchase,
additions, dairy, sweetener, why_drink
),
delim = ", ",
names_sep = "_",
too_few = "align_start"
) |>
# convert multi-answer columns to factors
mutate(
across(
.cols = c(starts_with("where_drink"), starts_with("brew"),
starts_with("purchase"), starts_with("additions"),
starts_with("dairy"), starts_with("sweetener"),
starts_with("why_drink")),
.fns = factor
)
) |>
# drop rows with no prefer_overall
drop_na(prefer_overall) |>
write_rds(file = here("data/coffee_survey_clean.rds"))
Acknowledgments
- Materials for part 1 derived in part from Machine learning with {tidymodels} and licensed under a Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA) License.
Session information
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.4.1 (2024-06-14)
os macOS Sonoma 14.6.1
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2024-10-02
pandoc 3.4 @ /usr/local/bin/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
backports 1.5.0 2024-05-23 [1] CRAN (R 4.4.0)
broom * 1.0.6 2024-05-17 [1] CRAN (R 4.4.0)
class 7.3-22 2023-05-03 [1] CRAN (R 4.4.0)
cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0)
codetools 0.2-20 2024-03-31 [1] CRAN (R 4.4.1)
colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.4.0)
crayon 1.5.3 2024-06-20 [1] CRAN (R 4.4.0)
data.table 1.15.4 2024-03-30 [1] CRAN (R 4.3.1)
dials * 1.3.0 2024-07-30 [1] CRAN (R 4.4.0)
DiceDesign 1.10 2023-12-07 [1] CRAN (R 4.3.1)
digest 0.6.35 2024-03-11 [1] CRAN (R 4.3.1)
doFuture 1.0.1 2023-12-20 [1] CRAN (R 4.3.1)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.4.0)
fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
float 0.3-2 2023-12-10 [1] CRAN (R 4.3.1)
foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.0)
furrr 0.3.1 2022-08-15 [1] CRAN (R 4.3.0)
future * 1.33.2 2024-03-26 [1] CRAN (R 4.3.1)
future.apply 1.11.2 2024-03-28 [1] CRAN (R 4.3.1)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
globals 0.16.3 2024-03-08 [1] CRAN (R 4.3.1)
glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1)
gower 1.0.1 2022-12-22 [1] CRAN (R 4.3.0)
GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.3.0)
gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1)
hardhat 1.4.0 2024-06-02 [1] CRAN (R 4.4.0)
here 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.1)
infer * 1.0.7 2024-03-25 [1] CRAN (R 4.3.1)
ipred 0.9-14 2023-03-09 [1] CRAN (R 4.3.0)
iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.0)
jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
knitr 1.47 2024-05-29 [1] CRAN (R 4.4.0)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0)
lattice 0.22-6 2024-03-20 [1] CRAN (R 4.4.0)
lava 1.8.0 2024-03-05 [1] CRAN (R 4.3.1)
lgr 0.4.4 2022-09-05 [1] CRAN (R 4.3.0)
lhs 1.1.6 2022-12-17 [1] CRAN (R 4.3.0)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
listenv 0.9.1 2024-01-29 [1] CRAN (R 4.3.1)
lubridate 1.9.3 2023-09-27 [1] CRAN (R 4.3.1)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
MASS 7.3-61 2024-06-13 [1] CRAN (R 4.4.0)
Matrix 1.7-0 2024-03-22 [1] CRAN (R 4.4.0)
mlapi 0.1.1 2022-04-24 [1] CRAN (R 4.3.0)
modeldata * 1.4.0 2024-06-19 [1] CRAN (R 4.4.0)
munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1)
nnet 7.3-19 2023-05-03 [1] CRAN (R 4.4.0)
parallelly 1.37.1 2024-02-29 [1] CRAN (R 4.3.1)
parsnip * 1.2.1 2024-03-22 [1] CRAN (R 4.3.1)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
probably * 1.0.3 2024-02-23 [1] CRAN (R 4.3.1)
prodlim 2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
ranger * 0.16.0 2023-11-12 [1] CRAN (R 4.3.1)
Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.1)
recipes * 1.0.10 2024-02-18 [1] CRAN (R 4.3.1)
RhpcBLASctl 0.23-42 2023-02-11 [1] CRAN (R 4.3.0)
rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.3)
rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.4.0)
rpart 4.1.23 2023-12-05 [1] CRAN (R 4.4.0)
rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.3.1)
rsample * 1.2.1 2024-03-25 [1] CRAN (R 4.3.1)
rsparse 0.5.1 2022-09-11 [1] CRAN (R 4.3.0)
rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1)
scales * 1.3.0 2023-11-28 [1] CRAN (R 4.4.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
survival 3.7-0 2024-06-05 [1] CRAN (R 4.4.0)
text2vec 0.6.4 2023-11-09 [1] CRAN (R 4.3.1)
textrecipes * 1.0.6 2023-11-15 [1] CRAN (R 4.3.1)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidymodels * 1.2.0 2024-03-25 [1] CRAN (R 4.3.1)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.1)
timeDate 4032.109 2023-12-14 [1] CRAN (R 4.3.1)
tune * 1.2.1 2024-04-18 [1] CRAN (R 4.3.1)
utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
withr 3.0.1 2024-07-31 [1] CRAN (R 4.4.0)
workflows * 1.1.4 2024-02-19 [1] CRAN (R 4.3.1)
workflowsets * 1.1.0 2024-03-21 [1] CRAN (R 4.3.1)
xfun 0.45 2024-06-16 [1] CRAN (R 4.4.0)
yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.1)
yardstick * 1.3.1 2024-03-21 [1] CRAN (R 4.3.1)
[1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
──────────────────────────────────────────────────────────────────────────────