AE 09: Code caching and data preparation

Suggested answers

Application exercise
Answers
Modified

September 30, 2024

Execute options set in YAML header

Below are the execute options defined in the YAML header.

execute:
  cache: true
  warning: false

Part 1 - code caching in Quarto

Setup

```{r}
#| label: packages
#| cache: false

library(tidymodels)
library(textrecipes)
library(ranger)
library(probably)

library(future)
plan(multisession)
```
```{r}
#| label: setup

# metrics
reg_metrics <- metric_set(mae, rsq)

# import data
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
  sample_n(5000) |>
  arrange(arrival_date) |>
  select(-arrival_date) |>
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

# split into training/test sets
set.seed(421)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

# 10-fold CV
set.seed(531)
hotel_rs <- vfold_cv(hotel_train, v = 5, strata = avg_price_per_room)
```

Create workflow

```{r}
#| label: hash-rec
#| dependson: setup

hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  step_dummy_hash(agent, num_terms = tune("agent hash")) |>
  step_dummy_hash(company, num_terms = tune("company hash")) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())
```
```{r}
#| label: model-wf
#| dependson: hash-rec

lm_spec <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

lm_wflow <- workflow(hash_rec, lm_spec)
```

Tune the model

```{r}
#| label: tune-wf
#| dependson: model-wf

set.seed(9)

lm_res <- lm_wflow |>
  tune_grid(
    resamples = hotel_rs,
    grid = 10,
    metrics = reg_metrics
  )
```

Inspect results:

```{r}
#| label: inspect-results
#| dependson: tune-wf

autoplot(lm_res)
show_best(lm_res, metric = "mae")
```
# A tibble: 5 × 8
  `agent hash` `company hash` .metric .estimator  mean     n std_err .config    
         <int>          <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
1         1493           3265 mae     standard    17.4     5   0.212 Preprocess…
2          783           1167 mae     standard    17.4     5   0.235 Preprocess…
3         2006           1605 mae     standard    17.4     5   0.228 Preprocess…
4          658            440 mae     standard    17.4     5   0.200 Preprocess…
5         3842            863 mae     standard    17.4     5   0.214 Preprocess…

Part 2 - prepare the coffee survey

Your turn: Implement data cleaning/preparation for the coffee survey data set.

library(tidyverse)
library(here)

# load data
coffee_survey <- read_csv(file = here("data/coffee_survey.csv"))

# only keep relevant columns that will be used for modeling
coffee_clean <- coffee_survey |>
  select(
    -submission_id,
    -ends_with("other"),
    -ends_with("specify"),
    -starts_with("coffee"),
    -prefer_abc, -prefer_ad
  ) |>
  # convert categorical variables with single answer to factors
  mutate(
    # convert all initially with factor()
    across(
      .cols = c(
        age, cups, favorite, style, strength, roast_level,
        caffeine, prefer_overall, wfh, total_spend, taste:political_affiliation
      ),
      .fns = factor
    ),
    # adjust order of levels for ordinal variables
    age = fct_relevel(
      .f = age,
      "<18 years old",
      "18-24 years old",
      "25-34 years old",
      "35-44 years old",
      "45-54 years old",
      "55-64 years old",
      ">65 years old"
    ),
    cups = fct_relevel(
      .f = cups,
      "Less than 1", "1", "2", "3", "4", "More than 4"
    ),
    strength = fct_relevel(
      .f = strength,
      "Weak", "Somewhat light", "Medium", "Somewhat strong", "Very strong"
    ),
    caffeine = fct_relevel(
      .f = caffeine,
      "Decaf", "Half caff", "Full caffeine"
    ),
    wfh = fct_relevel(
      .f = wfh,
      "I primarily work from home", "I do a mix of both",
      "I primarily work in person"
    ),
    total_spend = fct_relevel(
      .f = total_spend,
      "<$20"
    ) |>
      fct_relevel(
        ">$100",
        after = 5L
      ),
    across(
      .cols = c(most_paid, most_willing),
      .fns = \(x) fct_relevel(
        .f = x,
        "Less than $2", "$2-$4", "$4-$6", "$6-$8", "$8-$10", "$10-$15",
        "$15-$20", "More than $20"
      )
    ),
    spent_equipment = fct_relevel(
      .f = spent_equipment,
      "Less than $20", "$20-$50", "$50-$100", "$100-$300", "$300-$500",
      "$500-$1000", "More than $1,000"
    ),
    education_level = fct_relevel(
      .f = education_level,
      "Less than high school", "High school graduate",
      "Some college or associate's degree", "Bachelor's degree",
      "Master's degree", "Doctorate or professional degree"
    ),
    number_children = fct_relevel(
      .f = number_children,
      "None", "1", "2", "3", "More than 3"
    ),
  ) |>
  # fix additions and purchase columns to ensure commas are only used to separate values
  mutate(
    additions = str_replace_all(
      string = additions,
      pattern = "Milk, dairy alternative, or coffee creamer",
      replacement = "Milk dairy alternative or coffee creamer"
    ),
    purchase = str_replace_all(
      string = purchase,
      pattern = "National chain \\(e.g. Starbucks, Dunkin\\)",
      replacement = "National chain \\(e.g. Starbucks Dunkin\\)"
    )
  ) |>
  # separate columns with multiple answers into one column per answer
  separate_wider_delim(
    cols = c(
      where_drink, brew, purchase,
      additions, dairy, sweetener, why_drink
    ),
    delim = ", ",
    names_sep = "_",
    too_few = "align_start"
  ) |>
  # convert multi-answer columns to factors
  mutate(
    across(
      .cols = c(starts_with("where_drink"), starts_with("brew"),
                 starts_with("purchase"), starts_with("additions"),
                 starts_with("dairy"), starts_with("sweetener"),
                 starts_with("why_drink")),
      .fns = factor
    )
  ) |>
  # drop rows with no prefer_overall
  drop_na(prefer_overall) |>
  write_rds(file = here("data/coffee_survey_clean.rds"))

Acknowledgments

sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-10-02
 pandoc   3.4 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 backports      1.5.0      2024-05-23 [1] CRAN (R 4.4.0)
 broom        * 1.0.6      2024-05-17 [1] CRAN (R 4.4.0)
 class          7.3-22     2023-05-03 [1] CRAN (R 4.4.0)
 cli            3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 codetools      0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace     2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
 crayon         1.5.3      2024-06-20 [1] CRAN (R 4.4.0)
 data.table     1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
 dials        * 1.3.0      2024-07-30 [1] CRAN (R 4.4.0)
 DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.3.1)
 digest         0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 doFuture       1.0.1      2023-12-20 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
 fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 float          0.3-2      2023-12-10 [1] CRAN (R 4.3.1)
 foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 furrr          0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
 future       * 1.33.2     2024-03-26 [1] CRAN (R 4.3.1)
 future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.3.1)
 generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 globals        0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
 glue           1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
 gower          1.0.1      2022-12-22 [1] CRAN (R 4.3.0)
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.3.0)
 gtable         0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.4.0)
 here           1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 infer        * 1.0.7      2024-03-25 [1] CRAN (R 4.3.1)
 ipred          0.9-14     2023-03-09 [1] CRAN (R 4.3.0)
 iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
 knitr          1.47       2024-05-29 [1] CRAN (R 4.4.0)
 labeling       0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 lattice        0.22-6     2024-03-20 [1] CRAN (R 4.4.0)
 lava           1.8.0      2024-03-05 [1] CRAN (R 4.3.1)
 lgr            0.4.4      2022-09-05 [1] CRAN (R 4.3.0)
 lhs            1.1.6      2022-12-17 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS           7.3-61     2024-06-13 [1] CRAN (R 4.4.0)
 Matrix         1.7-0      2024-03-22 [1] CRAN (R 4.4.0)
 mlapi          0.1.1      2022-04-24 [1] CRAN (R 4.3.0)
 modeldata    * 1.4.0      2024-06-19 [1] CRAN (R 4.4.0)
 munsell        0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 nnet           7.3-19     2023-05-03 [1] CRAN (R 4.4.0)
 parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.3.1)
 parsnip      * 1.2.1      2024-03-22 [1] CRAN (R 4.3.1)
 pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 probably     * 1.0.3      2024-02-23 [1] CRAN (R 4.3.1)
 prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
 purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 ranger       * 0.16.0     2023-11-12 [1] CRAN (R 4.3.1)
 Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.3.1)
 recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
 RhpcBLASctl    0.23-42    2023-02-11 [1] CRAN (R 4.3.0)
 rlang          1.1.4      2024-06-04 [1] CRAN (R 4.3.3)
 rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.4.0)
 rpart          4.1.23     2023-12-05 [1] CRAN (R 4.4.0)
 rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rsample      * 1.2.1      2024-03-25 [1] CRAN (R 4.3.1)
 rsparse        0.5.1      2022-09-11 [1] CRAN (R 4.3.0)
 rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 scales       * 1.3.0      2023-11-28 [1] CRAN (R 4.4.0)
 sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 survival       3.7-0      2024-06-05 [1] CRAN (R 4.4.0)
 text2vec       0.6.4      2023-11-09 [1] CRAN (R 4.3.1)
 textrecipes  * 1.0.6      2023-11-15 [1] CRAN (R 4.3.1)
 tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidymodels   * 1.2.0      2024-03-25 [1] CRAN (R 4.3.1)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
 tune         * 1.2.1      2024-04-18 [1] CRAN (R 4.3.1)
 utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 withr          3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
 workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
 workflowsets * 1.1.0      2024-03-21 [1] CRAN (R 4.3.1)
 xfun           0.45       2024-06-16 [1] CRAN (R 4.4.0)
 yaml           2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
 yardstick    * 1.3.1      2024-03-21 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────