AE 09: Code caching and data preparation

Part 1 - code caching in Quarto

Setup

```{r}
#| label: packages
#| cache: false

library(tidymodels)
library(textrecipes)
library(ranger)
library(probably)

library(future)
plan(multisession)
```

```{r}
#| label: setup

# metrics
reg_metrics <- metric_set(mae, rsq)

# import data
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
  sample_n(5000) |>
  arrange(arrival_date) |>
  select(-arrival_date) |>
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

# split into training/test sets
set.seed(421)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

# 10-fold CV
set.seed(531)
hotel_rs <- vfold_cv(hotel_train, v = 5, strata = avg_price_per_room)
```

Create workflow

```{r}
#| label: hash-rec
#| dependson: setup

hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  step_dummy_hash(agent, num_terms = tune("agent hash")) |>
  step_dummy_hash(company, num_terms = tune("company hash")) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())
```

```{r}
#| label: model-wf
#| dependson: hash-rec

lm_spec <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

lm_wflow <- workflow(hash_rec, lm_spec)
```

Tune the model

```{r}
#| label: tune-wf
#| dependson: model-wf

set.seed(9)

lm_res <- lm_wflow |>
  tune_grid(
    resamples = hotel_rs,
    grid = 10,
    metrics = reg_metrics
  )
```

Inspect results:

```{r}
#| label: inspect-results
#| dependson: tune-wf

autoplot(lm_res)
show_best(lm_res, metric = "mae")
```

# A tibble: 5 × 8
  `agent hash` `company hash` .metric .estimator  mean     n std_err .config    
         <int>          <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
1         1493           3265 mae     standard    17.4     5   0.212 Preprocess…
2          783           1167 mae     standard    17.4     5   0.235 Preprocess…
3         2006           1605 mae     standard    17.4     5   0.228 Preprocess…
4          658            440 mae     standard    17.4     5   0.200 Preprocess…
5         3842            863 mae     standard    17.4     5   0.214 Preprocess…

Part 2 - prepare the coffee survey

Your turn: Implement data cleaning/preparation for the coffee survey data set.

library(tidyverse)
library(here)

# load data
coffee_survey <- read_csv(file = here("data/coffee_survey.csv"))

# only keep relevant columns that will be used for modeling
coffee_clean <- coffee_survey |>
  select(
    -submission_id,
    -ends_with("other"),
    -ends_with("specify"),
    -starts_with("coffee"),
    -prefer_abc, -prefer_ad
  ) |>
  # convert categorical variables with single answer to factors
  mutate(
    # convert all initially with factor()
    across(
      .cols = c(
        age, cups, favorite, style, strength, roast_level,
        caffeine, prefer_overall, wfh, total_spend, taste:political_affiliation
      ),
      .fns = factor
    ),
    # adjust order of levels for ordinal variables
    age = fct_relevel(
      .f = age,
      "<18 years old",
      "18-24 years old",
      "25-34 years old",
      "35-44 years old",
      "45-54 years old",
      "55-64 years old",
      ">65 years old"
    ),
    cups = fct_relevel(
      .f = cups,
      "Less than 1", "1", "2", "3", "4", "More than 4"
    ),
    strength = fct_relevel(
      .f = strength,
      "Weak", "Somewhat light", "Medium", "Somewhat strong", "Very strong"
    ),
    caffeine = fct_relevel(
      .f = caffeine,
      "Decaf", "Half caff", "Full caffeine"
    ),
    wfh = fct_relevel(
      .f = wfh,
      "I primarily work from home", "I do a mix of both",
      "I primarily work in person"
    ),
    total_spend = fct_relevel(
      .f = total_spend,
      "<$20"
    ) |>
      fct_relevel(
        ">$100",
        after = 5L
      ),
    across(
      .cols = c(most_paid, most_willing),
      .fns = \(x) fct_relevel(
        .f = x,
        "Less than $2", "$2-$4", "$4-$6", "$6-$8", "$8-$10", "$10-$15",
        "$15-$20", "More than $20"
      )
    ),
    spent_equipment = fct_relevel(
      .f = spent_equipment,
      "Less than $20", "$20-$50", "$50-$100", "$100-$300", "$300-$500",
      "$500-$1000", "More than $1,000"
    ),
    education_level = fct_relevel(
      .f = education_level,
      "Less than high school", "High school graduate",
      "Some college or associate's degree", "Bachelor's degree",
      "Master's degree", "Doctorate or professional degree"
    ),
    number_children = fct_relevel(
      .f = number_children,
      "None", "1", "2", "3", "More than 3"
    ),
  ) |>
  # fix additions and purchase columns to ensure commas are only used to separate values
  mutate(
    additions = str_replace_all(
      string = additions,
      pattern = "Milk, dairy alternative, or coffee creamer",
      replacement = "Milk dairy alternative or coffee creamer"
    ),
    purchase = str_replace_all(
      string = purchase,
      pattern = "National chain \\(e.g. Starbucks, Dunkin\\)",
      replacement = "National chain \\(e.g. Starbucks Dunkin\\)"
    )
  ) |>
  # separate columns with multiple answers into one column per answer
  separate_wider_delim(
    cols = c(
      where_drink, brew, purchase,
      additions, dairy, sweetener, why_drink
    ),
    delim = ", ",
    names_sep = "_",
    too_few = "align_start"
  ) |>
  # convert multi-answer columns to factors
  mutate(
    across(
      .cols = c(starts_with("where_drink"), starts_with("brew"),
                 starts_with("purchase"), starts_with("additions"),
                 starts_with("dairy"), starts_with("sweetener"),
                 starts_with("why_drink")),
      .fns = factor
    )
  ) |>
  # drop rows with no prefer_overall
  drop_na(prefer_overall) |>
  write_rds(file = here("data/coffee_survey_clean.rds"))

Acknowledgments

Materials for part 1 derived in part from Machine learning with {tidymodels} and licensed under a Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA) License.

Session information

sessioninfo::session_info()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-10-02
 pandoc   3.4 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 backports      1.5.0      2024-05-23 [1] CRAN (R 4.4.0)
 broom        * 1.0.6      2024-05-17 [1] CRAN (R 4.4.0)
 class          7.3-22     2023-05-03 [1] CRAN (R 4.4.0)
 cli            3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 codetools      0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace     2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
 crayon         1.5.3      2024-06-20 [1] CRAN (R 4.4.0)
 data.table     1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
 dials        * 1.3.0      2024-07-30 [1] CRAN (R 4.4.0)
 DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.3.1)
 digest         0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 doFuture       1.0.1      2023-12-20 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
 fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 float          0.3-2      2023-12-10 [1] CRAN (R 4.3.1)
 foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 furrr          0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
 future       * 1.33.2     2024-03-26 [1] CRAN (R 4.3.1)
 future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.3.1)
 generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 globals        0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
 glue           1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
 gower          1.0.1      2022-12-22 [1] CRAN (R 4.3.0)
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.3.0)
 gtable         0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.4.0)
 here           1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 infer        * 1.0.7      2024-03-25 [1] CRAN (R 4.3.1)
 ipred          0.9-14     2023-03-09 [1] CRAN (R 4.3.0)
 iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8      2023-12-04 [1] CRAN (R 4.3.1)
 knitr          1.47       2024-05-29 [1] CRAN (R 4.4.0)
 labeling       0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 lattice        0.22-6     2024-03-20 [1] CRAN (R 4.4.0)
 lava           1.8.0      2024-03-05 [1] CRAN (R 4.3.1)
 lgr            0.4.4      2022-09-05 [1] CRAN (R 4.3.0)
 lhs            1.1.6      2022-12-17 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS           7.3-61     2024-06-13 [1] CRAN (R 4.4.0)
 Matrix         1.7-0      2024-03-22 [1] CRAN (R 4.4.0)
 mlapi          0.1.1      2022-04-24 [1] CRAN (R 4.3.0)
 modeldata    * 1.4.0      2024-06-19 [1] CRAN (R 4.4.0)
 munsell        0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 nnet           7.3-19     2023-05-03 [1] CRAN (R 4.4.0)
 parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.3.1)
 parsnip      * 1.2.1      2024-03-22 [1] CRAN (R 4.3.1)
 pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 probably     * 1.0.3      2024-02-23 [1] CRAN (R 4.3.1)
 prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
 purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 ranger       * 0.16.0     2023-11-12 [1] CRAN (R 4.3.1)
 Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.3.1)
 recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
 RhpcBLASctl    0.23-42    2023-02-11 [1] CRAN (R 4.3.0)
 rlang          1.1.4      2024-06-04 [1] CRAN (R 4.3.3)
 rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.4.0)
 rpart          4.1.23     2023-12-05 [1] CRAN (R 4.4.0)
 rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rsample      * 1.2.1      2024-03-25 [1] CRAN (R 4.3.1)
 rsparse        0.5.1      2022-09-11 [1] CRAN (R 4.3.0)
 rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 scales       * 1.3.0      2023-11-28 [1] CRAN (R 4.4.0)
 sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 survival       3.7-0      2024-06-05 [1] CRAN (R 4.4.0)
 text2vec       0.6.4      2023-11-09 [1] CRAN (R 4.3.1)
 textrecipes  * 1.0.6      2023-11-15 [1] CRAN (R 4.3.1)
 tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidymodels   * 1.2.0      2024-03-25 [1] CRAN (R 4.3.1)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
 tune         * 1.2.1      2024-04-18 [1] CRAN (R 4.3.1)
 utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 withr          3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
 workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
 workflowsets * 1.1.0      2024-03-21 [1] CRAN (R 4.3.1)
 xfun           0.45       2024-06-16 [1] CRAN (R 4.4.0)
 yaml           2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
 yardstick    * 1.3.1      2024-03-21 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────