AE 09: Code caching and data preparation

Application exercise

Modified

October 1, 2024

Execute options set in YAML header

Below are the execute options defined in the YAML header.

execute:
  cache: true
  warning: false

Part 1 - code caching in Quarto

Setup

library(tidymodels)
library(textrecipes)
library(ranger)
library(probably)

library(future)
plan(multisession)

# metrics
reg_metrics <- metric_set(mae, rsq)

# import data
data(hotel_rates)
set.seed(295)
hotel_rates <- hotel_rates |>
  sample_n(5000) |>
  arrange(arrival_date) |>
  select(-arrival_date) |>
  mutate(
    company = factor(as.character(company)),
    country = factor(as.character(country)),
    agent = factor(as.character(agent))
  )

# split into training/test sets
set.seed(421)
hotel_split <- initial_split(hotel_rates, strata = avg_price_per_room)

hotel_train <- training(hotel_split)
hotel_test <- testing(hotel_split)

# 10-fold CV
set.seed(531)
hotel_rs <- vfold_cv(hotel_train, v = 5, strata = avg_price_per_room)

Create workflow

hash_rec <- recipe(avg_price_per_room ~ ., data = hotel_train) |>
  step_YeoJohnson(lead_time) |>
  step_dummy_hash(agent, num_terms = tune("agent hash")) |>
  step_dummy_hash(company, num_terms = tune("company hash")) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors())

lm_spec <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

lm_wflow <- workflow(hash_rec, lm_spec)

Tune the model

set.seed(9)

lm_res <- lm_wflow |>
  tune_grid(
    resamples = hotel_rs,
    grid = 10,
    metrics = reg_metrics
  )

Inspect results:

autoplot(lm_res)

show_best(lm_res, metric = "mae")

# A tibble: 5 × 8
  `agent hash` `company hash` .metric .estimator  mean     n std_err .config    
         <int>          <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
1         1493           3265 mae     standard    17.4     5   0.212 Preprocess…
2          783           1167 mae     standard    17.4     5   0.235 Preprocess…
3         2006           1605 mae     standard    17.4     5   0.228 Preprocess…
4          658            440 mae     standard    17.4     5   0.200 Preprocess…
5         3842            863 mae     standard    17.4     5   0.214 Preprocess…

Part 2 - prepare the coffee survey

Your turn: Implement data cleaning/preparation for the coffee survey data set.

Note

Fill in the TODOs with the appropriate code.

library(tidyverse)
library(here)

# load data
coffee_survey <- read_csv(file = here("data/coffee_survey.csv"))

# only keep relevant columns that will be used for modeling
coffee_clean <- coffee_survey |>
  select(
    TODO
  ) |>
  # convert categorical variables with single answer to factors
  mutate(
    # convert all initially with factor()
    across(
      .cols = c(
        age, cups, favorite, style, strength, roast_level,
        caffeine, prefer_overall, wfh, total_spend, taste:political_affiliation
      ),
      .fns = factor
    ),
    # adjust order of levels for ordinal variables
    age = fct_relevel(
      .f = age,
      "<18 years old",
      "18-24 years old",
      "25-34 years old",
      "35-44 years old",
      "45-54 years old",
      "55-64 years old",
      ">65 years old"
    ),
    cups = fct_relevel(
      .f = cups,
      "Less than 1", "1", "2", "3", "4", "More than 4"
    ),
    strength = fct_relevel(
      .f = strength,
      TODO
    ),
    caffeine = fct_relevel(
      .f = caffeine,
      TODO
    ),
    wfh = fct_relevel(
      .f = wfh,
      "I primarily work from home", "I do a mix of both",
      "I primarily work in person"
    ),
    total_spend = fct_relevel(
      .f = total_spend,
      "<$20"
    ) |>
      fct_relevel(
        ">$100",
        after = 5L
      ),
    across(
      .cols = c(most_paid, most_willing),
      .fns = \(x) fct_relevel(
        .f = x,
        TODO
      )
    ),
    spent_equipment = fct_relevel(
      .f = spent_equipment,
      "Less than $20", "$20-$50", "$50-$100", "$100-$300", "$300-$500",
      "$500-$1000", "More than $1,000"
    ),
    education_level = fct_relevel(
      .f = education_level,
      "Less than high school", "High school graduate",
      "Some college or associate's degree", "Bachelor's degree",
      "Master's degree", "Doctorate or professional degree"
    ),
    number_children = fct_relevel(
      .f = number_children,
      "None", "1", "2", "3", "More than 3"
    ),
  ) |>
  # fix additions column to ensure commas are only used to separate values
  mutate(
    additions = str_replace(
      string = additions,
      pattern = "TODO",
      replacement = "TODO"
    ),
    purchase = str_replace_all(
      string = purchase,
      pattern = "National chain \\(e.g. Starbucks, Dunkin\\)",
      replacement = "National chain \\(e.g. Starbucks Dunkin\\)"
    )
  ) |>
  # separate columns with multiple answers into one column per answer
  separate_wider_delim(
    cols = c(
      TODO
    ),
    delim = "TODO",
    names_sep = "_",
    too_few = "align_start"
  ) |>
  # convert multi-answer columns to factors
  mutate(
    across(
      .cols = c(TODO),
      .fns = factor
    )
  ) |>
  # drop rows with no prefer_overall
  drop_na(TODO) |>
  write_rds(file = here("TODO"))

Acknowledgments

Materials for part 1 derived in part from Machine learning with {tidymodels} and licensed under a Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA) License.