AE 10: Dimension reduction with cocktails

Application exercise
Modified

October 3, 2024

Cocktails

Tidy Tuesday published a data set on cocktail recipes. In this application exercise, we will use dimension reduction techniques to explore the data set based on the recipes’ ingredients.

cocktails <- read_csv("data/boston_cocktails.csv")
cocktails

Data cleaning

cocktails_parsed <- cocktails |>
  mutate(
    ingredient = str_to_lower(ingredient),
    ingredient = str_replace_all(ingredient, "-", " "),
    ingredient = str_remove(ingredient, " liqueur$"),
    ingredient = str_remove(ingredient, " (if desired)$"),
    ingredient = case_when(
      str_detect(ingredient, "bitters") ~ "bitters",
      str_detect(ingredient, "lemon") ~ "lemon juice",
      str_detect(ingredient, "lime") ~ "lime juice",
      str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
      str_detect(ingredient, "orange") ~ "orange juice",
      .default = ingredient
    ),
    measure = case_when(
      str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
      .default = measure
    ),
    measure = str_replace(measure, " ?1/2", ".5"),
    measure = str_replace(measure, " ?3/4", ".75"),
    measure = str_replace(measure, " ?1/4", ".25"),
    measure_number = parse_number(measure),
    measure_number = if_else(str_detect(measure, "dash$"),
      measure_number / 50,
      measure_number
    )
  ) |>
  # only keep ingredients that appear more than 15 times
  filter(n() >= 15, .by = ingredient) |>
  distinct(row_id, ingredient, .keep_all = TRUE) |>
  drop_na()

cocktails_parsed

# convert to wide format for modeling
cocktails_df <- cocktails_parsed |>
  select(-ingredient_number, -row_id, -measure) |>
  pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) |>
  janitor::clean_names() |>
  drop_na()

cocktails_df

Principal components analysis

Estimate PCA

Your turn: Apply PCA to the data set. Define and prepare a feature engineering recipe that scales the ingredients features and then applies PCA.

pca_rec <- recipe(TODO, data = cocktails_df) |>
  update_role(name, category, new_role = "id") |>
  TODO

pca_prep <- prep(pca_rec)
pca_prep

Interpret results

Your turn: Explore the results of the PCA by identifying how each ingredient loads (contributes to) the first five principal components. How would you describe each component? What do they seem to measure?

tidied_pca <- tidy(pca_prep, 2)

tidied_pca |>
  filter(component %in% str_glue("PC{1:5}")) |>
  mutate(component = fct_inorder(component)) |>
  ggplot(mapping = aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  scale_color_discrete_qualitative() +
  facet_wrap(facets = vars(component), nrow = 1) +
  labs(y = NULL)
tidied_pca |>
  # only first four components
  filter(component %in% str_glue("PC{1:4}")) |>
  # top 8 ingredients per component
  slice_max(order_by = abs(value), n = 10, by = component) |>
  # reorder the terms in decreasing magnitude for visualization
  mutate(terms = reorder_within(terms, abs(value), component)) |>
  ggplot(mapping = aes(x = value, y = terms, fill = value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = vars(component), scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Value of contribution",
    y = NULL
  )

Add response here.

Your turn: How are the beverages distributed along the first two dimensions? Is there a pattern to these dimensions?

cocktails_pca <- bake(pca_prep, new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category))

ggplot(data = cocktails_pca, mapping = aes(x = PC1, y = PC2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  scale_color_discrete_qualitative() +
  labs(color = NULL)

ggplot(data = cocktails_pca, mapping = aes(x = PC1, y = PC2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  scale_color_discrete_qualitative() +
  labs(color = NULL)
# first four components
library(GGally)

ggpairs(
  data = cocktails_pca,
  columns = 3:6,
  lower = list(
    mapping = aes(color = category)
  ),
  diag = list(
    mapping = aes(color = category, fill = category)
  ),
  progress = FALSE
) +
  scale_color_discrete_qualitative() +
  scale_fill_discrete_qualitative() +
  labs(
    color = "Label",
    fill = "Label"
  )

Add response here.

Your turn: How much do the dimensions explain variation in the data set? If you were going to use these components for a predictive model, how many would you include?

tidy(pca_prep, number = _____, type = _____) |>
  filter(terms == "percent variance") |>
  ggplot(mapping = aes(x = component, y = value)) +
  geom_line() +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  labs(
    x = "Principal component",
    y = "Percent variance"
  )

tidy(pca_prep, number = _____, type = _____) |>
  filter(terms == "cumulative percent variance") |>
  ggplot(mapping = aes(x = component, y = value)) +
  geom_line() +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  labs(
    x = "Principal component",
    y = "Cumulative variance"
  )

Add response here.

UMAP

Your turn: Apply UMAP to the data set. Define and prepare a feature engineering recipe that scales the ingredients features and then applies UMAP, retaining four UMAP dimensions. Examine the location of cocktails along the first two UMAP dimensions. How does this compare to the PCA results?

set.seed(532)

umap_rec <- recipe(TODO, data = cocktails_df) |>
  update_role(name, category, new_role = "id") |>
  TODO

umap_prep <- prep(umap_rec)

umap_prep
bake(umap_prep, new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category)) |>
  ggplot(mapping = aes(x = UMAP1, y = UMAP2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  scale_color_discrete_qualitative() +
  labs(color = NULL)

bake(umap_prep, new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category)) |>
  ggplot(mapping = aes(x = UMAP1, y = UMAP2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  scale_color_discrete_qualitative() +
  labs(color = NULL)
# first four dimensions
umap_prep |>
  bake(new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category)) |>
  ggpairs(
    columns = 3:6,
    lower = list(
      mapping = aes(color = category)
    ),
    diag = list(
      mapping = aes(color = category, fill = category)
    ),
    progress = FALSE
  ) +
  scale_color_discrete_qualitative() +
  scale_fill_discrete_qualitative() +
  labs(
    color = "Label",
    fill = "Label"
  )

Add response here.

Acknowledgments