AE 10: Dimension reduction with cocktails

Suggested answers

Application exercise
Answers
Modified

October 3, 2024

library(tidyverse)
library(tidymodels)
library(tidytext)
library(embed)
library(colorspace)

set.seed(5312)

# preferred theme
theme_set(theme_minimal(base_size = 12, base_family = "Atkinson Hyperlegible"))

Cocktails

Tidy Tuesday published a data set on cocktail recipes. In this application exercise, we will use dimension reduction techniques to explore the data set based on the recipes’ ingredients.

cocktails <- read_csv("data/boston_cocktails.csv")
cocktails
# A tibble: 3,643 × 6
   name            category          row_id ingredient_number ingredient measure
   <chr>           <chr>              <dbl>             <dbl> <chr>      <chr>  
 1 Gauguin         Cocktail Classics      1                 1 Light Rum  2 oz   
 2 Gauguin         Cocktail Classics      1                 2 Passion F… 1 oz   
 3 Gauguin         Cocktail Classics      1                 3 Lemon Jui… 1 oz   
 4 Gauguin         Cocktail Classics      1                 4 Lime Juice 1 oz   
 5 Fort Lauderdale Cocktail Classics      2                 1 Light Rum  1 1/2 …
 6 Fort Lauderdale Cocktail Classics      2                 2 Sweet Ver… 1/2 oz 
 7 Fort Lauderdale Cocktail Classics      2                 3 Juice of … 1/4 oz 
 8 Fort Lauderdale Cocktail Classics      2                 4 Juice of … 1/4 oz 
 9 Apple Pie       Cordials and Liq…      3                 1 Apple sch… 3 oz   
10 Apple Pie       Cordials and Liq…      3                 2 Cinnamon … 1 oz   
# ℹ 3,633 more rows

Data cleaning

cocktails_parsed <- cocktails |>
  mutate(
    ingredient = str_to_lower(ingredient),
    ingredient = str_replace_all(ingredient, "-", " "),
    ingredient = str_remove(ingredient, " liqueur$"),
    ingredient = str_remove(ingredient, " (if desired)$"),
    ingredient = case_when(
      str_detect(ingredient, "bitters") ~ "bitters",
      str_detect(ingredient, "lemon") ~ "lemon juice",
      str_detect(ingredient, "lime") ~ "lime juice",
      str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
      str_detect(ingredient, "orange") ~ "orange juice",
      .default = ingredient
    ),
    measure = case_when(
      str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
      .default = measure
    ),
    measure = str_replace(measure, " ?1/2", ".5"),
    measure = str_replace(measure, " ?3/4", ".75"),
    measure = str_replace(measure, " ?1/4", ".25"),
    measure_number = parse_number(measure),
    measure_number = if_else(str_detect(measure, "dash$"),
      measure_number / 50,
      measure_number
    )
  ) |>
  # only keep ingredients that appear more than 15 times
  filter(n() >= 15, .by = ingredient) |>
  distinct(row_id, ingredient, .keep_all = TRUE) |>
  drop_na()

cocktails_parsed
# A tibble: 2,602 × 7
   name      category row_id ingredient_number ingredient measure measure_number
   <chr>     <chr>     <dbl>             <dbl> <chr>      <chr>            <dbl>
 1 Gauguin   Cocktai…      1                 1 light rum  2 oz              2   
 2 Gauguin   Cocktai…      1                 3 lemon jui… 1 oz              1   
 3 Gauguin   Cocktai…      1                 4 lime juice 1 oz              1   
 4 Fort Lau… Cocktai…      2                 1 light rum  1.5 oz            1.5 
 5 Fort Lau… Cocktai…      2                 2 sweet ver… .5 oz             0.5 
 6 Fort Lau… Cocktai…      2                 3 orange ju… .25 oz            0.25
 7 Fort Lau… Cocktai…      2                 4 lime juice .25 oz            0.25
 8 Cuban Co… Cocktai…      4                 1 lime juice .5 oz             0.5 
 9 Cuban Co… Cocktai…      4                 2 powdered … .5 oz             0.5 
10 Cuban Co… Cocktai…      4                 3 light rum  2 oz              2   
# ℹ 2,592 more rows
# convert to wide format for modeling
cocktails_df <- cocktails_parsed |>
  select(-ingredient_number, -row_id, -measure) |>
  pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) |>
  janitor::clean_names() |>
  drop_na()

cocktails_df
# A tibble: 943 × 46
   name    category light_rum lemon_juice lime_juice sweet_vermouth orange_juice
   <chr>   <chr>        <dbl>       <dbl>      <dbl>          <dbl>        <dbl>
 1 Gauguin Cocktai…      2           1          1               0           0   
 2 Fort L… Cocktai…      1.5         0          0.25            0.5         0.25
 3 Cuban … Cocktai…      2           0          0.5             0           0   
 4 Cool C… Cocktai…      0           0          0               0           1   
 5 John C… Whiskies      0           1          0               0           0   
 6 Cherry… Cocktai…      1.25        0          0               0           0   
 7 Casa B… Cocktai…      2           0          1.5             0           0   
 8 Caribb… Cocktai…      0.5         0          0               0           0   
 9 Amber … Cordial…      0           0.25       0               0           0   
10 The Jo… Whiskies      0           0.5        0               0           0   
# ℹ 933 more rows
# ℹ 39 more variables: powdered_sugar <dbl>, dark_rum <dbl>,
#   cranberry_juice <dbl>, pineapple_juice <dbl>, bourbon_whiskey <dbl>,
#   simple_syrup <dbl>, cherry_flavored_brandy <dbl>, light_cream <dbl>,
#   triple_sec <dbl>, maraschino <dbl>, amaretto <dbl>, grenadine <dbl>,
#   apple_brandy <dbl>, brandy <dbl>, gin <dbl>, anisette <dbl>,
#   dry_vermouth <dbl>, apricot_flavored_brandy <dbl>, sloe_gin <dbl>, …

Principal components analysis

Estimate PCA

Your turn: Apply PCA to the data set. Define and prepare a feature engineering recipe that scales the ingredients features and then applies PCA.

pca_rec <- recipe(~., data = cocktails_df) |>
  update_role(name, category, new_role = "id") |>
  step_normalize(all_predictors()) |>
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)
pca_prep

Interpret results

Your turn: Explore the results of the PCA by identifying how each ingredient loads (contributes to) the first five principal components. How would you describe each component? What do they seem to measure?

tidied_pca <- tidy(pca_prep, 2)

tidied_pca |>
  filter(component %in% str_glue("PC{1:5}")) |>
  mutate(component = fct_inorder(component)) |>
  ggplot(mapping = aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  scale_color_discrete_qualitative() +
  facet_wrap(facets = vars(component), nrow = 1) +
  labs(y = NULL)

Add response here. PC1 seems to indicate the difference between sweeteners used in cocktails (e.g. powdered sugar vs. simple syrup). PC2 seems defined by the contribution of vermouths. It gets harder to interpret the others, especially given the number of unique ingredients.

tidied_pca |>
  # only first four components
  filter(component %in% str_glue("PC{1:4}")) |>
  # top 8 ingredients per component
  slice_max(order_by = abs(value), n = 10, by = component) |>
  # reorder the terms in decreasing magnitude for visualization
  mutate(terms = reorder_within(terms, abs(value), component)) |>
  ggplot(mapping = aes(x = value, y = terms, fill = value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = vars(component), scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Value of contribution",
    y = NULL
  )

Add response here. The top ingredients contributing to PC1 are sweeteners (powdered sugar, simple syrup) and ingredients for a margaritas (simple syrup, lime juice, tequila). PC2 is defined by vermouths (sweet, dry). PC3 is more difficult to interpret, but it seems to be defined by the contribution of egg whites and grenadine (amaretto sours and their variants?). PC4 is defined by the contribution of vodka and OJ (screwdrivers?). In the opposite direction are ingredients for rum-based drinks (light rum, triple sec, lime juice, simple syrup).

Your turn: How are the beverages distributed along the first two dimensions? Is there a pattern to these dimensions?

cocktails_pca <- bake(pca_prep, new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category))

ggplot(data = cocktails_pca, mapping = aes(x = PC1, y = PC2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward", family = "Atkinson Hyperlegible") +
  scale_color_discrete_qualitative() +
  labs(color = NULL)

ggplot(data = cocktails_pca, mapping = aes(x = PC1, y = PC2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  scale_color_discrete_qualitative() +
  labs(color = NULL)

# first four components
library(GGally)

ggpairs(
  data = cocktails_pca,
  columns = 3:6,
  lower = list(
    mapping = aes(color = category)
  ),
  diag = list(
    mapping = aes(color = category, fill = category)
  ),
  progress = FALSE
) +
  scale_color_discrete_qualitative() +
  scale_fill_discrete_qualitative() +
  labs(
    color = "Label",
    fill = "Label"
  )

Add response here. PC1 has mostly “Cocktail classics” on the left and tequila/vodka on the right. Whiskies and “other” are in the middle.

Your turn: How much do the dimensions explain variation in the data set? If you were going to use these components for a predictive model, how many would you include?

tidy(pca_prep, number = 2, type = "variance") |>
  filter(terms == "percent variance") |>
  ggplot(mapping = aes(x = component, y = value)) +
  geom_line() +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  labs(
    x = "Principal component",
    y = "Percent variance"
  )

tidy(pca_prep, number = 2, type = "variance") |>
  filter(terms == "cumulative percent variance") |>
  ggplot(mapping = aes(x = component, y = value)) +
  geom_line() +
  scale_y_continuous(labels = label_percent(scale = 1)) +
  labs(
    x = "Principal component",
    y = "Cumulative variance"
  )

Add response here. PCA is kind of doing poorly here. The first components by definition have the largest amount of variance, but overall is exceedingly small. I would probably include the first 5 components definitely, if not more. Or frankly, depending on the model I might just keep the original ingredient measures.

UMAP

Your turn: Apply UMAP to the data set. Define and prepare a feature engineering recipe that scales the ingredients features and then applies UMAP, retaining four UMAP dimensions. Examine the location of cocktails along the first two UMAP dimensions. How does this compare to the PCA results?

set.seed(532)

umap_rec <- recipe(~., data = cocktails_df) |>
  update_role(name, category, new_role = "id") |>
  step_normalize(all_predictors()) |>
  step_umap(all_predictors(), num_comp = 4)

umap_prep <- prep(umap_rec)

umap_prep
bake(umap_prep, new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category)) |>
  ggplot(mapping = aes(x = UMAP1, y = UMAP2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  geom_text(check_overlap = TRUE, hjust = "inward", family = "Atkinson Hyperlegible") +
  scale_color_discrete_qualitative() +
  labs(color = NULL)

bake(umap_prep, new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category)) |>
  ggplot(mapping = aes(x = UMAP1, y = UMAP2, label = name)) +
  geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
  scale_color_discrete_qualitative() +
  labs(color = NULL)

# first four dimensions
umap_prep |>
  bake(new_data = NULL) |>
  # collapse infrequent categories
  mutate(category = fct_lump_lowfreq(f = category)) |>
  ggpairs(
    columns = 3:6,
    lower = list(
      mapping = aes(color = category)
    ),
    diag = list(
      mapping = aes(color = category, fill = category)
    ),
    progress = FALSE
  ) +
  scale_color_discrete_qualitative() +
  scale_fill_discrete_qualitative() +
  labs(
    color = "Label",
    fill = "Label"
  )

Add response here. Clearly a more localized structure to the beverages, but not discernable trends globally.

Acknowledgments

sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14)
 os       macOS Sonoma 14.6.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-10-03
 pandoc   3.4 @ /usr/local/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version    date (UTC) lib source
 archive        1.1.8      2024-04-28 [1] CRAN (R 4.4.0)
 backports      1.5.0      2024-05-23 [1] CRAN (R 4.4.0)
 bit            4.0.5      2022-11-15 [1] CRAN (R 4.3.0)
 bit64          4.0.5      2020-08-30 [1] CRAN (R 4.3.0)
 broom        * 1.0.6      2024-05-17 [1] CRAN (R 4.4.0)
 class          7.3-22     2023-05-03 [1] CRAN (R 4.4.0)
 cli            3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
 codetools      0.2-20     2024-03-31 [1] CRAN (R 4.4.1)
 colorspace   * 2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
 crayon         1.5.3      2024-06-20 [1] CRAN (R 4.4.0)
 data.table     1.15.4     2024-03-30 [1] CRAN (R 4.3.1)
 dials        * 1.3.0      2024-07-30 [1] CRAN (R 4.4.0)
 DiceDesign     1.10       2023-12-07 [1] CRAN (R 4.3.1)
 digest         0.6.35     2024-03-11 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.3.0)
 embed        * 1.1.4      2024-03-20 [1] CRAN (R 4.3.1)
 evaluate       0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
 fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
 fastmap        1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
 forcats      * 1.0.0      2023-01-29 [1] CRAN (R 4.3.0)
 foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
 furrr          0.3.1      2022-08-15 [1] CRAN (R 4.3.0)
 future         1.33.2     2024-03-26 [1] CRAN (R 4.3.1)
 future.apply   1.11.2     2024-03-28 [1] CRAN (R 4.3.1)
 generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
 GGally       * 2.2.1      2024-02-14 [1] CRAN (R 4.4.0)
 ggplot2      * 3.5.1      2024-04-23 [1] CRAN (R 4.3.1)
 ggstats        0.6.0      2024-04-05 [1] CRAN (R 4.4.0)
 globals        0.16.3     2024-03-08 [1] CRAN (R 4.3.1)
 glue           1.8.0      2024-09-30 [1] CRAN (R 4.4.1)
 gower          1.0.1      2022-12-22 [1] CRAN (R 4.3.0)
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.3.0)
 gtable         0.3.5      2024-04-22 [1] CRAN (R 4.3.1)
 hardhat        1.4.0      2024-06-02 [1] CRAN (R 4.4.0)
 here           1.0.1      2020-12-13 [1] CRAN (R 4.3.0)
 hms            1.1.3      2023-03-21 [1] CRAN (R 4.3.0)
 htmltools      0.5.8.1    2024-04-04 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4      2023-12-06 [1] CRAN (R 4.3.1)
 infer        * 1.0.7      2024-03-25 [1] CRAN (R 4.3.1)
 ipred          0.9-14     2023-03-09 [1] CRAN (R 4.3.0)
 irlba          2.3.5.1    2022-10-03 [1] CRAN (R 4.4.0)
 iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
 janeaustenr    1.0.0      2022-08-26 [1] CRAN (R 4.3.0)
 janitor        2.2.0      2023-02-02 [1] CRAN (R 4.3.0)
 jsonlite       1.8.9      2024-09-20 [1] CRAN (R 4.4.1)
 knitr          1.47       2024-05-29 [1] CRAN (R 4.4.0)
 labeling       0.4.3      2023-08-29 [1] CRAN (R 4.3.0)
 lattice        0.22-6     2024-03-20 [1] CRAN (R 4.4.0)
 lava           1.8.0      2024-03-05 [1] CRAN (R 4.3.1)
 lhs            1.1.6      2022-12-17 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1      2024-01-29 [1] CRAN (R 4.3.1)
 lubridate    * 1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
 MASS           7.3-61     2024-06-13 [1] CRAN (R 4.4.0)
 Matrix         1.7-0      2024-03-22 [1] CRAN (R 4.4.0)
 modeldata    * 1.4.0      2024-06-19 [1] CRAN (R 4.4.0)
 munsell        0.5.1      2024-04-01 [1] CRAN (R 4.3.1)
 nnet           7.3-19     2023-05-03 [1] CRAN (R 4.4.0)
 parallelly     1.37.1     2024-02-29 [1] CRAN (R 4.3.1)
 parsnip      * 1.2.1      2024-03-22 [1] CRAN (R 4.3.1)
 pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
 plyr           1.8.9      2023-10-02 [1] CRAN (R 4.3.1)
 prodlim        2023.08.28 2023-08-28 [1] CRAN (R 4.3.0)
 purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
 RColorBrewer   1.1-3      2022-04-03 [1] CRAN (R 4.3.0)
 Rcpp           1.0.13     2024-07-17 [1] CRAN (R 4.4.0)
 RcppAnnoy      0.0.22     2024-01-23 [1] CRAN (R 4.3.1)
 readr        * 2.1.5      2024-01-10 [1] CRAN (R 4.3.1)
 recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
 rlang          1.1.4      2024-06-04 [1] CRAN (R 4.3.3)
 rmarkdown      2.27       2024-05-17 [1] CRAN (R 4.4.0)
 rpart          4.1.23     2023-12-05 [1] CRAN (R 4.4.0)
 rprojroot      2.0.4      2023-11-05 [1] CRAN (R 4.3.1)
 rsample      * 1.2.1      2024-03-25 [1] CRAN (R 4.3.1)
 rstudioapi     0.16.0     2024-03-24 [1] CRAN (R 4.3.1)
 scales       * 1.3.0      2023-11-28 [1] CRAN (R 4.4.0)
 sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.3.0)
 snakecase      0.11.1     2023-08-27 [1] CRAN (R 4.3.0)
 SnowballC      0.7.1      2023-04-25 [1] CRAN (R 4.3.0)
 stringi        1.8.4      2024-05-06 [1] CRAN (R 4.3.1)
 stringr      * 1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
 survival       3.7-0      2024-06-05 [1] CRAN (R 4.4.0)
 tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
 tidymodels   * 1.2.0      2024-03-25 [1] CRAN (R 4.3.1)
 tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.1      2024-03-11 [1] CRAN (R 4.3.1)
 tidytext     * 0.4.2      2024-04-10 [1] CRAN (R 4.4.0)
 tidyverse    * 2.0.0      2023-02-22 [1] CRAN (R 4.3.0)
 timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
 timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
 tokenizers     0.3.0      2022-12-22 [1] CRAN (R 4.3.0)
 tune         * 1.2.1      2024-04-18 [1] CRAN (R 4.3.1)
 tzdb           0.4.0      2023-05-12 [1] CRAN (R 4.3.0)
 utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
 uwot           0.2.2      2024-04-21 [1] CRAN (R 4.4.0)
 vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
 vroom          1.6.5      2023-12-05 [1] CRAN (R 4.3.1)
 withr          3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
 workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
 workflowsets * 1.1.0      2024-03-21 [1] CRAN (R 4.3.1)
 xfun           0.45       2024-06-16 [1] CRAN (R 4.4.0)
 yaml           2.3.10     2024-07-26 [1] CRAN (R 4.4.0)
 yardstick    * 1.3.1      2024-03-21 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────