AE 10: Dimension reduction with cocktails
Cocktails
Tidy Tuesday published a data set on cocktail recipes. In this application exercise, we will use dimension reduction techniques to explore the data set based on the recipes’ ingredients.
cocktails <- read_csv("data/boston_cocktails.csv")
cocktails
Data cleaning
cocktails_parsed <- cocktails |>
mutate(
ingredient = str_to_lower(ingredient),
ingredient = str_replace_all(ingredient, "-", " "),
ingredient = str_remove(ingredient, " liqueur$"),
ingredient = str_remove(ingredient, " (if desired)$"),
ingredient = case_when(
str_detect(ingredient, "bitters") ~ "bitters",
str_detect(ingredient, "lemon") ~ "lemon juice",
str_detect(ingredient, "lime") ~ "lime juice",
str_detect(ingredient, "grapefruit") ~ "grapefruit juice",
str_detect(ingredient, "orange") ~ "orange juice",
.default = ingredient
),
measure = case_when(
str_detect(ingredient, "bitters") ~ str_replace(measure, "oz$", "dash"),
.default = measure
),
measure = str_replace(measure, " ?1/2", ".5"),
measure = str_replace(measure, " ?3/4", ".75"),
measure = str_replace(measure, " ?1/4", ".25"),
measure_number = parse_number(measure),
measure_number = if_else(str_detect(measure, "dash$"),
measure_number / 50,
measure_number
)
) |>
# only keep ingredients that appear more than 15 times
filter(n() >= 15, .by = ingredient) |>
distinct(row_id, ingredient, .keep_all = TRUE) |>
drop_na()
cocktails_parsed
# convert to wide format for modeling
cocktails_df <- cocktails_parsed |>
select(-ingredient_number, -row_id, -measure) |>
pivot_wider(names_from = ingredient, values_from = measure_number, values_fill = 0) |>
janitor::clean_names() |>
drop_na()
cocktails_df
Principal components analysis
Estimate PCA
Your turn: Apply PCA to the data set. Define and prepare a feature engineering recipe that scales the ingredients features and then applies PCA.
<- recipe(TODO, data = cocktails_df) |>
pca_rec update_role(name, category, new_role = "id") |>
TODO
<- prep(pca_rec)
pca_prep pca_prep
Interpret results
Your turn: Explore the results of the PCA by identifying how each ingredient loads (contributes to) the first five principal components. How would you describe each component? What do they seem to measure?
tidied_pca <- tidy(pca_prep, 2)
tidied_pca |>
filter(component %in% str_glue("PC{1:5}")) |>
mutate(component = fct_inorder(component)) |>
ggplot(mapping = aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
scale_color_discrete_qualitative() +
facet_wrap(facets = vars(component), nrow = 1) +
labs(y = NULL)
tidied_pca |>
# only first four components
filter(component %in% str_glue("PC{1:4}")) |>
# top 8 ingredients per component
slice_max(order_by = abs(value), n = 10, by = component) |>
# reorder the terms in decreasing magnitude for visualization
mutate(terms = reorder_within(terms, abs(value), component)) |>
ggplot(mapping = aes(x = value, y = terms, fill = value > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(facets = vars(component), scales = "free_y") +
scale_y_reordered() +
labs(
x = "Value of contribution",
y = NULL
)
Add response here.
Your turn: How are the beverages distributed along the first two dimensions? Is there a pattern to these dimensions?
cocktails_pca <- bake(pca_prep, new_data = NULL) |>
# collapse infrequent categories
mutate(category = fct_lump_lowfreq(f = category))
ggplot(data = cocktails_pca, mapping = aes(x = PC1, y = PC2, label = name)) +
geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
scale_color_discrete_qualitative() +
labs(color = NULL)
ggplot(data = cocktails_pca, mapping = aes(x = PC1, y = PC2, label = name)) +
geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
scale_color_discrete_qualitative() +
labs(color = NULL)
# first four components
library(GGally)
ggpairs(
data = cocktails_pca,
columns = 3:6,
lower = list(
mapping = aes(color = category)
),
diag = list(
mapping = aes(color = category, fill = category)
),
progress = FALSE
) +
scale_color_discrete_qualitative() +
scale_fill_discrete_qualitative() +
labs(
color = "Label",
fill = "Label"
)
Add response here.
Your turn: How much do the dimensions explain variation in the data set? If you were going to use these components for a predictive model, how many would you include?
tidy(pca_prep, number = _____, type = _____) |>
filter(terms == "percent variance") |>
ggplot(mapping = aes(x = component, y = value)) +
geom_line() +
scale_y_continuous(labels = label_percent(scale = 1)) +
labs(
x = "Principal component",
y = "Percent variance"
)
tidy(pca_prep, number = _____, type = _____) |>
filter(terms == "cumulative percent variance") |>
ggplot(mapping = aes(x = component, y = value)) +
geom_line() +
scale_y_continuous(labels = label_percent(scale = 1)) +
labs(
x = "Principal component",
y = "Cumulative variance"
)
Add response here.
UMAP
Your turn: Apply UMAP to the data set. Define and prepare a feature engineering recipe that scales the ingredients features and then applies UMAP, retaining four UMAP dimensions. Examine the location of cocktails along the first two UMAP dimensions. How does this compare to the PCA results?
set.seed(532)
<- recipe(TODO, data = cocktails_df) |>
umap_rec update_role(name, category, new_role = "id") |>
TODO
<- prep(umap_rec)
umap_prep
umap_prep
bake(umap_prep, new_data = NULL) |>
# collapse infrequent categories
mutate(category = fct_lump_lowfreq(f = category)) |>
ggplot(mapping = aes(x = UMAP1, y = UMAP2, label = name)) +
geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
scale_color_discrete_qualitative() +
labs(color = NULL)
bake(umap_prep, new_data = NULL) |>
# collapse infrequent categories
mutate(category = fct_lump_lowfreq(f = category)) |>
ggplot(mapping = aes(x = UMAP1, y = UMAP2, label = name)) +
geom_point(mapping = aes(color = category), alpha = 0.7, size = 2) +
scale_color_discrete_qualitative() +
labs(color = NULL)
# first four dimensions
umap_prep |>
bake(new_data = NULL) |>
# collapse infrequent categories
mutate(category = fct_lump_lowfreq(f = category)) |>
ggpairs(
columns = 3:6,
lower = list(
mapping = aes(color = category)
),
diag = list(
mapping = aes(color = category, fill = category)
),
progress = FALSE
) +
scale_color_discrete_qualitative() +
scale_fill_discrete_qualitative() +
labs(
color = "Label",
fill = "Label"
)
Add response here.
Acknowledgments
- Application exercise drawn from PCA and UMAP with tidymodels and #TidyTuesday cocktail recipes by Julia Silge and licensed under CC BY-SA 4.0.