AE 13: Predicting legislative policy attention

Application exercise

Modified

October 17, 2024

Load packages

# load packages
library(tidyverse)
library(tidymodels)
library(arrow)
library(textdata)
library(textrecipes)
library(here)
library(themis)
library(tictoc)
library(vip)
library(tidytext)
library(hardhat)
library(glmnet)

# use parallel processing
library(future)
plan(multisession)

Import data

leg <- read_parquet(file = "data/legislation.parquet")

Split data for model fitting

# split into training and testing
set.seed(852)

# sample only 20000 observations
leg_samp <- slice_sample(.data = leg, n = 2e04)

# split into training and testing sets
leg_split <- initial_split(data = leg_samp, strata = policy_lab, prop = 0.9)

leg_train <- training(x = leg_split)
leg_test <- testing(x = leg_split)

# create 5-fold cross-validation sets
leg_folds <- vfold_cv(data = leg_train, v = 5)

Null model

Demonstration: Estimate a null model to serve as a baseline for evaluating performance. How does it perform?

# estimate a null model
null_spec <- null_model() |>
  set_mode("classification") |>
  set_engine("parsnip")

null_rs <- workflow() |>
  add_formula(policy_lab ~ id) |>
  add_model(null_spec) |>
  fit_resamples(
    resamples = leg_folds
  )

collect_metrics(null_rs)

Add response here.

Lasso regression model

Estimate a simple lasso regression model using a bag-of-words representation of the legislative descriptions.

Define recipe

Your turn: Define a feature engineering recipe that uses the description to predict policy_lab. The recipe should:

Tokenize the description column.
Remove stopwords.
Keep tokens that appear more than 500 times in the corpus.
Convert the tokens to a term frequency-inverse document frequency (TF-IDF) representation.
Downsample using policy_lab to balance the classes.

Note

Step functions for text features are found in the {textrecipes} package.

glmnet_rec <- recipe(policy_lab ~ description, data = leg_train) |>
  # add the steps for the description
  TODO |>
  step_downsample(policy_lab)
glmnet_rec

glmnet_rec |>
  prep() |>
  bake(new_data = NULL)

Model specification

Demonstration: Specify a lasso regression model tuned over the penalty parameter.

# penalized logistic regression, tune over penalty - keep mixture = 1 (lasso regression)
glmnet_spec <- multinom_reg(penalty = tune(), mixture = 1) |>
  set_mode("classification") |>
  set_engine("glmnet")

Define workflow

Demonstration: Create the workflow for the model.

Sparse encoding

Recall that many of the cells in the prepared data frame contain 0s (e.g. token not used in a specific document). Regularized regression models with text features powered using set_engine("glmnet") can be more efficiently fit if we transform the data to a sparse matrix. This is done by specifying a non-default preprocessing blueprint using the {hardhat} package.

For more information, see this case study from SMLTAR.

# use sparse blueprint for more efficient model estimation
library(hardhat)
sparse_bp <- default_recipe_blueprint(composition = "dgCMatrix")

# define workflow
glmnet_wf <- workflow() |>
  add_recipe(glmnet_rec, blueprint = sparse_bp) |>
  add_model(glmnet_spec)

Tune the model

Demonstration: Create a tuning grid for the penalty parameter and tune the model.

# define tuning grid
penalty_grid <- grid_regular(penalty(range = c(-5, 0)), levels = 30)

# tune the model
set.seed(123)

tic()
glmnet_tune_rs <- tune_grid(
  object = glmnet_wf,
  resamples = leg_folds,
  grid = penalty_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
toc()

Your turn: Examine the performance of the model, both overall and using the confusion matrix. Which misclassifications are most common? Why might the model have a hard time discriminating between these policy labels?

# view average metrics
autoplot(glmnet_tune_rs)

# identify best models based on assessment set
show_best(x = glmnet_tune_rs, metric = "roc_auc")
show_best(x = glmnet_tune_rs, metric = "accuracy")

# confusion matrix for best penalty value
conf_mat_resampled(
  x = glmnet_tune_rs,
  parameters = select_best(glmnet_tune_rs, metric = "roc_auc"),
  tidy = FALSE
) |>
  autoplot(type = "heatmap")

# most frequent misclassifications (average)
conf_mat_resampled(
  x = glmnet_tune_rs,
  parameters = select_best(glmnet_tune_rs, metric = "roc_auc")
) |>
  # filter out correct predictions and sort by frequency
  filter(Prediction != Truth) |>
  arrange(-Freq)

Add response here.

Variable importance

Your turn: Calculate which tokens are used by the model to predict each of the policy labels. Visualize the top 10 tokens for each policy label. Do these tokens make sense as useful predictors?

# fit the best model
glmnet_best <- glmnet_tune_rs |>
  fit_best()

# multiclass classification - need to manually extract coefficients for each
# outcome class
lasso_vip <- coef(extract_fit_engine(glmnet_best),
  s = select_best(x = glmnet_tune_rs, metric = "roc_auc")$penalty
) |>
  # need first element from each sublist
  map(\(x) x[, 1L]) |>
  # convert to a data frame and extract the relevant pieces of information
  enframe() |>
  unnest_longer(value) |>
  rename(
    token = value_id,
    importance = value,
    class = name
  ) |>
  # ignore the intercept
  filter(token != "(Intercept)") |>
  # clean up data to focus purely on magnitude of coefficients
  mutate(
    sign = ifelse(sign(importance) == 1, "POS", "NEG"),
    importance = abs(importance),
    token = str_remove_all(token, "tfidf_description_")
  ) |>
  # remove anything with importance of 0
  filter(importance != 0)

# visualize the top 10 tokens for each class
lasso_vip |>
  # keep the top 10 coefficients for each
  filter(sign == "POS") |>
  slice_max(n = 10, order_by = importance, by = class) |>
  # change order of token levels for plotting
  mutate(token = reorder_within(token, by = importance, within = class)) |>
  ggplot(mapping = aes(
    x = importance,
    y = token
  )) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(
    facets = vars(class),
    scales = "free",
    ncol = 4,
    labeller = labeller(class = label_wrap_gen(20))
  ) +
  labs(
    title = "Most important tokens for each policy class",
    x = "Importance",
    y = NULL
  )

Add response here.

\(n\)-grams

This model will be similar to the first one, but we will now utilize \(n\)-grams to capture more complex relationships between words.

Define recipe

Your turn: Modify your previous feature engineering recipe to include \(n\)-grams, for \(n \in 1, 2, 3, 4\).

ngram_rec <- recipe(policy_lab ~ description, data = leg_train) |>
  # tokenize and prep text
  TODO |>
  step_downsample(policy_lab)
ngram_rec

ngram_rec |>
  prep() |>
  bake(new_data = NULL)

Define workflow

Demonstration: Create the workflow for the model, reusing the previous model specification and a sparse blueprint.

# define workflow
ngram_wf <- workflow() |>
  add_recipe(ngram_rec, blueprint = sparse_bp) |>
  add_model(glmnet_spec)

Tune the model

Demonstration: Tune the model.

# tune the model
set.seed(123)

tic()
ngram_tune_rs <- tune_grid(
  object = ngram_wf,
  resamples = leg_folds,
  grid = penalty_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
toc()

# view average metrics
autoplot(ngram_tune_rs)

# identify best models based on assessment set
show_best(x = ngram_tune_rs, metric = "roc_auc")
show_best(x = ngram_tune_rs, metric = "accuracy")

# confusion matrix for best penalty value
conf_mat_resampled(
  x = ngram_tune_rs,
  parameters = select_best(ngram_tune_rs, metric = "roc_auc"),
  tidy = FALSE
) |>
  autoplot(type = "heatmap")

# most frequent misclassifications (average)
conf_mat_resampled(
  x = ngram_tune_rs,
  parameters = select_best(glmnet_tune_rs, metric = "roc_auc")
) |>
  # filter out correct predictions and sort by frequency
  filter(Prediction != Truth) |>
  arrange(-Freq)

Add response here.

Variable importance

Your turn: Calculate which tokens are used by the model to predict each of the policy labels. Visualize the top 10 tokens for each policy label. Do these tokens make sense as useful predictors?

# fit the best model
ngram_best <- ngram_tune_rs |>
  fit_best()

# multiclass classification - need to manually extract coefficients for each
# outcome class
ngram_vip <- coef(extract_fit_engine(ngram_best),
  s = select_best(x = ngram_tune_rs, metric = "roc_auc")$penalty
) |>
  # need first element from each sublist
  map(\(x) x[, 1L]) |>
  # convert to a data frame and extract the relevant pieces of information
  enframe() |>
  unnest_longer(value) |>
  rename(
    token = value_id,
    importance = value,
    class = name
  ) |>
  # ignore the intercept
  filter(token != "(Intercept)") |>
  # clean up data to focus purely on magnitude of coefficients
  mutate(
    sign = ifelse(sign(importance) == 1, "POS", "NEG"),
    importance = abs(importance),
    token = str_remove_all(token, "tfidf_description_")
  ) |>
  # remove anything with importance of 0
  filter(importance != 0)

# visualize the top 10 tokens for each class
ngram_vip |>
  # keep the top 10 coefficients for each
  filter(sign == "POS") |>
  slice_max(n = 10, order_by = importance, by = class) |>
  # change order of token levels for plotting
  mutate(token = reorder_within(token, by = importance, within = class)) |>
  ggplot(mapping = aes(
    x = importance,
    y = token
  )) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(
    facets = vars(class),
    scales = "free",
    ncol = 4,
    labeller = labeller(class = label_wrap_gen(20))
  ) +
  labs(
    title = "Most important tokens for each policy class",
    x = "Importance",
    y = NULL
  )

Add response here.

Word embeddings

Finally we will fit a lasso regression model using word embeddings extracted from the GLoVE model to represent each description.

Define recipe

Your turn: Import the GLoVE 6b embeddings for 100 dimensions. Define a recipe that uses these embeddings to predict policy_lab.

Note

Normally in order to import GloVe embeddings you would use the code below:

glove_embed <- embedding_glove6b(dimensions = 100)

This downloads the ZIP file containing the embeddings, stores it in a cache folder, and then imports the requested embeddings and dimensions as a data frame. Note that many of the embeddings are stored in ZIP files that are multiple gigabytes in size. Often it is easier to manually download the files and store them in the appropriate location outside of R. See the documentation for embedding_glove*() for more information.

# ##### uncomment if you are running RStudio on your personal computer
# # extract 100 dimensions from GLoVE
# glove6b <- embedding_glove6b(dimensions = 100)

# ##### uncomment if you are running RStudio on the Workbench
# # hacky way to make it work on RStudio Workbench
# glove6b <- read_delim(
#     file = "/rstudio-files/glove6b/glove.6B.100d.txt",
#     delim = " ",
#     quote = "",
#     col_names = c(
#       "token",
#       paste0("d", seq_len(100))
#     ),
#     col_types = paste0(
#       c(
#         "c",
#         rep("d", 100)
#       ),
#       collapse = ""
#     )
#   )

# initialize recipe
embed_rec <- recipe(policy_lab ~ description, data = leg_train) |>
  # tokenize
  step_tokenize(description) |>
  # convert to word embeddings
  TODO |>
  # normalize the columns
  step_zv(all_predictors()) |>
  step_normalize(all_predictors()) |>
  # downsample to keep same number of rows for each policy focus
  step_downsample(policy_lab)

embed_rec |>
  prep() |>
  bake(new_data = NULL)

Tune the penalized regression model

Demonstration: Tune the model.

Note

Since this model uses word embeddings, we will not use the sparse blueprint for the recipe. The embeddings are already in a dense format.

# define workflow
embed_wf <- workflow() |>
  add_recipe(embed_rec) |>
  add_model(glmnet_spec)

tic()
embed_tune_rs <- tune_grid(
  object = embed_wf,
  resamples = leg_folds,
  grid = penalty_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
toc()

# view average metrics
autoplot(embed_tune_rs)

# identify best models based on assessment set
show_best(x = embed_tune_rs, metric = "roc_auc")
show_best(x = embed_tune_rs, metric = "accuracy")

# confusion matrix for best penalty value
conf_mat_resampled(
  x = embed_tune_rs,
  parameters = select_best(embed_tune_rs, metric = "roc_auc"),
  tidy = FALSE
) |>
  autoplot(type = "heatmap")

# most frequent misclassifications (average)
conf_mat_resampled(
  x = embed_tune_rs,
  parameters = select_best(glmnet_tune_rs, metric = "roc_auc")
) |>
  # filter out correct predictions and sort by frequency
  filter(Prediction != Truth) |>
  arrange(-Freq)

Add response here.

Variable importance

Your turn: Calculate which dimensions are used by the model to predict each of the policy labels. Visualize the top 10 dimensions for each policy label. How useful is this analysis?

# fit the best model
embed_best <- embed_tune_rs |>
  fit_best()

# multiclass classification - need to manually extract coefficients for each
# outcome class
embed_vip <- coef(extract_fit_engine(embed_best),
  s = select_best(x = embed_tune_rs, metric = "roc_auc")$penalty
) |>
  # need first element from each sublist
  map(\(x) x[, 1L]) |>
  # convert to a data frame and extract the relevant pieces of information
  enframe() |>
  unnest_longer(value) |>
  rename(
    token = value_id,
    importance = value,
    class = name
  ) |>
  # ignore the intercept
  filter(token != "(Intercept)") |>
  # clean up data to focus purely on magnitude of coefficients
  mutate(
    sign = ifelse(sign(importance) == 1, "POS", "NEG"),
    importance = abs(importance),
    token = str_remove_all(token, "wordembed_description_")
  ) |>
  # remove anything with importance of 0
  filter(importance != 0)

# visualize the top 10 tokens for each class
embed_vip |>
  # keep the top 10 coefficients for each
  filter(sign == "POS") |>
  slice_max(n = 10, order_by = importance, by = class) |>
  # change order of token levels for plotting
  mutate(token = reorder_within(token, by = importance, within = class)) |>
  ggplot(mapping = aes(
    x = importance,
    y = token
  )) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(
    facets = vars(class),
    scales = "free",
    ncol = 4,
    labeller = labeller(class = label_wrap_gen(20))
  ) +
  labs(
    title = "Most important dimensions for each policy class",
    x = "Importance",
    y = NULL
  )

Add response here.