AE 14: Predicting legislative policy attention using a dense neural network
Application exercise
Note
Some chunks contain TODOs that must be replaced
with the appropriate values. Fill these in and take careful notes on the code/output as we work through the document.
Load packages
Import data
leg <- read_parquet(file = "data/legislation.parquet")
Split data for model fitting
# ensure a reproducible train/test partition
set.seed(123)

# reserve 75% of rows for training, stratifying on the outcome so the
# distribution of policy topics is similar in both sets
leg_split <- initial_split(data = leg, prop = 0.75, strata = policy_lab)
legislation_train <- training(leg_split)
legislation_test <- testing(leg_split)
Typical length of legislative description
# tokenize the descriptions, drop common stop words, and visualize how many
# substantive words each bill's description contains
legislation_train |>
  unnest_tokens(word, description) |>
  anti_join(stop_words) |>
  count(id) |>
  ggplot(aes(n)) +
  geom_histogram(binwidth = 10, color = "white") +
  geom_rug() +
  labs(
    x = "Number of words per description",
    y = "Number of bills"
  )
Simple dense neural network
Preprocessing
# vocabulary size: keep only the most frequent tokens
max_words <- 2e4
# sequence length: pad/truncate every description to this many tokens
# (most descriptions are short per the histogram above)
max_length <- 30

# generate basic recipe for sequential one-hot encoding
leg_rec <- recipe(policy_lab ~ description, data = legislation_train) |>
  step_tokenize(description) |>
  step_stopwords(description) |>
  step_stem(description) |>
  # retain only the max_words most frequent tokens
  step_tokenfilter(description, max_tokens = max_words) |>
  # encode each description as a fixed-length sequence of token indices
  step_sequence_onehot(description, sequence_length = max_length) |>
  # convert policy_lab to a zero-based integer encoding (required by keras)
  step_integer(policy_lab, zero_based = TRUE)
# inspect the recipe specification
leg_rec

# estimate the feature engineering steps from the training data
leg_prep <- prep(leg_rec)

# preview the transformed output on a handful of training rows
bake(leg_prep, new_data = slice(legislation_train, 1:5))
# extract just the integer-encoded outcome column from the prepped data
leg_train_outcome <- bake(leg_prep, new_data = NULL, starts_with("policy_lab"))

# class weights: the share of bills in each policy topic, as a named list
leg_train_weights <- leg_train_outcome |>
  count(policy_lab) |>
  mutate(pct = n / sum(n)) |>
  select(-n) |>
  deframe() |>
  as.list()

# one-hot encode the outcome for the softmax output layer
leg_train_outcome <- to_categorical(leg_train_outcome$policy_lab)
dim(leg_train_outcome)
head(leg_train_outcome)

# feature matrix: every column except the outcome
leg_train <- bake(
  leg_prep,
  new_data = NULL,
  composition = "matrix",
  -starts_with("policy_lab")
)
dim(leg_train)
head(leg_train)
Simple flattened dense network
# input: one sequence of max_length token indices per bill
dense_model <- keras_model_sequential(input_shape = c(max_length)) |>
  # learn an embedding for every token in the vocabulary;
  # + 1 accounts for index 0 used by step_sequence_onehot for padding
  layer_embedding(input_dim = max_words + 1,
                  output_dim = 64) |>
  # collapse the (max_length x output_dim) embedding matrix to one vector
  layer_flatten() |>
  layer_dense(units = 64, activation = "relu") |>
  # one probability per policy topic (20 classes)
  layer_dense(units = 20, activation = "softmax")
summary(dense_model)
summary(dense_model)
Pre-trained word embeddings
Prep embeddings
# download and extract embeddings
# ##### uncomment if you are running RStudio on your personal computer
# # extract 100 dimensions from GLoVE
# glove6b <- embedding_glove6b(dimensions = 100)
# ##### uncomment if you are running RStudio on the Workbench
# # hacky way to make it work on RStudio Workbench
# glove6b <- read_delim(
# file = "/rstudio-files/glove6b/glove.6B.100d.txt",
# delim = " ",
# quote = "",
# col_names = c(
# "token",
# paste0("d", seq_len(100))
# ),
# col_types = paste0(
# c(
# "c",
# rep("d", 100)
# ),
# collapse = ""
# )
# )
# filter to only include tokens in the policy description vocab;
# step 5 of the recipe is step_sequence_onehot, whose tidy() method
# returns the retained vocabulary
glove6b_matrix <- tidy(leg_prep, 5) |>
  select(token) |>
  # explicit join key: avoids the implicit-join message and guards against
  # accidental matches on other shared column names
  left_join(glove6b, by = "token") |>
  # add row to capture all tokens not in GLoVE (index 0 = padding/unknown)
  add_row(.before = 1) |>
  # tokens missing from GLoVE get a zero vector in every dimension
  mutate(across(.cols = starts_with("d"), .fns = \(x) replace_na(data = x, replace = 0))) |>
  select(-token) |>
  as.matrix()
Train model
# declare model specification
# declare model specification: same architecture as dense_model, but the
# embedding layer is initialized with pre-trained GloVe vectors and frozen
dense_model_pte <- keras_model_sequential(input_shape = c(max_length)) |>
  # input_dim must equal nrow(glove6b_matrix) = max_words + 1;
  # output_dim = 100 matches the 100-dimensional GloVe embeddings
  layer_embedding(input_dim = max_words + 1,
                  output_dim = 100,
                  weights = glove6b_matrix,
                  # freeze the pre-trained embeddings during training
                  trainable = FALSE) |>
  layer_flatten() |>
  layer_dense(units = 64, activation = "relu") |>
  layer_dense(units = 20, activation = "softmax")
summary(dense_model_pte)
Allow weights to adjust
# declare model specification
# declare model specification: identical to dense_model_pte except the
# embedding weights are left trainable (default), so the GloVe vectors
# serve as a starting point and are fine-tuned during training
dense_model_pte2 <- keras_model_sequential(input_shape = c(max_length)) |>
  # input_dim must equal nrow(glove6b_matrix) = max_words + 1;
  # output_dim = 100 matches the 100-dimensional GloVe embeddings
  layer_embedding(input_dim = max_words + 1,
                  output_dim = 100,
                  weights = glove6b_matrix) |>
  layer_flatten() |>
  layer_dense(units = 64, activation = "relu") |>
  layer_dense(units = 20, activation = "softmax")
summary(dense_model_pte2)