AE 14: Predicting legislative policy attention using a dense neural network
Application exercise
Note
Some chunks contain TODOs that must be replaced
with the appropriate values. Fill these in and take careful notes on the code/output as we work through the document.
Load packages
Import data
leg <- read_parquet(file = "data/legislation.parquet")
Split data for model fitting
# ensure a reproducible train/test partition
set.seed(123)

# reserve 75% of rows for training, stratifying on the outcome so the
# distribution of policy topics is similar in both sets
leg_split <- initial_split(data = leg, prop = 0.75, strata = policy_lab)
legislation_train <- training(leg_split)
legislation_test <- testing(leg_split)
Typical length of legislative description
# tokenize the descriptions, drop common stop words, and visualize how many
# substantive words each bill's description contains
legislation_train |>
  unnest_tokens(word, description) |>
  anti_join(stop_words) |>
  count(id) |>
  ggplot(aes(n)) +
  geom_histogram(binwidth = 10, color = "white") +
  geom_rug() +
  labs(
    x = "Number of words per description",
    y = "Number of bills"
  )
Simple dense neural network
Preprocessing
# vocabulary size: keep only the most frequent tokens
max_words <- 2e4
# sequence length: pad/truncate every description to this many tokens
# (most descriptions are short per the histogram above)
max_length <- 30

# generate basic recipe for sequential one-hot encoding
leg_rec <- recipe(policy_lab ~ description, data = legislation_train) |>
  step_tokenize(description) |>
  step_stopwords(description) |>
  step_stem(description) |>
  # retain only the max_words most frequent tokens
  step_tokenfilter(description, max_tokens = max_words) |>
  # encode each description as a fixed-length sequence of token indices
  step_sequence_onehot(description, sequence_length = max_length) |>
  # convert policy_lab to a zero-based integer encoding (required by keras)
  step_integer(policy_lab, zero_based = TRUE)
# inspect the recipe specification
leg_rec

# estimate the feature engineering steps from the training data
leg_prep <- prep(leg_rec)

# preview the transformed output on a handful of training rows
bake(leg_prep, new_data = slice(legislation_train, 1:5))
# extract just the integer-encoded outcome column from the prepped data
leg_train_outcome <- bake(leg_prep, new_data = NULL, starts_with("policy_lab"))

# class weights: the share of bills in each policy topic, as a named list
leg_train_weights <- leg_train_outcome |>
  count(policy_lab) |>
  mutate(pct = n / sum(n)) |>
  select(-n) |>
  deframe() |>
  as.list()

# one-hot encode the outcome for the softmax output layer
leg_train_outcome <- to_categorical(leg_train_outcome$policy_lab)
dim(leg_train_outcome)
head(leg_train_outcome)

# feature matrix: every column except the outcome
leg_train <- bake(
  leg_prep,
  new_data = NULL,
  composition = "matrix",
  -starts_with("policy_lab")
)
dim(leg_train)
head(leg_train)
Simple flattened dense network
# input: one sequence of max_length token indices per bill
dense_model <- keras_model_sequential(input_shape = c(max_length)) |>
  # learn an embedding for every token in the vocabulary;
  # + 1 accounts for index 0 used by step_sequence_onehot for padding
  layer_embedding(input_dim = max_words + 1,
                  output_dim = 64) |>
  # collapse the (max_length x output_dim) embedding matrix to one vector
  layer_flatten() |>
  layer_dense(units = 64, activation = "relu") |>
  # one probability per policy topic (20 classes)
  layer_dense(units = 20, activation = "softmax")
summary(dense_model)
summary(dense_model)
Pre-trained word embeddings
Prep embeddings
# download and extract embeddings
# ##### uncomment if you are running RStudio on your personal computer
# # extract 100 dimensions from GLoVE
# glove6b <- embedding_glove6b(dimensions = 100)
# ##### uncomment if you are running RStudio on the Workbench
# # hacky way to make it work on RStudio Workbench
# glove6b <- read_delim(
# file = "/rstudio-files/glove6b/glove.6B.100d.txt",
# delim = " ",
# quote = "",
# col_names = c(
# "token",
# paste0("d", seq_len(100))
# ),
# col_types = paste0(
# c(
# "c",
# rep("d", 100)
# ),
# collapse = ""
# )
# )
# filter to only include tokens in the policy description vocab;
# step 5 of the recipe is step_sequence_onehot, whose tidy() method
# returns the retained vocabulary
glove6b_matrix <- tidy(leg_prep, 5) |>
  select(token) |>
  # explicit join key: avoids the implicit-join message and guards against
  # accidental matches on other shared column names
  left_join(glove6b, by = "token") |>
  # add row to capture all tokens not in GLoVE (index 0 = padding/unknown)
  add_row(.before = 1) |>
  # tokens missing from GLoVE get a zero vector in every dimension
  mutate(across(.cols = starts_with("d"), .fns = \(x) replace_na(data = x, replace = 0))) |>
  select(-token) |>
  as.matrix()
Train model
# declare model specification
# declare model specification: same architecture as dense_model, but the
# embedding layer is initialized with pre-trained GloVe vectors and frozen
dense_model_pte <- keras_model_sequential(input_shape = c(max_length)) |>
  # input_dim must equal nrow(glove6b_matrix) = max_words + 1;
  # output_dim = 100 matches the 100-dimensional GloVe embeddings
  layer_embedding(input_dim = max_words + 1,
                  output_dim = 100,
                  weights = glove6b_matrix,
                  # freeze the pre-trained embeddings during training
                  trainable = FALSE) |>
  layer_flatten() |>
  layer_dense(units = 64, activation = "relu") |>
  layer_dense(units = 20, activation = "softmax")
summary(dense_model_pte)
Allow weights to adjust
# declare model specification
# declare model specification: identical to dense_model_pte except the
# embedding weights are left trainable (default), so the GloVe vectors
# serve as a starting point and are fine-tuned during training
dense_model_pte2 <- keras_model_sequential(input_shape = c(max_length)) |>
  # input_dim must equal nrow(glove6b_matrix) = max_words + 1;
  # output_dim = 100 matches the 100-dimensional GloVe embeddings
  layer_embedding(input_dim = max_words + 1,
                  output_dim = 100,
                  weights = glove6b_matrix) |>
  layer_flatten() |>
  layer_dense(units = 64, activation = "relu") |>
  layer_dense(units = 20, activation = "softmax")
summary(dense_model_pte2)