AE 15: Deploying models to the cloud using Docker

Suggested answers

Application exercise
Answers
R
Python
Modified

October 27, 2025

Load the data

library(tidyverse)
library(vetiver)

# Import the Tompkins home sales data and preview each column's type
sales_path <- "data/tompkins-home-sales.csv"
housing <- read_csv(sales_path)
glimpse(housing)
Rows: 1,225
Columns: 12
$ sold_date    <date> 2022-09-12, 2022-09-12, 2022-09-12, 2022-09-13, 2022-03-…
$ price        <dbl> 340000, 390000, 625500, 246600, 205000, 230000, 246000, 3…
$ beds         <dbl> 2, 4, 2, 2, 2, 5, 5, 3, 5, 3, 2, 2, 4, 3, 5, 4, 3, 4, 3, …
$ baths        <dbl> 3.0, 3.0, 3.0, 1.5, 1.0, 2.0, 2.0, 2.5, 4.0, 1.0, 1.5, 2.…
$ area         <dbl> 1864, 3252, 1704, 1264, 820, 2900, 2364, 2016, 2882, 1246…
$ lot_size     <dbl> 4.50000000, 0.33999082, 65.00000000, 0.21000918, 0.239990…
$ year_built   <dbl> 1999, 1988, 1988, 1953, 1932, 1850, 1985, 1984, 2002, 196…
$ hoa_month    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ town         <chr> "Newfield", "Ithaca", "Dryden", "Ithaca", "Ithaca", "Lans…
$ municipality <chr> "Unincorporated", "Unincorporated", "Unincorporated", "It…
$ long         <dbl> -76.59488, -76.45546, -76.35953, -76.52435, -76.48761, -7…
$ lat          <dbl> 42.38609, 42.47046, 42.43971, 42.45208, 42.42739, 42.6182…
import pandas as pd
housing = pd.read_csv('data/tompkins-home-sales.csv')

Build a model

  • Log transform the price variable
  • Split into training/test set
library(tidymodels)
# Fix the RNG state so the train/test partition is reproducible
set.seed(123)

# Put price on the log10 scale, then hold out 20% of the sales for testing
housing_log <- mutate(housing, price = log10(price))
housing_split <- initial_split(housing_log, prop = 0.8)

housing_train <- training(housing_split)
housing_test <- testing(housing_split)
from sklearn import model_selection
import numpy as np
# Seed NumPy's global RNG for any later code that still relies on it
np.random.seed(123)

# Predictors used by the model; the outcome is price on the log10 scale
X = housing[["beds", "baths", "area", "year_built", "town"]]
y = np.log10(housing["price"])

# Pass the seed to the splitter explicitly so the 80/20 split is reproducible
# even when this cell is re-run on its own or after other code has drawn
# random numbers from the global RNG
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
)

Train a random forest model:

# Preprocessing: impute missing numeric predictors with the mean and missing
# nominal predictors with the mode
rf_rec <- recipe(
  price ~ beds + baths + area + year_built + town,
  data = housing_train
)
rf_rec <- step_impute_mean(rf_rec, all_numeric_predictors())
rf_rec <- step_impute_mode(rf_rec, all_nominal_predictors())

# Model: a 200-tree random forest in regression mode
rf_spec <- rand_forest(trees = 200, mode = "regression")

# Bundle the recipe and model in one workflow and fit it on the training set
housing_wflow <- workflow()
housing_wflow <- add_recipe(housing_wflow, rf_rec)
housing_wflow <- add_model(housing_wflow, rf_spec)
housing_fit <- fit(housing_wflow, data = housing_train)
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define feature columns
# Four numeric predictors plus "town" as the only categorical predictor
numeric_features = ["beds", "baths", "area", "year_built"]
categorical_features = ["town"]

# Create preprocessing steps
# Numeric columns: replace missing values with the column mean
numeric_transformer = SimpleImputer(strategy="mean")
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown="ignore" is set so that a category not
# seen when the encoder was fit does not raise at prediction time
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors
# Each transformer is applied only to the columns listed alongside it
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Create pipeline with preprocessor and model
# 200 trees with a fixed random_state so the fitted forest is reproducible
housing_fit = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=200, random_state=123))
])

# Prepare training data with all features
# NOTE(review): X_train was built from these same five columns, so
# X_train_full appears to duplicate X_train -- confirm before simplifying
X_train_full = housing.loc[X_train.index, numeric_features + categorical_features]
# Fit preprocessing and the forest together on the training rows
housing_fit.fit(X_train_full, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['beds', 'baths', 'area',
                                                   'year_built']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['town'])])),
                ('regressor',
                 RandomForestRegressor(n_estimators=200, random_state=123))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Create a Docker container using a local board

Pin model to a local board

library(pins)

# Wrap the fitted workflow in a vetiver model object named "tompkins-housing"
v <- vetiver_model(housing_fit, model_name = "tompkins-housing")
v

── tompkins-housing ─ <bundled_workflow> model for deployment 
A ranger regression modeling workflow using 5 features
# Create a versioned local pin board, store the model on it, then read back
# the metadata recorded for the pin
board <- board_local(versioned = TRUE)

vetiver_pin_write(board, v)

pin_meta(board, "tompkins-housing")
List of 13
 $ file       : chr "tompkins-housing.rds"
 $ file_size  : 'fs_bytes' int 736K
 $ pin_hash   : chr "2277f3cca9bfb8a2"
 $ type       : chr "rds"
 $ title      : chr "tompkins-housing: a pinned list"
 $ description: chr "A ranger regression modeling workflow"
 $ tags       : NULL
 $ urls       : NULL
 $ created    : POSIXct[1:1], format: "2025-10-27 09:35:16"
 $ api_version: int 1
 $ user       :List of 2
  ..$ required_pkgs: chr [1:4] "parsnip" "ranger" "recipes" "workflows"
  ..$ renv_lock    : NULL
 $ name       : chr "tompkins-housing"
 $ local      :List of 3
  ..$ dir    : 'fs_path' chr "~/Library/Application Support/pins/tompkins-housing/20251027T133516Z-2277f"
  ..$ url    : NULL
  ..$ version: chr "20251027T133516Z-2277f"
from pins import board_local
from vetiver import vetiver_pin_write, VetiverModel

# Versioned local pin board with pickle reads enabled
board = board_local(versioned=True, allow_pickle_read=True)

# Bundle the fitted pipeline with the training predictors as prototype data,
# then pin it to the board
v = VetiverModel(
    housing_fit,
    "tompkins-housing",
    prototype_data=X_train,
)
vetiver_pin_write(board, v)
Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
('The hash of pin "tompkins-housing" has not changed. Your pin will not be stored.',)
board.pin_meta("tompkins-housing")
Meta(title='tompkins-housing: a pinned Pipeline object', description='A scikit-learn Pipeline model', created='20251020T104518Z', pin_hash='861faa60ce18719a', file='tompkins-housing.joblib', file_size=17553843, type='joblib', api_version=1, version=Version(created=datetime.datetime(2025, 10, 20, 10, 45, 18), hash='861fa'), tags=None, name='tompkins-housing', user={'user': {}, 'vetiver_meta': {'prototype': '{"beds": 3, "baths": 3.0, "area": 3015, "year_built": 1932, "town": "Ithaca"}', 'python_version': [3, 13, 6, 'final', 0], 'required_pkgs': ['scikit-learn']}}, local={})

Create Docker artifacts

# Generate the Docker artifacts for the pinned "tompkins-housing" model.
# docker_args requests port 8080, matching the `docker run -p 8080:8080`
# command used to start the container.
vetiver_prepare_docker(
  board,
  "tompkins-housing",
  docker_args = list(port = 8080)
)
import vetiver

# Generate the Docker artifacts for the pinned "tompkins-housing" model.
# Port 8080 matches the `docker run -p 8080:8080` command used to start
# the container.
vetiver.prepare_docker(
    board, 
    "tompkins-housing",
    port = 8080
)

Build and test Docker container

Run these commands in the Terminal tab of Positron or your local terminal, replacing <NETID> with your actual NetID:

docker build -t housing-<NETID> .
docker run -p 8080:8080 housing-<NETID>
Use your own NetID

Students using Posit Workbench are on a shared server where everyone builds and runs containers on the same machine. You need to give your container a unique name to avoid conflicts with other users. Likewise, you need to ensure that the port number on which you are broadcasting the API is unique.

Run these commands in the Terminal tab of Positron or your local terminal:

docker build -t housing .
docker run -p 8080:8080 housing

Create a Docker container using a cloud board

Pin to Google Cloud

In .Renviron:

GCS_AUTH_FILE="service-auth.json"
library(googleCloudStorageR)

# Point the board at the course bucket under your own prefix (replace <NETID>
# with your NetID), then store the model and read back the pin's metadata
board <- board_gcs(bucket = "info-4940-models", prefix = "<NETID>/")

vetiver_pin_write(board, v)

pin_meta(board, "tompkins-housing")

In .env:

GOOGLE_APPLICATION_CREDENTIALS="service-auth.json"
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

from pins import board_gcs
from vetiver import VetiverModel, vetiver_pin_write

# Connect to the course bucket under your own prefix (replace <NETID> with
# your NetID), with pickle reads enabled
board = board_gcs(
    "info-4940-models/<NETID>/",
    cache=None,
    allow_pickle_read=True,
)

# Store the model on the cloud board
vetiver_pin_write(board, v)

# Read back the metadata recorded for the pin
board.pin_meta("tompkins-housing")

Create Docker artifacts

# Generate the Docker artifacts for the "tompkins-housing" model pinned on
# the cloud board. docker_args requests port 8080, matching the
# `docker run -p 8080:8080` command used to start the container.
vetiver_prepare_docker(
  board,
  "tompkins-housing",
  docker_args = list(port = 8080)
)
# Generate the Docker artifacts for the "tompkins-housing" model pinned on
# the cloud board. Port 8080 matches the `docker run -p 8080:8080` command
# used to start the container.
vetiver.prepare_docker(
    board, 
    "tompkins-housing",
    port = 8080
)

Build and test Docker container

Run these commands in the Terminal tab of Positron or your local terminal, replacing <NETID> with your actual NetID:

docker build -t housing-<NETID> .
docker run -p 8080:8080 housing-<NETID>

Run these commands in the Terminal tab of Positron or your local terminal:

docker build -t housing .
docker run -p 8080:8080 housing

Modify Dockerfile to include GCS authentication

  • Modify plumber.R to load {googleCloudStorageR}

    # Regenerate plumber.R for the pinned model; additional_pkgs names
    # googleCloudStorageR so that the generated file loads it
    vetiver_write_plumber(
      board,
      "tompkins-housing",
      file = "plumber.R",
      additional_pkgs = "googleCloudStorageR"
    )
  • Copy service-auth.json to same directory as Dockerfile

  • Modify Dockerfile to correctly incorporate service-auth.json. After the `RUN apt-get` step, add the following lines:

    # Copy the service-account key into the image and point GCS_AUTH_FILE at it.
    # NOTE: this embeds the credential in the image; do not push the image to a
    # public registry or share it outside the course.
    COPY service-auth.json /opt/ml/service-auth.json
    ENV GCS_AUTH_FILE="/opt/ml/service-auth.json"
  • Modify vetiver_requirements.txt to include the gcsfs dependency

    gcsfs
  • Copy service-auth.json to same directory as Dockerfile

  • Modify Dockerfile to correctly incorporate service-auth.json. After the COPY app.py step, add the following lines

    # Copy the service-account key into the image and point
    # GOOGLE_APPLICATION_CREDENTIALS at it.
    # NOTE: this embeds the credential in the image; do not push the image to a
    # public registry or share it outside the course.
    COPY service-auth.json /vetiver/app/service-auth.json
    ENV GOOGLE_APPLICATION_CREDENTIALS="/vetiver/app/service-auth.json"

Build and test Docker container

Run these commands in the Terminal tab of Positron or your local terminal, replacing <NETID> with your actual NetID:

docker build -t housing-<NETID> .
docker run -p 8080:8080 housing-<NETID>

Run these commands in the Terminal tab of Positron or your local terminal:

docker build -t housing .
docker run -p 8080:8080 housing

Test the API

# Address of the /predict route on port 8080
predict_url <- "http://0.0.0.0:8080/predict"
endpoint <- vetiver_endpoint(predict_url)

# Send the held-out sales to the endpoint
predict(endpoint, housing_test)
from vetiver.server import predict, vetiver_endpoint

# Address of the /predict route on port 8080
url = "http://0.0.0.0:8080/predict"
endpoint = vetiver_endpoint(url)

# Send the first five held-out rows to the endpoint
predict(endpoint=endpoint, data=X_test.head(5))

Compute model metrics and store in pin

# Add predictions to the held-out sales, then compute performance metrics
# comparing the predicted and observed (log10) prices
housing_test_preds <- augment(housing_fit, housing_test)
housing_test_metrics <- metrics(
  housing_test_preds,
  truth = price,
  estimate = .pred
)

# Rebuild the vetiver model, this time attaching the metrics as metadata
v <- vetiver_model(
  housing_fit,
  model_name = "tompkins-housing",
  metadata = list(metrics = housing_test_metrics)
)
v

── tompkins-housing ─ <bundled_workflow> model for deployment 
A ranger regression modeling workflow using 5 features
board |> vetiver_pin_write(v)
from sklearn import metrics

# Metric functions to evaluate; each takes (y_true, y_pred) and its function
# name is used as the label in the results table
metric_set = [
    metrics.root_mean_squared_error,
    metrics.r2_score,
    metrics.mean_absolute_error,
]
y_predictions = pd.Series(housing_fit.predict(X_test))

# Build the table in one step from a list of records rather than growing a
# DataFrame with pd.concat on every iteration; the result has the same
# "name"/"score" columns and a 0..n-1 index
housing_metrics = pd.DataFrame(
    [
        {"name": metric.__name__, "score": metric(y_test, y_predictions)}
        for metric in metric_set
    ]
)
housing_metrics
                      name     score
0  root_mean_squared_error  0.178120
1                 r2_score  0.577999
2      mean_absolute_error  0.131287
# generate vetiver model, attaching the test-set metrics table (converted to
# a plain dict) as metadata
v = VetiverModel(
    housing_fit,
    "tompkins-housing",
    prototype_data = X_train,
    metadata = housing_metrics.to_dict()
)

# write new version of pin with metrics metadata
# NOTE(review): the rendered output for this call reports that the pin hash
# "has not changed" and the pin "will not be stored", and the metrics later
# read back from the pin are empty. The metadata does not appear to reach the
# board when the model itself is unchanged -- confirm, and force a new pin
# version if the metrics need to be stored.
vetiver_pin_write(board, v)
Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
('The hash of pin "tompkins-housing" has not changed. Your pin will not be stored.',)

Retrieve model metrics

# Read the pin's metadata, pull out the stored metrics, and rebuild a tibble
housing_pin_meta <- pin_meta(board, "tompkins-housing")
extracted_metrics <- as_tibble(pluck(housing_pin_meta, "user", "metrics"))

extracted_metrics
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.175
2 rsq     standard       0.560
3 mae     standard       0.125
# Read the pin's metadata from the board
metadata = board.pin_meta("tompkins-housing")
# Rebuild a DataFrame from the "user" entry of the pin's user metadata
# (presumably where the metadata passed to VetiverModel is stored -- verify).
# NOTE(review): the rendered result is an empty DataFrame, so no metrics were
# found on the pin; check that the pin version carrying them was written.
extracted_metrics = pd.DataFrame(metadata.user.get("user"))
extracted_metrics
Empty DataFrame
Columns: []
Index: []

What else might you want to store as model metadata? How or when might you use model metadata?

Add response here.

Acknowledgments

sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.1 (2025-06-13)
 os       macOS Tahoe 26.0.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  C.UTF-8
 ctype    C.UTF-8
 tz       America/New_York
 date     2025-10-27
 pandoc   3.6.3 @ /Applications/Positron.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
 quarto   1.7.33 @ /Users/bcs88/Projects/info-4940/course-site/.venv/bin/quarto

─ Packages ───────────────────────────────────────────────────────────────────
 ! package      * version    date (UTC) lib source
 P archive        1.1.12     2025-03-20 [?] CRAN (R 4.5.0)
 P backports      1.5.0      2024-05-23 [?] RSPM (R 4.5.0)
 P bit            4.6.0      2025-03-06 [?] RSPM (R 4.5.0)
 P bit64          4.6.0-1    2025-01-16 [?] RSPM (R 4.5.0)
 P broom        * 1.0.9      2025-07-28 [?] RSPM (R 4.5.0)
 P bundle         0.1.2      2024-11-12 [?] RSPM
 P butcher        0.3.5      2025-03-18 [?] RSPM (R 4.5.1)
 P class          7.3-23     2025-01-01 [?] CRAN (R 4.5.1)
 P cli            3.6.5      2025-04-23 [?] RSPM (R 4.5.0)
 P codetools      0.2-20     2024-03-31 [?] CRAN (R 4.5.1)
 P crayon         1.5.3      2024-06-20 [?] RSPM (R 4.5.0)
 P data.table     1.17.8     2025-07-10 [?] RSPM (R 4.5.0)
 P dials        * 1.4.1      2025-07-29 [?] RSPM
 P DiceDesign     1.10       2023-12-07 [?] RSPM (R 4.5.0)
 P digest         0.6.37     2024-08-19 [?] RSPM (R 4.5.0)
 P dplyr        * 1.1.4      2023-11-17 [?] RSPM (R 4.5.0)
 P evaluate       1.0.4      2025-06-18 [?] RSPM (R 4.5.1)
 P farver         2.1.2      2024-05-13 [?] RSPM (R 4.5.0)
 P fastmap        1.2.0      2024-05-15 [?] RSPM (R 4.5.0)
 P forcats      * 1.0.0      2023-01-29 [?] RSPM (R 4.5.0)
 P foreach        1.5.2      2022-02-02 [?] RSPM
 P fs             1.6.6      2025-04-12 [?] RSPM (R 4.5.0)
 P furrr          0.3.1      2022-08-15 [?] RSPM
 P future         1.67.0     2025-07-29 [?] RSPM
 P future.apply   1.20.0     2025-06-06 [?] RSPM
 P generics       0.1.4      2025-05-09 [?] RSPM (R 4.5.0)
 P ggplot2      * 4.0.0      2025-09-11 [?] RSPM
 P globals        0.18.0     2025-05-08 [?] RSPM
 P glue           1.8.0      2024-09-30 [?] RSPM (R 4.5.0)
 P gower          1.0.2      2024-12-17 [?] RSPM
 P GPfit          1.0-9      2025-04-12 [?] RSPM (R 4.5.0)
 P gtable         0.3.6      2024-10-25 [?] RSPM (R 4.5.0)
 P hardhat        1.4.1      2025-01-31 [?] RSPM
 P here           1.0.1      2020-12-13 [?] RSPM (R 4.5.0)
 P hms            1.1.3      2023-03-21 [?] RSPM (R 4.5.0)
 P htmltools      0.5.8.1    2024-04-04 [?] RSPM (R 4.5.0)
 P htmlwidgets    1.6.4      2023-12-06 [?] RSPM (R 4.5.0)
 P infer        * 1.0.9      2025-06-26 [?] RSPM
 P ipred          0.9-15     2024-07-18 [?] RSPM
 P iterators      1.0.14     2022-02-05 [?] RSPM
 P jsonlite       2.0.0      2025-03-27 [?] RSPM (R 4.5.0)
 P knitr          1.50       2025-03-16 [?] RSPM (R 4.5.0)
 P lattice        0.22-7     2025-04-02 [?] CRAN (R 4.5.1)
 P lava           1.8.1      2025-01-12 [?] RSPM
 P lhs            1.2.0      2024-06-30 [?] RSPM (R 4.5.0)
 P lifecycle      1.0.4      2023-11-07 [?] RSPM (R 4.5.0)
 P listenv        0.9.1      2024-01-29 [?] RSPM
 P lubridate    * 1.9.4      2024-12-08 [?] RSPM (R 4.5.0)
 P magrittr       2.0.3      2022-03-30 [?] RSPM (R 4.5.1)
 P MASS           7.3-65     2025-02-28 [?] CRAN (R 4.5.1)
 P Matrix         1.7-3      2025-03-11 [?] CRAN (R 4.5.1)
 P modeldata    * 1.5.0      2025-07-31 [?] RSPM
 P nnet           7.3-20     2025-01-01 [?] CRAN (R 4.5.1)
 P parallelly     1.45.1     2025-07-24 [?] RSPM
 P parsnip      * 1.3.2      2025-05-28 [?] RSPM
 P pillar         1.11.0     2025-07-04 [?] RSPM (R 4.5.1)
 P pins         * 1.4.1      2025-04-30 [?] RSPM
 P pkgconfig      2.0.3      2019-09-22 [?] RSPM (R 4.5.0)
 P png            0.1-8      2022-11-29 [?] RSPM (R 4.5.0)
 P prodlim        2025.04.28 2025-04-28 [?] RSPM
 P purrr        * 1.1.0      2025-07-10 [?] RSPM (R 4.5.0)
 P R6             2.6.1      2025-02-15 [?] RSPM (R 4.5.0)
 P ranger         0.17.0     2024-11-08 [?] RSPM
 P rappdirs       0.3.3      2021-01-31 [?] RSPM (R 4.5.0)
 P RColorBrewer   1.1-3      2022-04-03 [?] RSPM (R 4.5.0)
 P Rcpp           1.1.0      2025-07-02 [?] RSPM (R 4.5.0)
 P readr        * 2.1.5      2024-01-10 [?] RSPM (R 4.5.0)
 P recipes      * 1.3.1      2025-05-21 [?] RSPM
   renv           1.1.5      2025-07-24 [1] RSPM (R 4.5.0)
 P reticulate     1.43.0     2025-07-21 [?] CRAN (R 4.5.0)
 P rlang          1.1.6      2025-04-11 [?] RSPM (R 4.5.0)
 P rmarkdown      2.29       2024-11-04 [?] RSPM
 P rpart          4.1.24     2025-01-07 [?] CRAN (R 4.5.1)
 P rprojroot      2.1.0      2025-07-12 [?] RSPM (R 4.5.0)
 P rsample      * 1.3.1      2025-07-29 [?] RSPM
 P rstudioapi     0.17.1     2024-10-22 [?] RSPM (R 4.5.0)
 P S7             0.2.0      2024-11-07 [?] RSPM (R 4.5.0)
 P scales       * 1.4.0      2025-04-24 [?] RSPM (R 4.5.0)
 P sessioninfo    1.2.3      2025-02-05 [?] RSPM (R 4.5.0)
 P sparsevctrs    0.3.4      2025-05-25 [?] RSPM
 P stringi        1.8.7      2025-03-27 [?] RSPM (R 4.5.0)
 P stringr      * 1.5.1      2023-11-14 [?] RSPM (R 4.5.1)
 P survival       3.8-3      2024-12-17 [?] CRAN (R 4.5.1)
 P tibble       * 3.3.0      2025-06-08 [?] RSPM (R 4.5.0)
 P tidymodels   * 1.3.0      2025-02-21 [?] RSPM
 P tidyr        * 1.3.1      2024-01-24 [?] RSPM (R 4.5.0)
 P tidyselect     1.2.1      2024-03-11 [?] RSPM (R 4.5.0)
 P tidyverse    * 2.0.0      2023-02-22 [?] RSPM (R 4.5.0)
 P timechange     0.3.0      2024-01-18 [?] RSPM (R 4.5.0)
 P timeDate       4041.110   2024-09-22 [?] RSPM
 P tune         * 1.3.0      2025-02-21 [?] RSPM
 P tzdb           0.5.0      2025-03-15 [?] RSPM (R 4.5.0)
 P utf8           1.2.6      2025-06-08 [?] RSPM (R 4.5.0)
 P vctrs          0.6.5      2023-12-01 [?] RSPM (R 4.5.0)
 P vetiver      * 0.2.5.9000 2025-10-21 [?] Github (rstudio/vetiver-r@11a6d97)
 P vroom          1.6.5      2023-12-05 [?] RSPM (R 4.5.1)
 P withr          3.0.2      2024-10-28 [?] RSPM (R 4.5.0)
 P workflows    * 1.2.0      2025-02-19 [?] RSPM
 P workflowsets * 1.1.1      2025-05-27 [?] RSPM
 P xfun           0.52       2025-04-02 [?] RSPM (R 4.5.1)
 P yaml           2.3.10     2024-07-26 [?] RSPM (R 4.5.0)
 P yardstick    * 1.3.2      2025-01-22 [?] RSPM

 [1] /Users/bcs88/Projects/info-4940/course-site/renv/library/macos/R-4.5/aarch64-apple-darwin20
 [2] /Users/bcs88/Library/Caches/org.R-project.R/R/renv/sandbox/macos/R-4.5/aarch64-apple-darwin20/4cd76b74

 * ── Packages attached to the search path.
 P ── Loaded and on-disk path mismatch.

─ Python configuration ───────────────────────────────────────────────────────
 python:         /Users/bcs88/Projects/info-4940/course-site/.venv/bin/python
 libpython:      /Users/bcs88/.local/share/uv/python/cpython-3.13.6-macos-aarch64-none/lib/libpython3.13.dylib
 pythonhome:     /Users/bcs88/Projects/info-4940/course-site/.venv:/Users/bcs88/Projects/info-4940/course-site/.venv
 virtualenv:     /Users/bcs88/Projects/info-4940/course-site/.venv/bin/activate_this.py
 version:        3.13.6 (main, Aug 14 2025, 16:07:26) [Clang 20.1.4 ]
 numpy:          /Users/bcs88/Projects/info-4940/course-site/.venv/lib/python3.13/site-packages/numpy
 numpy_version:  2.3.2
 
 NOTE: Python version was forced by VIRTUAL_ENV

──────────────────────────────────────────────────────────────────────────────