Lab 3: Databases and APIs

Published

2025-10-04

WarningCaution

This section is being revised. Thank you for your patience.

The original files in _labs/lab3 are below:

fs::dir_tree(path = "_labs/lab3/")
## _labs/lab3/
## ├── app-api.R
## ├── app-api.py
## ├── eda-db.qmd
## ├── model-db.qmd
## ├── py-data-load.py
## └── r-data-load.R
1
R Shiny app
2
Python Shiny app
3
R EDA Quarto file
4
Python EDA Quarto file
5
Database connection via Python
6
Database connection via R

In the previous labs, we build a Python model API with vetiver (I also repeated this lab in R). This lab continues with accessing databases and using APIs.

Comprehension questions

Databases

1. Draw two mental maps for connecting to a database, one using a database driver (in a Python or R package) vs. an ODBC or JDBC driver. You should (at a minimum) include the nodes: database package, DBI (R only), driver, system driver, ODBC, JDBC, and database.

Python and databases

In Python, there is a large database ecosystem (i.e., many driver options). ORM support with SQLAlchemy and it’s flexible (i.e., it can bypass standards if needed).

%%{init: {'theme': 'neutral', 'look': 'handDrawn', 'themeVariables': { 'fontFamily': 'monospace'}}}%%

graph TB
    subgraph PyApp["Python App"]
        PyCode(["<strong>app-api.py</strong>"])
    end

    style PyCode fill:#d8e4ff,color:#000
    
    subgraph PyPkg["<strong>Python Packages</strong>"]
        DBPkg("DB Package<br/><strong>psycopg2</strong>, <strong>pymysql</strong>, <strong>sqlite3</strong>")
        ODBCPkg("ODBC Package<br/><strong>pyodbc</strong>")
        JDBCPkg("JDBC Package<br/><strong>jaydebeapi</strong>")
    end
    
    %%% Tints of #35605a
    style PyPkg fill:#c2cfcd,color:#000

    subgraph Driver["<strong>Driver Layer</strong>"]
        NatDrv("Native Driver<br/>libpq, MySQL C API")
        ODBCDrv("ODBC Driver<br/>Vendor-specific")
        JDBCDrv("JDBC Driver<br/>.jar files")
    end

    %% Tints of #6b818c
    style Driver fill:#d2d9dc,color:#000
    
    subgraph System["<strong>System Layer</strong>"]
        ODBCSys("ODBC Manager<br/>System ODBC")
        JAVASys("Java Runtime<br/>JVM/JRE")
    end

    %% Tints of #00120b
    style System fill:#cccfce
    
    subgraph DB["Database"]
        DBServ[("<strong>Database Server</strong>")]
    end

    style DBServ fill:#31e981
    
    %% Direct pathway (bypasses system layer)
    PyCode --> DBPkg
    DBPkg --> NatDrv
    NatDrv --"`Bypass <strong>System Layer</strong>`"--> DBServ
    
    %% ODBC pathway
    PyCode --> ODBCPkg
    ODBCPkg --> ODBCDrv
    ODBCDrv --> ODBCSys
    ODBCSys --> DBServ
    
    %% JDBC pathway
    PyCode --> JDBCPkg
    JDBCPkg --> JDBCDrv
    JDBCDrv --> JAVASys
    JAVASys --> DBServ
    

Python DB Connections

R and databases

In R, the DBI package provides a universal interface (same code works across drivers). This is a simpler model because all paths converge on DBI package. There is also better integration with R’s data structures (i.e., data.frames).

%%{init: {'theme': 'neutral', 'look': 'handDrawn', 'themeVariables': { 'fontFamily': 'monospace'}}}%%

graph TB
    subgraph RApp["<strong>R App</strong>"]
        A(["<strong>app-api.R</strong>"])
    end

    style A fill:#d8e4ff,color:#000
    
    subgraph RPkgs["<strong>R Packages</strong>"]
        B("DB Packages<br/><strong>RPostgres</strong>, <strong>RMySQL</strong>, <strong>RSQLite</strong>")
        C("ODBC Package<br/><strong>odbc</strong>")
        D("JDBC Package<br/><strong>RJDBC</strong>")
    end

    %%% Tints of #35605a
    style RPkgs fill:#c2cfcd,color:#000
    
    subgraph DBI["<strong>R Standard Interface</strong>"]
        E("DBI<br/>Database Interface")
    end

    %% Tints of #6b818c

    
    subgraph Driver["<strong>Driver Layer</strong>"]
        F("Native Driver<br/>libpq, MySQL C API")
        G("ODBC Driver<br/>Vendor-specific")
        H("JDBC Driver<br/>.jar files")
    end

    %% Tints of #6b818c
    style Driver fill:#d2d9dc,color:#000
    
    subgraph System["<strong>System Layer</strong>"]
        I("ODBC Manager<br/>System ODBC")
        J("Java Runtime<br/>JVM/JRE")
    end

    style System fill:#cccfce
    
    subgraph DB["<strong>Database</strong>"]
        K[("<strong>Database Server</strong>")]
    end

    style K fill:#31e981
    
    %% All paths go through DBI
    A --> B
    A --> C  
    A --> D
    B --> E
    C --> E
    D --> E
    
    %% Direct pathway (bypasses system layer)
    E --> F
    F --"bypasses system layer"--> K
    
    %% ODBC pathway  
    E --> G
    G --> I
    I --> K
    
    %% JDBC pathway
    E --> H
    H --> J
    J --> K
    

R Database Connection Pathways

Both R and Python native database drivers bypass the system layer and connect directly to databases. The system layer is only involved when using ODBC or JDBC connections.

%%{init: {'theme': 'neutral', 'look': 'handDrawn', 'themeVariables': { 'fontFamily': 'monospace', "fontSize":"16px"}}}%%

graph TD
    subgraph RPhil["R Philosophy"]
        A(["DBI as Universal Interface"]) --> B("All paths converge to DBI")
        B --> C(["Consistent API regardless of driver"])
    end
    
    subgraph PyPhil["Python Philosophy"]
        D(["DB-API 2.0 as Standard"]) --> E("Multiple competing approaches")
        E --> F(["Choose best tool for job"])
    end
    
    style A fill:#d8e4ff
    style D fill:#d8e4ff

    style C fill:#31e981
    style F fill:#31e981
    

DB Connection Mental Model Differences

APIs

2. Draw a mental map for using an API from R and Python. You should (at a minimum) include nodes for {requests}/{httr2}, request, HTTP verb/request method, headers, query parameters, body, JSON, response, and response code.

In general, APIs are like the digital ‘plumbing’ that makes connections between modern software ecosystems possible.

%%{init: {'theme': 'neutral', 'look': 'handDrawn', 'themeVariables': { 'fontFamily': 'monospace', "fontSize":"16px"}}}%%

graph LR
    DS(["Data Scientist"]) --> Req["API Request"]
    Req --> ApiServ["API Server"]
    ApiServ --> DataProcMod["Data Processing/Model"]
    DataProcMod --> Res(["Results"])
    Res --> ApiServ
    ApiServ --> APIResp(["API Response"])
    APIResp --> DS
    
    style DS fill:#d8e4ff
    style Res fill:#31e981
    style APIResp fill:#31e981
    

APIs

APIs have standardized language that allows different software systems to communicate and share functionality (without needing to understand how the internal implementation works).

What do they communicate?

  1. What you can request (endpoints)
  2. How to make the request (HTTP methods, parameters)
  3. What you’ll get back (response format)
  4. When things go wrong (error codes)

Python APIs

%%{init: {'theme': 'neutral', 'look': 'handDrawn', 'themeVariables': { 'fontFamily': 'monospace'}}}%%

graph LR
    subgraph "Python APIs"
        A2(["<strong>Python Code</strong>"]) --> B2("<strong>requests</strong> package")
        B2 --> C2("<strong>requests.method</strong> call")
        C2 --> D2("<strong>method</strong> parameter<br/>GET/POST/PUT/DELETE")
        C2 --> E2("<strong>headers</strong> parameter<br/>dict of headers")
        C2 --> F2("<strong>params</strong> parameter<br/>URL query parameters")
        C2 --> G2("<strong>json</strong> parameter<br/>Request payload")
        D2 --> H2("HTTP Request")
        E2 --> H2
        F2 --> H2
        G2 --> H2
        H2 --> I2["API Server"]
        I2 --> J2["HTTP Response"]
        J2 --> K2["<strong>response.status_code</strong><br/>200, 404, 500, etc."]
        J2 --> L2["<strong>response.headers</strong><br/>Response metadata"]
        J2 --> M2["<strong>response.json</strong><br/>JSON parsing"]
        K2 --> N2(["<strong>Python Response Object</strong>"])
        L2 --> N2
        M2 --> N2
    end

    style A2 fill:#d8e4ff
    style N2 fill:#31e981
    

R APIs

%%{init: {'theme': 'neutral', 'look': 'handDrawn', 'themeVariables': { 'fontFamily': 'monospace'}}}%%

graph LR
    subgraph RApi["R APIs"]
        A1(["R Code"]) --> B1["<strong>httr2</strong><br>package"]
        B1 --> C1("<strong>request()</strong> object")
        C1 --> D1("<strong>req_method()</strong><br/>GET/POST/PUT/DELETE")
        C1 --> E1("<strong>req_headers()</strong><br/>Content-Type, Auth, etc.")
        C1 --> F1("<strong>req_url_query()</strong><br/>?param=value&key=data")
        C1 --> G1("<strong>req_body_json()</strong><br/>Request payload")
        D1 --> H1("HTTP Request")
        E1 --> H1
        F1 --> H1
        G1 --> H1
        H1 --> I1["API Server"]
        I1 --> J1["HTTP Response"]
        J1 --> K1["<strong>resp_status()</strong><br/>200, 404, 500, etc."]
        J1 --> L1["<strong>resp_headers()</strong><br/>Response metadata"]
        J1 --> M1["<strong>resp_body_json()</strong><br/>JSON parsing"]
        K1 --> N1(["R Response Object"])
        L1 --> N1
        M1 --> N1
    end

    style A1 fill:#d8e4ff
    style N1 fill:#31e981

3. How can environment variables be used to keep secrets secure in your code?

In lab 2

We previously turned our Python scikit-learn model into a VetiverModel(), saved it to a pins board, and then created a VetiverAPI().1

  • In R, we created a linear model (with lm() and fastDummies), converted it to a vetiver_model(), put it in a board_folder(), ‘pinned’ it with vetiver_pin_write(), then converted it into an API with plumber::pr() and vetiver_api().2

The sections below cover how to create and communicate with APIs. I’ve split each lab into separate Python/ and R/ folders (and included RStudio project files).

Python Shiny App

The original Python App files can be found in the _labs/lab3/Python/ folder.

fs::dir_tree(path = "_labs/lab3/Python/")
## _labs/lab3/Python/
## ├── Python.Rproj
## ├── app-api.py
## ├── model-db.qmd
## └── py-data-load.py

Model EDA

The model-db.qmd file contains the EDA from lab 1.

import duckdb
from pandas import get_dummies
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
con = duckdb.connect('my-db.duckdb')
df = con.execute("SELECT * FROM penguins").fetchdf().dropna()
con.close()

df.head(3)
X = get_dummies(df[['bill_length_mm', 'species', 'sex']], drop_first = True)
y = df['body_mass_g']

model = LinearRegression().fit(X, y)
print(f"R^2 {model.score(X,y)}")
print(f"Intercept {model.intercept_}")
print(f"Columns {X.columns}")
print(f"Coefficients {model.coef_}")
Expand to view model-db.qmd
---
title: "Model"
format:
  html:
    code-fold: true
---

```{python}
import duckdb
from pandas import get_dummies
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
```

## Get Data
```{python}
con = duckdb.connect('my-db.duckdb')
df = con.execute("SELECT * FROM penguins").fetchdf().dropna()
con.close()

df.head(3)
```

## Define Model and Fit
```{python}
X = get_dummies(df[['bill_length_mm', 'species', 'sex']], drop_first = True)
y = df['body_mass_g']

model = LinearRegression().fit(X, y)
```

## Get some information
```{python}
print(f"R^2 {model.score(X,y)}")
print(f"Intercept {model.intercept_}")
print(f"Columns {X.columns}")
print(f"Coefficients {model.coef_}")
```

Original app file

app-api.py creates an Python Shiny application that calls the vetiver API.

UI

from shiny import App, render, ui, reactive
import requests

api_url = 'http://127.0.0.1:8080/predict'

app_ui = ui.page_fluid(
    ui.panel_title("Penguin Mass Predictor"), 
    ui.layout_sidebar(
        ui.panel_sidebar(
            [ui.input_slider("bill_length", "Bill Length (mm)", 30, 60, 45, step = 0.1),
            ui.input_select("sex", "Sex", ["Male", "Female"]),
            ui.input_select("species", "Species", ["Adelie", "Chinstrap", "Gentoo"]),
            ui.input_action_button("predict", "Predict")]
        ),
        ui.panel_main(
            ui.h2("Penguin Parameters"),
            ui.output_text_verbatim("vals_out"),
            ui.h2("Predicted Penguin Mass (g)"), 
            ui.output_text("pred_out")
        )
    )   
)

Server

def server(input, output, session):
    @reactive.Calc
    def vals():
        d = {
            "bill_length_mm" : input.bill_length(),
            "sex_Male" : input.sex() == "Male",
            "species_Gentoo" : input.species() == "Gentoo", 
            "species_Chinstrap" : input.species() == "Chinstrap"

        }
        return d
    
    @reactive.Calc
    @reactive.event(input.predict)
    def pred():
        r = requests.post(api_url, json = vals())
        return r.json().get('predict')[0]

    @output
    @render.text
    def vals_out():
        return f"{vals()}"

    @output
    @render.text
    def pred_out():
        return f"{round(pred())}"
app = App(app_ui, server)
Expand to view app-api.py
from shiny import App, render, ui, reactive
import requests

api_url = 'http://127.0.0.1:8080/predict'

app_ui = ui.page_fluid(
    ui.panel_title("Penguin Mass Predictor"), 
    ui.layout_sidebar(
        ui.panel_sidebar(
            [ui.input_slider("bill_length", "Bill Length (mm)", 30, 60, 45, step = 0.1),
            ui.input_select("sex", "Sex", ["Male", "Female"]),
            ui.input_select("species", "Species", ["Adelie", "Chinstrap", "Gentoo"]),
            ui.input_action_button("predict", "Predict")]
        ),
        ui.panel_main(
            ui.h2("Penguin Parameters"),
            ui.output_text_verbatim("vals_out"),
            ui.h2("Predicted Penguin Mass (g)"), 
            ui.output_text("pred_out")
        )
    )   
)

def server(input, output, session):
    @reactive.Calc
    def vals():
        d = {
            "bill_length_mm" : input.bill_length(),
            "sex_Male" : input.sex() == "Male",
            "species_Gentoo" : input.species() == "Gentoo", 
            "species_Chinstrap" : input.species() == "Chinstrap"

        }
        return d
    
    @reactive.Calc
    @reactive.event(input.predict)
    def pred():
        r = requests.post(api_url, json = vals())
        return r.json().get('predict')[0]

    @output
    @render.text
    def vals_out():
        return f"{vals()}"

    @output
    @render.text
    def pred_out():
        return f"{round(pred())}"

app = App(app_ui, server)

Data load

import duckdb
from palmerpenguins import penguins

con = duckdb.connect('my-db.duckdb')
df = penguins.load_penguins()
con.execute('CREATE TABLE penguins AS SELECT * FROM df')
con.close()
Expand to view py-data-load.py
import duckdb
from palmerpenguins import penguins

con = duckdb.connect('my-db.duckdb')
df = penguins.load_penguins()
con.execute('CREATE TABLE penguins AS SELECT * FROM df')
con.close()

R Shiny App

The R App API files can be found in the _labs/lab3/ folder.

fs::dir_tree(path = "_labs/lab3/")
## _labs/lab3/
## ├── app-api.R
## ├── eda-db.qmd
## └── r-data-load.R

Model EDA

The eda-db.qmd file has the EDA R code from lab 1.

It begins by loading the libraries:

library(DBI) # add this
library(duckdb) # add this
library(palmerpenguins) # add this
library(dplyr)
library(ggplot2)
library(dbplyr)

The code below creates a connection to the database, but needs to be changed slightly so the penguins data is registered with duckdb (before the queries can be run).

# load penguins data first
penguins_data <- palmerpenguins::penguins
# then connect
con <- DBI::dbConnect(
  duckdb::duckdb(), 
  dbdir = "my-db.duckdb"
)
# register table
duckdb_register(con, "penguins_data", penguins_data)
# create table
dbExecute(con, "CREATE OR REPLACE TABLE penguins AS SELECT * FROM penguins_data")
# create df 
df <- dbGetQuery(con, "SELECT * FROM penguins") |> na.omit()

The table below displays the means of the measurements (body mass, flipper length, and bill length/depth) by species and sex:

df %>%
  group_by(species, sex) %>%
  summarise(
    across(
        ends_with("mm") | ends_with("g"),
      \(x) mean(x, na.rm = TRUE)
      )
    ) %>%
  dplyr::collect() %>%
  knitr::kable()

The graph displays the best-fit linear relationship between bill_length_mm and body_mass_g by species:

df %>%
  ggplot(
    aes(
    x = bill_length_mm, 
    y = body_mass_g, 
    color = species)
    ) +
  geom_point() + 
  geom_smooth(method = "lm")

And we finish by closing the connection to the database.

DBI::dbDisconnect(con)

To view the original code in eda-db.qmd, expand the section below:

Expand to view eda-db.qmd
---
title: "Penguins EDA"
format:
  html:
    code-fold: true
---

## *Penguin Size and Mass by Sex and Species*

```{r}
library(dplyr)
library(ggplot2)
library(dbplyr)

con <- DBI::dbConnect(
  duckdb::duckdb(), 
  dbdir = "my-db.duckdb"
  )
df <- dplyr::tbl(con, "penguins")
```

```{r}
df %>%
  group_by(species, sex) %>%
  summarise(
    across(
        ends_with("mm") | ends_with("g"),
      \(x) mean(x, na.rm = TRUE)
      )
    ) %>%
  dplyr::collect() %>%
  knitr::kable()
```

## *Penguin Size vs Mass by Species*

```{r}
df %>%
  ggplot(aes(x = bill_length_mm, y = body_mass_g, color = species)) +
  geom_point() + 
  geom_smooth(method = "lm")
```

```{r}
DBI::dbDisconnect(con)
```

Original app file

app-api.R creates an R Shiny application that calls the vetiver API. The top of the file creates an api_url:

api_url <- "http://127.0.0.1:8080/predict"

UI

The UI definition is below:

ui <- fluidPage(
  titlePanel("Penguin Mass Predictor"),

  # Model input values
  sidebarLayout(
    sidebarPanel(
      sliderInput(
        "bill_length",
        "Bill Length (mm)",
        min = 30,
        max = 60,
        value = 45,
        step = 0.1
      ),
      selectInput(
        "sex",
        "Sex",
        c("Male", "Female")
      ),
      selectInput(
        "species",
        "Species",
        c("Adelie", "Chinstrap", "Gentoo")
      ),
      # Get model predictions
      actionButton(
        "predict",
        "Predict"
      )
    ),

    mainPanel(
      h2("Penguin Parameters"),
      verbatimTextOutput("vals"),
      h2("Predicted Penguin Mass (g)"),
      textOutput("pred")
    )
  )
)

Server

The server definition is below:

server <- function(input, output) {
  # Input params
  vals <- reactive(
    list(
      bill_length_mm = input$bill_length,
      species_Chinstrap = input$species == "Chinstrap",
      species_Gentoo = input$species == "Gentoo",
      sex_male = input$sex == "Male"
    )
  )

  # Fetch prediction from API
  pred <- eventReactive(
    input$predict,
    httr2::request(api_url) |>
      httr2::req_body_json(vals()) |>
      httr2::req_perform() |>
      httr2::resp_body_json(),
    ignoreInit = TRUE
  )

  # Render to UI
  output$pred <- renderText(pred()$predict[[1]])
  output$vals <- renderPrint(vals())
}
shinyApp(ui = ui, server = server)

Data load

con <- DBI::dbConnect(duckdb::duckdb(), dbdir = "my-db.duckdb")

DBI::dbWriteTable(con, "penguins", palmerpenguins::penguins)

DBI::dbDisconnect(con)

To view the original code in r-data-load.R, expand the section below:

Expand to view r-data-load.R
con <- DBI::dbConnect(duckdb::duckdb(), dbdir = "my-db.duckdb")
DBI::dbWriteTable(con, "penguins", palmerpenguins::penguins)
DBI::dbDisconnect(con)

  1. This can be found in /_labs/lab2/model-vetiver/model-vetiver.qmd↩︎

  2. This can be found in _labs/lab2/R/model-vetiver-r.qmd.↩︎