0.1 Texas death row executed offenders website

Texas Department of Criminal Justice keeps records of every inmate they execute. We’re going to scrape the data found here.

library(rvest)     
library(jsonlite)  
library(tidyverse) 
library(tidyquant) 
library(xopen)     
library(knitr)     
library(xml2)

0.2 Scraping the data from HTML websites

Load the xml2 package and define the url with the data (here it’s webpage_url).

webpage_url <- "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
webpage <- xml2::read_html(webpage_url)

0.2.1 Exract HTML tables

Use the rvest::html_table() to find the table in the webpage object. This is at position [[1]].

The dplyr::glimpse(78) function is helpful here.

ExOffndrsRaw <- rvest::html_table(webpage)[[1]] 
# check the data.frame
ExOffndrsRaw %>% dplyr::glimpse(78)
## Rows: 573
## Columns: 10
## $ Execution    <int> 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, 563, …
## $ Link         <chr> "Inmate Information", "Inmate Information", "Inmate Inf…
## $ Link         <chr> "Last Statement", "Last Statement", "Last Statement", "…
## $ `Last Name`  <chr> "Rhoades", "Hummel", "Jones", "Wardlow", "Ochoa", "Gard…
## $ `First Name` <chr> "Rick", "John", "Quintin", "Billy", "Abel", "John", "Tr…
## $ TDCJNumber   <int> 999049, 999567, 999379, 999137, 999450, 999516, 999505,…
## $ Age          <int> 57, 45, 41, 45, 47, 64, 46, 38, 45, 37, 64, 48, 44, 70,…
## $ Date         <chr> "9/28/2021", "6/30/2021", "5/19/2021", "7/8/2020", "2/6…
## $ Race         <chr> "White", "White", "Black", "White", "Hispanic", "White"…
## $ County       <chr> "Harris", "Tarrant", "Tarrant", "Titus", "Dallas", "Col…

0.2.2 Fix the column names

We can see the Link column is repeated, which is going to be a problem when we put these data into their own tibble because R doesn’t like to repeat the column names inside a data.frame. We’ll address the column names with base::colnames()

base::colnames(x = rvest::html_table(webpage)[[1]])
##  [1] "Execution"  "Link"       "Link"       "Last Name" 
##  [5] "First Name" "TDCJNumber" "Age"        "Date"      
##  [9] "Race"       "County"

We will use the tibble::as_tibble() function, but add the .name_repair = "unique" argument. The .name_repair argument has other options ("check_unique", "unique", "universal" and "minimal"), and you can read the help files using ?as_tibble.

In this case, "unique" will work just fine.

ExecutedOffenders <- rvest::html_table(webpage)[[1]] %>% 
  # repair the repeated columns
  tibble::as_tibble(.name_repair = "unique") %>% 
  # get unique names
  janitor::clean_names(case = "snake") %>% 
  # lower, snake case
  dplyr::rename(offender_info = link_2, 
                # rename these 
                last_statement = link_3)
ExecutedOffenders %>% glimpse(78)
## Rows: 573
## Columns: 10
## $ execution      <int> 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, 563…
## $ offender_info  <chr> "Inmate Information", "Inmate Information", "Inmate I…
## $ last_statement <chr> "Last Statement", "Last Statement", "Last Statement",…
## $ last_name      <chr> "Rhoades", "Hummel", "Jones", "Wardlow", "Ochoa", "Ga…
## $ first_name     <chr> "Rick", "John", "Quintin", "Billy", "Abel", "John", "…
## $ tdcj_number    <int> 999049, 999567, 999379, 999137, 999450, 999516, 99950…
## $ age            <int> 57, 45, 41, 45, 47, 64, 46, 38, 45, 37, 64, 48, 44, 7…
## $ date           <chr> "9/28/2021", "6/30/2021", "5/19/2021", "7/8/2020", "2…
## $ race           <chr> "White", "White", "Black", "White", "Hispanic", "Whit…
## $ county         <chr> "Harris", "Tarrant", "Tarrant", "Titus", "Dallas", "C…

0.2.3 Export raw data

Create a folder for these data:

# create raw folder
fs::dir_create("../data/wk10-dont-mess-with-texas/")
fs::dir_tree("../data", recurse = FALSE, regexp = "dont")
## ../data
## └── wk10-dont-mess-with-texas

Create an exported data file (with the path):

# create export path
executed_raw_file <- paste0("../data/wk10-dont-mess-with-texas/", noquote(lubridate::today()), "-ExecutedOffenders.csv")
executed_raw_file
## [1] "../data/wk10-dont-mess-with-texas/2021-11-30-ExecutedOffenders.csv"
# export
write_csv(x = ExecutedOffenders, file = executed_raw_file)
# verify
fs::dir_tree("../data/wk10-dont-mess-with-texas/", regexp = "ExecutedOffenders.csv")
## ../data/wk10-dont-mess-with-texas/
## ├── 2021-11-21-ExecutedOffenders.csv
## └── 2021-11-30-ExecutedOffenders.csv

0.4 Create the info pattern

Now I want the offender information links (so I omit the links with last in the pattern).

info_pattern <- !stringr::str_detect(
                            string = Links, 
                            pattern = "last")
Links[info_pattern] %>% 
  utils::head() %>% 
  base::writeLines()
## dr_info/rhoadesrick.html
## dr_info/hummeljohn.html
## dr_info/jonesquintin.html
## dr_info/wardlowbilly.html
## dr_info/ochoaabel.html
## dr_info/gardnerjohn.html

0.4.1 Verify length and rows

Check the base::length() to see if it’s identical to the number of rows in ExecutedOffenders.

base::identical(x = base::length(Links[info_pattern]), 
                y = base::nrow(ExecutedOffenders))
## [1] TRUE

Great!

Check the length() of info_links

info_links <- Links[info_pattern]
base::identical(x = base::length(info_links),
                y = base::nrow(ExecutedOffenders))
## [1] TRUE

These are also identical. Repeat the URL process from above on the info_url

0.4.2 Create info_url column

Now we combine this with the https://www.tdcj.texas.gov/death_row/ URL.

ExecutedOffenders %>% 
  dplyr::mutate(
    info_url = 
        paste0("https://www.tdcj.texas.gov/death_row/", 
                                  info_links)) %>% 
  dplyr::pull(last_url) %>% 
  utils::head(10)
##  [1] "https://www.tdcj.texas.gov/death_row/dr_info/rhoadesricklast.html"     
##  [2] "https://www.tdcj.texas.gov/death_row/dr_info/hummeljohnlast.html"      
##  [3] "https://www.tdcj.texas.gov/death_row/dr_info/jonesquintinlast.html"    
##  [4] "https://www.tdcj.texas.gov/death_row/dr_info/wardlowbillylast.html"    
##  [5] "https://www.tdcj.texas.gov/death_row/dr_info/ochoaabellast.html"       
##  [6] "https://www.tdcj.texas.gov/death_row/dr_info/gardnerjohnlast.html"     
##  [7] "https://www.tdcj.texas.gov/death_row/dr_info/runnelstravislast.html"   
##  [8] "https://www.tdcj.texas.gov/death_row/dr_info/halljustenlast.html"      
##  [9] "https://www.tdcj.texas.gov/death_row/dr_info/sparksrobertlast.html"    
## [10] "https://www.tdcj.texas.gov/death_row/dr_info/solizmarkanthonylast.html"
# assign
ExecutedOffenders <- ExecutedOffenders %>% 
  dplyr::mutate(
    info_url = 
        paste0("http://www.tdcj.state.tx.us/death_row/", 
                                  info_links))

These are complete URLs–assign this to ExecutedOffenders data frame. Put the InfoLinks into a tidy data frame.

info_links <- Links[info_pattern]

InfoLinks <- info_links %>% 
  # turn into a tibble
  tibble::as_tibble(.name_repair = "unique") %>% 
  # tidy
  tidyr::gather(key = "key",
                value = "value") %>% 
  # rename the value
  dplyr::select(dr_info_url = value) %>% 
  # create the new url with death row root
  dplyr::mutate(
    dr_info_url = paste0("http://www.tdcj.state.tx.us/death_row/", info_links))

InfoLinks %>% dplyr::glimpse(78)
## Rows: 573
## Columns: 1
## $ dr_info_url <chr> "http://www.tdcj.state.tx.us/death_row/dr_info/rhoadesri…

0.4.3 Check in browser

Test a few of these out in the browser:

xopen("http://www.tdcj.state.tx.us/death_row/dr_info/brookscharlie.html")

Now we assign these links to the ExecutedOffenders data frame. But first make sure they match up.

ExecutedOffenders %>% 
  dplyr::select(last_name, 
                first_name) %>%
  utils::head(10)
ExecutedOffenders %>% 
  dplyr::select(last_name, 
                first_name) %>%
  utils::tail(10)

0.4.4 Bind columns

Combine the ExecutedOffenders, LastUrl and InfoLinks.

# Use `dplyr::bind_cols()` to attach these columns to `ExecutedOffenders` and 
# rename to`ExOffndrsComplete`
ExecutedOffenders <- ExecutedOffenders %>% 
  # add the info_url
  dplyr::bind_cols(LastUrl) %>%
  # add the
  dplyr::bind_cols(InfoLinks) %>%
  # move the names to the front
  dplyr::select(dplyr::ends_with("name"),
                # all else
                dplyr::everything())
ExecutedOffenders %>% dplyr::glimpse(78)
## Rows: 573
## Columns: 14
## $ last_name      <chr> "Rhoades", "Hummel", "Jones", "Wardlow", "Ochoa", "Ga…
## $ first_name     <chr> "Rick", "John", "Quintin", "Billy", "Abel", "John", "…
## $ execution      <int> 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, 563…
## $ offender_info  <chr> "Inmate Information", "Inmate Information", "Inmate I…
## $ last_statement <chr> "Last Statement", "Last Statement", "Last Statement",…
## $ tdcj_number    <int> 999049, 999567, 999379, 999137, 999450, 999516, 99950…
## $ age            <int> 57, 45, 41, 45, 47, 64, 46, 38, 45, 37, 64, 48, 44, 7…
## $ date           <chr> "9/28/2021", "6/30/2021", "5/19/2021", "7/8/2020", "2…
## $ race           <chr> "White", "White", "Black", "White", "Hispanic", "Whit…
## $ county         <chr> "Harris", "Tarrant", "Tarrant", "Titus", "Dallas", "C…
## $ last_url       <chr> "https://www.tdcj.texas.gov/death_row/dr_info/rhoades…
## $ info_url       <chr> "http://www.tdcj.state.tx.us/death_row/dr_info/rhoade…
## $ name_last_url  <chr> "https://www.tdcj.texas.gov/death_row/dr_info/rhoades…
## $ dr_info_url    <chr> "http://www.tdcj.state.tx.us/death_row/dr_info/rhoade…

0.5 Create indicator for .html vs .jpgs

Create a binary variable to identify if this is a .jpg or .html path and name the new data frame ExOffndrsComplete.

0.5.1 Use case_when() to create jpg_html

ExOffndrsComplete <- ExecutedOffenders %>% 
  dplyr::mutate(jpg_html = 
        dplyr::case_when(
          str_detect(string = info_url, pattern = ".jpg") ~ "jpg", 
          str_detect(string = info_url, pattern = ".html") ~ "html")) 
ExOffndrsComplete %>% dplyr::count(jpg_html)

0.5.2 Check with sample_n()

Use dplyr::sample_n to check a few examples of this new variable.

ExOffndrsComplete %>% 
  dplyr::sample_n(size = 10) %>% 
  dplyr::select(info_url, 
                jpg_html) %>% 
  dplyr::count(jpg_html)

0.6 Export the data with a date stamp

We now have a data frame we can export into a dated folder.

# create data folder
export_path <- "../data/wk10-dont-mess-with-texas/processed/"
fs::dir_create(export_path)
# create today
tahday <- as.character(lubridate::today())
tahday_path <- paste0(export_path, tahday, "/")
tahday_path
## [1] "../data/wk10-dont-mess-with-texas/processed/2021-11-30/"
# create new data folder
fs::dir_create(tahday_path)
# create data path
tahday_data_path <- paste0(tahday_path, "ExOffndrsComplete.csv")
# export these data
vroom::vroom_write(x = ExOffndrsComplete, file = tahday_data_path, delim = ",")
fs::dir_tree(tahday_path, regexpr = "-ExOffndrsComplete.csv")
## ../data/wk10-dont-mess-with-texas/processed/2021-11-30/
## └── ExOffndrsComplete.csv