Case Study (Part 1)

0.1 Texas death row executed offenders website

Texas Department of Criminal Justice keeps records of every inmate they execute. We’re going to scrape the data found here.

library(rvest)     
library(jsonlite)  
library(tidyverse) 
library(tidyquant) 
library(xopen)     
library(knitr)     
library(xml2)

0.2 Scraping the data from HTML websites

Load the xml2 package and define the url with the data (here it’s webpage_url).

webpage_url <- "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
webpage <- xml2::read_html(webpage_url)

0.2.1 Exract HTML tables

Use the rvest::html_table() to find the table in the webpage object. This is at position [[1]].

The dplyr::glimpse(78) function is helpful here.

ExOffndrsRaw <- rvest::html_table(webpage)[[1]] 
# check the data.frame
ExOffndrsRaw %>% dplyr::glimpse(78)

## Rows: 573
## Columns: 10
## $ Execution    <int> 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, 563, …
## $ Link         <chr> "Inmate Information", "Inmate Information", "Inmate Inf…
## $ Link         <chr> "Last Statement", "Last Statement", "Last Statement", "…
## $ `Last Name`  <chr> "Rhoades", "Hummel", "Jones", "Wardlow", "Ochoa", "Gard…
## $ `First Name` <chr> "Rick", "John", "Quintin", "Billy", "Abel", "John", "Tr…
## $ TDCJNumber   <int> 999049, 999567, 999379, 999137, 999450, 999516, 999505,…
## $ Age          <int> 57, 45, 41, 45, 47, 64, 46, 38, 45, 37, 64, 48, 44, 70,…
## $ Date         <chr> "9/28/2021", "6/30/2021", "5/19/2021", "7/8/2020", "2/6…
## $ Race         <chr> "White", "White", "Black", "White", "Hispanic", "White"…
## $ County       <chr> "Harris", "Tarrant", "Tarrant", "Titus", "Dallas", "Col…

0.2.2 Fix the column names

We can see the Link column is repeated, which is going to be a problem when we put these data into their own tibble because R doesn’t like to repeat the column names inside a data.frame. We’ll address the column names with base::colnames()

base::colnames(x = rvest::html_table(webpage)[[1]])

##  [1] "Execution"  "Link"       "Link"       "Last Name" 
##  [5] "First Name" "TDCJNumber" "Age"        "Date"      
##  [9] "Race"       "County"

We will use the tibble::as_tibble() function, but add the .name_repair = "unique" argument. The .name_repair argument has other options ("check_unique", "unique", "universal" and "minimal"), and you can read the help files using ?as_tibble.

In this case, "unique" will work just fine.

ExecutedOffenders <- rvest::html_table(webpage)[[1]] %>% 
  # repair the repeated columns
  tibble::as_tibble(.name_repair = "unique") %>% 
  # get unique names
  janitor::clean_names(case = "snake") %>% 
  # lower, snake case
  dplyr::rename(offender_info = link_2, 
                # rename these 
                last_statement = link_3)
ExecutedOffenders %>% glimpse(78)

## Rows: 573
## Columns: 10
## $ execution      <int> 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, 563…
## $ offender_info  <chr> "Inmate Information", "Inmate Information", "Inmate I…
## $ last_statement <chr> "Last Statement", "Last Statement", "Last Statement",…
## $ last_name      <chr> "Rhoades", "Hummel", "Jones", "Wardlow", "Ochoa", "Ga…
## $ first_name     <chr> "Rick", "John", "Quintin", "Billy", "Abel", "John", "…
## $ tdcj_number    <int> 999049, 999567, 999379, 999137, 999450, 999516, 99950…
## $ age            <int> 57, 45, 41, 45, 47, 64, 46, 38, 45, 37, 64, 48, 44, 7…
## $ date           <chr> "9/28/2021", "6/30/2021", "5/19/2021", "7/8/2020", "2…
## $ race           <chr> "White", "White", "Black", "White", "Hispanic", "Whit…
## $ county         <chr> "Harris", "Tarrant", "Tarrant", "Titus", "Dallas", "C…

0.2.3 Export raw data

Create a folder for these data:

# create raw folder
fs::dir_create("../data/wk10-dont-mess-with-texas/")
fs::dir_tree("../data", recurse = FALSE, regexp = "dont")

## ../data
## └── wk10-dont-mess-with-texas

Create an exported data file (with the path):

# create export path
executed_raw_file <- paste0("../data/wk10-dont-mess-with-texas/", noquote(lubridate::today()), "-ExecutedOffenders.csv")
executed_raw_file

## [1] "../data/wk10-dont-mess-with-texas/2021-11-30-ExecutedOffenders.csv"

# export
write_csv(x = ExecutedOffenders, file = executed_raw_file)
# verify
fs::dir_tree("../data/wk10-dont-mess-with-texas/", regexp = "ExecutedOffenders.csv")

## ../data/wk10-dont-mess-with-texas/
## ├── 2021-11-21-ExecutedOffenders.csv
## └── 2021-11-30-ExecutedOffenders.csv

0.3 Identify the links with selector gadget

Download the selector gadget app for your browser.

0.3.1 Using the selector gadget

You can identify the various elements in a webpage using the selector gadget. Read this tutorial to see how it works.

In order to get the nodes from the table, we need to send webpage through a few passes of rvest functions (html_nodes and html_attr) with various css tags to get the correct URL paths. This took a few tries and some trial and error, but eventually I was able to figure out the the correct combinations to get the Links to the pages.

Links <- webpage %>% 
  # this get the links in the overflow table 
  # row
  rvest::html_nodes(".overflow tr") %>% 
  # the links
  rvest::html_nodes("a") %>% 
  # the header ref
  rvest::html_attr("href")
# check Links
Links %>% utils::head(20)

##  [1] "dr_info/rhoadesrick.html"         
##  [2] "dr_info/rhoadesricklast.html"     
##  [3] "dr_info/hummeljohn.html"          
##  [4] "dr_info/hummeljohnlast.html"      
##  [5] "dr_info/jonesquintin.html"        
##  [6] "dr_info/jonesquintinlast.html"    
##  [7] "dr_info/wardlowbilly.html"        
##  [8] "dr_info/wardlowbillylast.html"    
##  [9] "dr_info/ochoaabel.html"           
## [10] "dr_info/ochoaabellast.html"       
## [11] "dr_info/gardnerjohn.html"         
## [12] "dr_info/gardnerjohnlast.html"     
## [13] "dr_info/runnelstravis.html"       
## [14] "dr_info/runnelstravislast.html"   
## [15] "dr_info/halljusten.html"          
## [16] "dr_info/halljustenlast.html"      
## [17] "dr_info/sparksrobert.html"        
## [18] "dr_info/sparksrobertlast.html"    
## [19] "dr_info/solizmarkanthony.html"    
## [20] "dr_info/solizmarkanthonylast.html"

Now Links contain:

A dr_info/ path (which makes the entire path "http://www.tdcj.state.tx.us/death_row/dr_info/").
Every offender has two links–one with their full name, the other with a last string attached to the back of their full name.

Something tells me if I check the base::length() of Links with the base::nrow()s in ExOffndrs…there will be twice as many links as rows in executed offenders.

length(Links)

## [1] 1146

nrow(ExecutedOffenders)

## [1] 573

Good–this is what I want. That means each row in ExecutedOffenders has two links associated with their name.

0.3.2 Create `last_links` statements

The stringr package can help me wrangle this long vector into the last_pattern logical vector, which I then use to subset the Links.

last_pattern <- stringr::str_detect(
                            string = Links, 
                            pattern = "last")
utils::head(Links[last_pattern])

## [1] "dr_info/rhoadesricklast.html" 
## [2] "dr_info/hummeljohnlast.html"  
## [3] "dr_info/jonesquintinlast.html"
## [4] "dr_info/wardlowbillylast.html"
## [5] "dr_info/ochoaabellast.html"   
## [6] "dr_info/gardnerjohnlast.html"

Check to see that Links[last_pattern] is same length as the number of rows in ExecutedOffenders…

base::identical(x = base::length(
                        Links[last_pattern]), 
                y = base::nrow(
                                  ExecutedOffenders))

## [1] TRUE

Great–subset the Links for the last_pattern, then give this vector a name (last_links).

last_links <- Links[last_pattern]
last_links %>% utils::head(10)

##  [1] "dr_info/rhoadesricklast.html"     
##  [2] "dr_info/hummeljohnlast.html"      
##  [3] "dr_info/jonesquintinlast.html"    
##  [4] "dr_info/wardlowbillylast.html"    
##  [5] "dr_info/ochoaabellast.html"       
##  [6] "dr_info/gardnerjohnlast.html"     
##  [7] "dr_info/runnelstravislast.html"   
##  [8] "dr_info/halljustenlast.html"      
##  [9] "dr_info/sparksrobertlast.html"    
## [10] "dr_info/solizmarkanthonylast.html"

If I check the length of items in last_links, I can see there are an identical number of rows in the data frame.

base::identical(x = base::length(last_links),
                y = base::nrow(ExecutedOffenders))

## [1] TRUE

0.3.3 Assign the `last_url` column to `ExecutedOffenders`

This means I can easily assign these as a new column in ExecutedOffenders.

ExecutedOffenders %>% glimpse()

## Rows: 573
## Columns: 10
## $ execution      <int> 573, 572, 571, 570, 569, 568, 567, …
## $ offender_info  <chr> "Inmate Information", "Inmate Infor…
## $ last_statement <chr> "Last Statement", "Last Statement",…
## $ last_name      <chr> "Rhoades", "Hummel", "Jones", "Ward…
## $ first_name     <chr> "Rick", "John", "Quintin", "Billy",…
## $ tdcj_number    <int> 999049, 999567, 999379, 999137, 999…
## $ age            <int> 57, 45, 41, 45, 47, 64, 46, 38, 45,…
## $ date           <chr> "9/28/2021", "6/30/2021", "5/19/202…
## $ race           <chr> "White", "White", "Black", "White",…
## $ county         <chr> "Harris", "Tarrant", "Tarrant", "Ti…

Not done yet–I need to add the beginning of the web address:

https://www.tdcj.texas.gov/death_row/

# test 
ExecutedOffenders %>% 
  dplyr::mutate(
    last_url = 
        paste0("https://www.tdcj.texas.gov/death_row/", 
                                  last_links)) %>% 
  dplyr::pull(last_url) %>% 
  utils::head(10)

##  [1] "https://www.tdcj.texas.gov/death_row/dr_info/rhoadesricklast.html"     
##  [2] "https://www.tdcj.texas.gov/death_row/dr_info/hummeljohnlast.html"      
##  [3] "https://www.tdcj.texas.gov/death_row/dr_info/jonesquintinlast.html"    
##  [4] "https://www.tdcj.texas.gov/death_row/dr_info/wardlowbillylast.html"    
##  [5] "https://www.tdcj.texas.gov/death_row/dr_info/ochoaabellast.html"       
##  [6] "https://www.tdcj.texas.gov/death_row/dr_info/gardnerjohnlast.html"     
##  [7] "https://www.tdcj.texas.gov/death_row/dr_info/runnelstravislast.html"   
##  [8] "https://www.tdcj.texas.gov/death_row/dr_info/halljustenlast.html"      
##  [9] "https://www.tdcj.texas.gov/death_row/dr_info/sparksrobertlast.html"    
## [10] "https://www.tdcj.texas.gov/death_row/dr_info/solizmarkanthonylast.html"

# assign
ExecutedOffenders <- ExecutedOffenders %>% 
  dplyr::mutate(
    last_url = 
        paste0("https://www.tdcj.texas.gov/death_row/", 
                                  last_links))

Now we will tidy these up into nice, clean LastUrl tibble.

## https://www.tdcj.texas.gov/death_row/dr_info/rhoadesricklast.html
## https://www.tdcj.texas.gov/death_row/dr_info/hummeljohnlast.html
## https://www.tdcj.texas.gov/death_row/dr_info/jonesquintinlast.html
## https://www.tdcj.texas.gov/death_row/dr_info/wardlowbillylast.html
## https://www.tdcj.texas.gov/death_row/dr_info/ochoaabellast.html
## https://www.tdcj.texas.gov/death_row/dr_info/gardnerjohnlast.html

Test one of the URLs out in the browser.

xopen("https://www.tdcj.texas.gov/death_row/dr_info/swearingenlarrylast.html")

0.4 Create the info pattern

Now I want the offender information links (so I omit the links with last in the pattern).

info_pattern <- !stringr::str_detect(
                            string = Links, 
                            pattern = "last")
Links[info_pattern] %>% 
  utils::head() %>% 
  base::writeLines()

## dr_info/rhoadesrick.html
## dr_info/hummeljohn.html
## dr_info/jonesquintin.html
## dr_info/wardlowbilly.html
## dr_info/ochoaabel.html
## dr_info/gardnerjohn.html

0.4.1 Verify length and rows

Check the base::length() to see if it’s identical to the number of rows in ExecutedOffenders.

base::identical(x = base::length(Links[info_pattern]), 
                y = base::nrow(ExecutedOffenders))

## [1] TRUE

Great!

Check the length() of info_links

info_links <- Links[info_pattern]
base::identical(x = base::length(info_links),
                y = base::nrow(ExecutedOffenders))

## [1] TRUE

These are also identical. Repeat the URL process from above on the info_url

0.4.2 Create `info_url` column

Now we combine this with the https://www.tdcj.texas.gov/death_row/ URL.

ExecutedOffenders %>% 
  dplyr::mutate(
    info_url = 
        paste0("https://www.tdcj.texas.gov/death_row/", 
                                  info_links)) %>% 
  dplyr::pull(last_url) %>% 
  utils::head(10)

##  [1] "https://www.tdcj.texas.gov/death_row/dr_info/rhoadesricklast.html"     
##  [2] "https://www.tdcj.texas.gov/death_row/dr_info/hummeljohnlast.html"      
##  [3] "https://www.tdcj.texas.gov/death_row/dr_info/jonesquintinlast.html"    
##  [4] "https://www.tdcj.texas.gov/death_row/dr_info/wardlowbillylast.html"    
##  [5] "https://www.tdcj.texas.gov/death_row/dr_info/ochoaabellast.html"       
##  [6] "https://www.tdcj.texas.gov/death_row/dr_info/gardnerjohnlast.html"     
##  [7] "https://www.tdcj.texas.gov/death_row/dr_info/runnelstravislast.html"   
##  [8] "https://www.tdcj.texas.gov/death_row/dr_info/halljustenlast.html"      
##  [9] "https://www.tdcj.texas.gov/death_row/dr_info/sparksrobertlast.html"    
## [10] "https://www.tdcj.texas.gov/death_row/dr_info/solizmarkanthonylast.html"

# assign
ExecutedOffenders <- ExecutedOffenders %>% 
  dplyr::mutate(
    info_url = 
        paste0("http://www.tdcj.state.tx.us/death_row/", 
                                  info_links))

These are complete URLs–assign this to ExecutedOffenders data frame. Put the InfoLinks into a tidy data frame.

info_links <- Links[info_pattern]

InfoLinks <- info_links %>% 
  # turn into a tibble
  tibble::as_tibble(.name_repair = "unique") %>% 
  # tidy
  tidyr::gather(key = "key",
                value = "value") %>% 
  # rename the value
  dplyr::select(dr_info_url = value) %>% 
  # create the new url with death row root
  dplyr::mutate(
    dr_info_url = paste0("http://www.tdcj.state.tx.us/death_row/", info_links))

InfoLinks %>% dplyr::glimpse(78)

## Rows: 573
## Columns: 1
## $ dr_info_url <chr> "http://www.tdcj.state.tx.us/death_row/dr_info/rhoadesri…

0.4.3 Check in browser

Test a few of these out in the browser:

xopen("http://www.tdcj.state.tx.us/death_row/dr_info/brookscharlie.html")

Now we assign these links to the ExecutedOffenders data frame. But first make sure they match up.

ExecutedOffenders %>% 
  dplyr::select(last_name, 
                first_name) %>%
  utils::head(10)

ExecutedOffenders %>% 
  dplyr::select(last_name, 
                first_name) %>%
  utils::tail(10)

0.4.4 Bind columns

Combine the ExecutedOffenders, LastUrl and InfoLinks.

# Use `dplyr::bind_cols()` to attach these columns to `ExecutedOffenders` and 
# rename to`ExOffndrsComplete`
ExecutedOffenders <- ExecutedOffenders %>% 
  # add the info_url
  dplyr::bind_cols(LastUrl) %>%
  # add the
  dplyr::bind_cols(InfoLinks) %>%
  # move the names to the front
  dplyr::select(dplyr::ends_with("name"),
                # all else
                dplyr::everything())
ExecutedOffenders %>% dplyr::glimpse(78)

## Rows: 573
## Columns: 14
## $ last_name      <chr> "Rhoades", "Hummel", "Jones", "Wardlow", "Ochoa", "Ga…
## $ first_name     <chr> "Rick", "John", "Quintin", "Billy", "Abel", "John", "…
## $ execution      <int> 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, 563…
## $ offender_info  <chr> "Inmate Information", "Inmate Information", "Inmate I…
## $ last_statement <chr> "Last Statement", "Last Statement", "Last Statement",…
## $ tdcj_number    <int> 999049, 999567, 999379, 999137, 999450, 999516, 99950…
## $ age            <int> 57, 45, 41, 45, 47, 64, 46, 38, 45, 37, 64, 48, 44, 7…
## $ date           <chr> "9/28/2021", "6/30/2021", "5/19/2021", "7/8/2020", "2…
## $ race           <chr> "White", "White", "Black", "White", "Hispanic", "Whit…
## $ county         <chr> "Harris", "Tarrant", "Tarrant", "Titus", "Dallas", "C…
## $ last_url       <chr> "https://www.tdcj.texas.gov/death_row/dr_info/rhoades…
## $ info_url       <chr> "http://www.tdcj.state.tx.us/death_row/dr_info/rhoade…
## $ name_last_url  <chr> "https://www.tdcj.texas.gov/death_row/dr_info/rhoades…
## $ dr_info_url    <chr> "http://www.tdcj.state.tx.us/death_row/dr_info/rhoade…

0.5 Create indicator for .html vs .jpgs

Create a binary variable to identify if this is a .jpg or .html path and name the new data frame ExOffndrsComplete.

0.5.1 Use `case_when()` to create `jpg_html`

ExOffndrsComplete <- ExecutedOffenders %>% 
  dplyr::mutate(jpg_html = 
        dplyr::case_when(
          str_detect(string = info_url, pattern = ".jpg") ~ "jpg", 
          str_detect(string = info_url, pattern = ".html") ~ "html")) 
ExOffndrsComplete %>% dplyr::count(jpg_html)

0.5.2 Check with `sample_n()`

Use dplyr::sample_n to check a few examples of this new variable.

ExOffndrsComplete %>% 
  dplyr::sample_n(size = 10) %>% 
  dplyr::select(info_url, 
                jpg_html) %>% 
  dplyr::count(jpg_html)

0.6 Export the data with a date stamp

We now have a data frame we can export into a dated folder.

# create data folder
export_path <- "../data/wk10-dont-mess-with-texas/processed/"
fs::dir_create(export_path)
# create today
tahday <- as.character(lubridate::today())
tahday_path <- paste0(export_path, tahday, "/")
tahday_path

## [1] "../data/wk10-dont-mess-with-texas/processed/2021-11-30/"

# create new data folder
fs::dir_create(tahday_path)
# create data path
tahday_data_path <- paste0(tahday_path, "ExOffndrsComplete.csv")
# export these data
vroom::vroom_write(x = ExOffndrsComplete, file = tahday_data_path, delim = ",")
fs::dir_tree(tahday_path, regexpr = "-ExOffndrsComplete.csv")

## ../data/wk10-dont-mess-with-texas/processed/2021-11-30/
## └── ExOffndrsComplete.csv