upload-data
upload-data.RmdMotivation
This vignette walks through the upload_data() function,
which is used for loading a variety of file types into the
dfdiffs shiny application.
library(dfdiffs)
library(shiny)
library(data.table)
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(forcats)
library(glue)
library(purrr)
library(vroom)
library(reactable)
library(haven)
library(readxl)
library(labelled)
library(gt)
library(gtsummary)External Data
Test data can be found in the ../inst/extdata/
folder:
#> ../inst/extdata/
#> ├── csv
#> │ ├── 2010-lahman
#> │ │ ├── Batting.csv
#> │ │ ├── Fielding.csv
#> │ │ └── Master.csv
#> │ ├── 2015-baseballdatabank
#> │ │ ├── Batting.csv
#> │ │ ├── Fielding.csv
#> │ │ └── Master.csv
#> │ ├── 2020-baseballdatabank
#> │ │ ├── Batting.csv
#> │ │ ├── Fielding.csv
#> │ │ └── People.csv
#> │ ├── ChangedData.csv
#> │ ├── InitialData.csv
#> │ ├── by-year
#> │ │ ├── 20
#> │ │ │ ├── PlayerBirth.csv
#> │ │ │ ├── PlayerDebut.csv
#> │ │ │ └── PlayerName.csv
#> │ │ └── 21
#> │ │ ├── PlayerBirth.csv
#> │ │ ├── PlayerDebut.csv
#> │ │ └── PlayerName.csv
#> │ ├── diffs
#> │ │ ├── diff_current.csv
#> │ │ ├── diff_modified_all_raw.csv
#> │ │ └── diff_previous.csv
#> │ └── lahman-people
#> │ ├── People2020.csv
#> │ └── People2021.csv
#> ├── dta
#> │ ├── datetime-d.dta
#> │ ├── iris.dta
#> │ ├── notes.dta
#> │ ├── tagged-na-double.dta
#> │ ├── tagged-na-int.dta
#> │ └── types.dta
#> ├── rdata
#> │ └── proc_app_data.rdata
#> ├── sas7bdat
#> │ ├── datetime.sas7bdat
#> │ ├── formats.sas7bcat
#> │ ├── hadley.sas7bdat
#> │ ├── iris.sas7bdat
#> │ ├── tagged-na.sas7bcat
#> │ └── tagged-na.sas7bdat
#> ├── sav
#> │ ├── datetime.sav
#> │ ├── iris.sav
#> │ ├── labelled-num-na.sav
#> │ ├── labelled-num.sav
#> │ ├── labelled-str.sav
#> │ ├── umlauts.sav
#> │ └── variable-label.sav
#> ├── tsv
#> │ ├── Batting.tsv
#> │ ├── Fielding.tsv
#> │ └── People.tsv
#> ├── txt
#> │ ├── Batting.txt
#> │ ├── Fielding.txt
#> │ └── People.txt
#> └── xlsx
#> ├── compare-report-text.xlsx
#> ├── lahman500.xlsx
#> └── snapshot_compare_270301_20221116_year3_preview_noDAP.xlsx
load_flat_file()
The load_flat_file() function imports all forms of flat
data files.
load_flat_file <- function(path) {
ext <- tools::file_ext(path)
data <- switch(ext,
txt = data.table::fread(path),
csv = data.table::fread(path),
tsv = data.table::fread(path),
sas7bdat = haven::read_sas(data_file = path),
sas7bcat = haven::read_sas(data_file = path),
sav = haven::read_sav(file = path),
dta = haven::read_dta(file = path)
)
return_data <- tibble::as_tibble(data)
return(return_data)
}If the file is an excel file, the name of the sheet should be passed
to sheet.
upload_data <- function(path, sheet = NULL) {
ext <- tools::file_ext(path)
if (ext == "xlsx") {
raw_data <- readxl::read_excel(
path = path,
sheet = sheet
)
uploaded <- tibble::as_tibble(raw_data)
} else {
uploaded <- load_flat_file(path = path)
}
return(uploaded)
}2010 Lahamn CSVs
lahman_2010_csv_paths <- list.files(path = "../inst/extdata/csv/2010-lahman", full.names = TRUE, pattern = ".csv$")
head(lahman_2010_csv_paths)
#> [1] "../inst/extdata/csv/2010-lahman/Batting.csv"
#> [2] "../inst/extdata/csv/2010-lahman/Fielding.csv"
#> [3] "../inst/extdata/csv/2010-lahman/Master.csv"Test this on lahman_2010_csv_paths[3]
master_2010 <- load_flat_file(path = lahman_2010_csv_paths[3])
glimpse(master_2010)
#> Rows: 17,674
#> Columns: 33
#> $ lahmanID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
#> $ playerID <chr> "aaronha01", "aaronto01", "aasedo01", "abadan01", "abadij…
#> $ managerID <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "…
#> $ hofID <chr> "aaronha01h", "", "", "", "", "", "", "", "", "", "", "",…
#> $ birthYear <int> 1934, 1939, 1954, 1972, 1854, 1877, 1869, 1866, 1862, 187…
#> $ birthMonth <int> 2, 8, 9, 8, 11, 4, 11, 10, 3, 10, 2, 8, 9, 6, 2, 9, 9, 7,…
#> $ birthDay <int> 5, 5, 8, 25, 4, 15, 29, 14, 16, 22, 16, 17, 19, 2, 18, 5,…
#> $ birthCountry <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "…
#> $ birthState <chr> "AL", "AL", "CA", "FL", "PA", "PA", "VT", "NE", "OH", "OH…
#> $ birthCity <chr> "Mobile", "Mobile", "Orange", "West Palm Beach", "Philade…
#> $ deathYear <int> NA, 1984, NA, NA, 1905, 1957, 1962, 1926, 1930, 1935, NA,…
#> $ deathMonth <int> NA, 8, NA, NA, 5, 1, 6, 4, 2, 6, NA, NA, NA, NA, NA, 4, N…
#> $ deathDay <int> NA, 16, NA, NA, 17, 6, 11, 27, 13, 11, NA, NA, NA, NA, NA…
#> $ deathCountry <chr> "", "USA", "", "", "USA", "USA", "USA", "USA", "USA", "US…
#> $ deathState <chr> "", "GA", "", "", "NJ", "FL", "VT", "CA", "MI", "CA", "",…
#> $ deathCity <chr> "", "Atlanta", "", "", "Pemberton", "Ft.Lauderdale", "Ess…
#> $ nameFirst <chr> "Hank", "Tommie", "Don", "Andy", "John", "Ed", "Bert", "C…
#> $ nameLast <chr> "Aaron", "Aaron", "Aase", "Abad", "Abadie", "Abbaticchio"…
#> $ nameNote <chr> "", "", "", "", "", "", "", "", "", "born Harry Frederick…
#> $ nameGiven <chr> "Henry Louis", "Tommie Lee", "Donald William", "", "John"…
#> $ nameNick <chr> "Hammer,Hammerin' Hank,Bad Henry", "", "", "", "", "Batty…
#> $ weight <int> 180, 190, 190, 184, 192, 170, 175, 169, 190, 180, 200, 19…
#> $ height <dbl> 72, 75, 75, 73, 72, 71, 71, 68, 71, 70, 78, 74, 75, 71, 7…
#> $ bats <chr> "R", "R", "R", "L", "R", "R", "R", "L", "R", "R", "R", "R…
#> $ throws <chr> "R", "R", "R", "L", "R", "R", "R", "", "R", "R", "R", "L"…
#> $ debut <chr> "4/13/1954 0:00:00", "4/10/1962 0:00:00", "7/26/1977 0:00…
#> $ finalGame <chr> "10/3/1976 0:00:00", "9/26/1971 0:00:00", "10/3/1990 0:00…
#> $ college <chr> "", "", "Cal St. Fullerton", "Middle Georgia JC", "", "",…
#> $ lahman40ID <chr> "aaronha01", "aaronto01", "aasedo01", "abadan01", "abadij…
#> $ lahman45ID <chr> "aaronha01", "aaronto01", "aasedo01", "abadan01", "abadij…
#> $ retroID <chr> "aaroh101", "aarot101", "aased001", "abada001", "abadj101…
#> $ holtzID <chr> "aaronha01", "aaronto01", "aasedo01", "abadan01", "abadij…
#> $ bbrefID <chr> "aaronha01", "aaronto01", "aasedo01", "abadan01", "abadij…2015 Lahamn CSVs
lahman_2015_csv_paths <- list.files(path = "../inst/extdata/csv/2015-baseballdatabank", full.names = TRUE, pattern = ".csv$")
head(lahman_2015_csv_paths)
#> [1] "../inst/extdata/csv/2015-baseballdatabank/Batting.csv"
#> [2] "../inst/extdata/csv/2015-baseballdatabank/Fielding.csv"
#> [3] "../inst/extdata/csv/2015-baseballdatabank/Master.csv"Test this on lahman_2015_csv_paths[3]
master_csv_2015 <- load_flat_file(path = lahman_2015_csv_paths[3])
glimpse(master_csv_2015)
#> Rows: 18,846
#> Columns: 24
#> $ playerID <chr> "aardsda01", "aaronha01", "aaronto01", "aasedo01", "abada…
#> $ birthYear <int> 1981, 1934, 1939, 1954, 1972, 1985, 1854, 1877, 1869, 186…
#> $ birthMonth <int> 12, 2, 8, 9, 8, 12, 11, 4, 11, 10, 3, 10, 2, 8, 9, 6, 2, …
#> $ birthDay <int> 27, 5, 5, 8, 25, 17, 4, 15, 11, 14, 16, 22, 16, 17, 19, 2…
#> $ birthCountry <chr> "USA", "USA", "USA", "USA", "USA", "D.R.", "USA", "USA", …
#> $ birthState <chr> "CO", "AL", "AL", "CA", "FL", "La Romana", "PA", "PA", "V…
#> $ birthCity <chr> "Denver", "Mobile", "Mobile", "Orange", "Palm Beach", "La…
#> $ deathYear <int> NA, NA, 1984, NA, NA, NA, 1905, 1957, 1962, 1926, 1930, 1…
#> $ deathMonth <int> NA, NA, 8, NA, NA, NA, 5, 1, 6, 4, 2, 6, NA, NA, NA, NA, …
#> $ deathDay <int> NA, NA, 16, NA, NA, NA, 17, 6, 11, 27, 13, 11, NA, NA, NA…
#> $ deathCountry <chr> "", "", "USA", "", "", "", "USA", "USA", "USA", "USA", "U…
#> $ deathState <chr> "", "", "GA", "", "", "", "NJ", "FL", "VT", "CA", "MI", "…
#> $ deathCity <chr> "", "", "Atlanta", "", "", "", "Pemberton", "Fort Lauderd…
#> $ nameFirst <chr> "David", "Hank", "Tommie", "Don", "Andy", "Fernando", "Jo…
#> $ nameLast <chr> "Aardsma", "Aaron", "Aaron", "Aase", "Abad", "Abad", "Aba…
#> $ nameGiven <chr> "David Allan", "Henry Louis", "Tommie Lee", "Donald Willi…
#> $ weight <int> 220, 180, 190, 190, 184, 220, 192, 170, 175, 169, 190, 18…
#> $ height <int> 75, 72, 75, 75, 73, 73, 72, 71, 71, 68, 71, 70, 78, 74, 7…
#> $ bats <chr> "R", "R", "R", "R", "L", "L", "R", "R", "R", "L", "R", "R…
#> $ throws <chr> "R", "R", "R", "R", "L", "L", "R", "R", "R", "L", "R", "R…
#> $ debut <IDate> 2004-04-06, 1954-04-13, 1962-04-10, 1977-07-26, 2001-09…
#> $ finalGame <IDate> 2015-08-23, 1976-10-03, 1971-09-26, 1990-10-03, 2006-04…
#> $ retroID <chr> "aardd001", "aaroh101", "aarot101", "aased001", "abada001…
#> $ bbrefID <chr> "aardsda01", "aaronha01", "aaronto01", "aasedo01", "abada…List of 2010 csvs
Now we create lahman_2010_csv_files
lahman_2010_csv_files <- map(.x = lahman_2010_csv_paths,
.f = load_flat_file) %>%
set_names(x = ., nm = basename(lahman_2010_csv_paths))
map(lahman_2010_csv_files, names)
#> $Batting.csv
#> [1] "playerID" "yearID" "stint" "teamID" "lgID" "G"
#> [7] "G_batting" "AB" "R" "H" "2B" "3B"
#> [13] "HR" "RBI" "SB" "CS" "BB" "SO"
#> [19] "IBB" "HBP" "SH" "SF" "GIDP" "G_old"
#>
#> $Fielding.csv
#> [1] "playerID" "yearID" "stint" "teamID" "lgID" "POS"
#> [7] "G" "GS" "InnOuts" "PO" "A" "E"
#> [13] "DP" "PB" "WP" "SB" "CS" "ZR"
#>
#> $Master.csv
#> [1] "lahmanID" "playerID" "managerID" "hofID" "birthYear"
#> [6] "birthMonth" "birthDay" "birthCountry" "birthState" "birthCity"
#> [11] "deathYear" "deathMonth" "deathDay" "deathCountry" "deathState"
#> [16] "deathCity" "nameFirst" "nameLast" "nameNote" "nameGiven"
#> [21] "nameNick" "weight" "height" "bats" "throws"
#> [26] "debut" "finalGame" "college" "lahman40ID" "lahman45ID"
#> [31] "retroID" "holtzID" "bbrefID"We’ll test this on the map_df() function.
tbl_2010_csv_files <- lahman_2010_csv_paths %>%
set_names() %>%
map_df(.x = .,
.f = load_flat_file, .id = "source") %>%
mutate(source = basename(source))
tbl_2010_csv_files %>% count(source)
#> # A tibble: 3 × 2
#> source n
#> <chr> <int>
#> 1 Batting.csv 93955
#> 2 Fielding.csv 160710
#> 3 Master.csv 17674Test on dta
dta_paths <- list.files(path = "../inst/extdata/dta",
full.names = TRUE, pattern = ".dta$")
tbl_dta_files <- dta_paths %>%
set_names() %>%
map_df(.x = .,
.f = load_flat_file, .id = "source") %>%
mutate(source = basename(source))
#> Registered S3 methods overwritten by 'readr':
#> method from
#> as.data.frame.spec_tbl_df vroom
#> as_tibble.spec_tbl_df vroom
#> format.col_spec vroom
#> print.col_spec vroom
#> print.collector vroom
#> print.date_names vroom
#> print.locale vroom
#> str.col_spec vroom
tbl_dta_files %>% count(source)
#> # A tibble: 6 × 2
#> source n
#> <chr> <int>
#> 1 datetime-d.dta 1
#> 2 iris.dta 150
#> 3 notes.dta 5
#> 4 tagged-na-double.dta 8
#> 5 tagged-na-int.dta 8
#> 6 types.dta 2Test on sas7bdat
sas7bdat_paths <- list.files(path = "../inst/extdata/sas7bdat",
full.names = TRUE, pattern = ".sas7bdat$")
tbl_sas7bdat_files <- sas7bdat_paths %>%
set_names() %>%
map_df(.x = .,
.f = load_flat_file, .id = "source") %>%
mutate(source = basename(source))
tbl_sas7bdat_files %>% count(source)
#> # A tibble: 4 × 2
#> source n
#> <chr> <int>
#> 1 datetime.sas7bdat 4
#> 2 hadley.sas7bdat 8
#> 3 iris.sas7bdat 150
#> 4 tagged-na.sas7bdat 8Test on sav
sav_paths <- list.files(path = "../inst/extdata/sav",
full.names = TRUE, pattern = ".sav$")
tbl_sav_files <- sav_paths %>%
set_names() %>%
map_df(.x = .,
.f = load_flat_file, .id = "source") %>%
mutate(source = basename(source))
tbl_sav_files %>% count(source)
#> # A tibble: 7 × 2
#> source n
#> <chr> <int>
#> 1 datetime.sav 2
#> 2 iris.sav 150
#> 3 labelled-num-na.sav 2
#> 4 labelled-num.sav 1
#> 5 labelled-str.sav 2
#> 6 umlauts.sav 4
#> 7 variable-label.sav 1Test on tsv
tsv_paths <- list.files(path = "../inst/extdata/tsv",
full.names = TRUE, pattern = ".tsv$")
tbl_tsv_files <- tsv_paths %>%
set_names() %>%
map_df(.x = .,
.f = load_flat_file, .id = "source") %>%
mutate(source = basename(source))
tbl_tsv_files %>% count(source)
#> # A tibble: 3 × 2
#> source n
#> <chr> <int>
#> 1 Batting.tsv 108789
#> 2 Fielding.tsv 144768
#> 3 People.tsv 20093Test on txt
txt_paths <- list.files(path = "../inst/extdata/txt",
full.names = TRUE, pattern = ".txt$")
tbl_txt_files <- txt_paths %>%
set_names() %>%
map_df(.x = .,
.f = load_flat_file, .id = "source") %>%
mutate(source = basename(source))
tbl_txt_files %>% count(source)
#> # A tibble: 3 × 2
#> source n
#> <chr> <int>
#> 1 Batting.txt 108789
#> 2 Fielding.txt 144768
#> 3 People.txt 20093