Data
data.Rmd
This document covers the #TidyTuesday
data used in the
application.
TidyTuesday R packages
tidytuesdayR
The tidytuesdayR
package is designed for, “providing functions to quickly import data
posted to the Tidy Tuesday repository.”
The tt_load()
function downloads the data using a date
input.
tt_data <- tt_load("2025-07-08")
#> ---- Compiling #TidyTuesday Information for 2025-07-08 ----
#> --- There are 3 files available ---
#>
#>
#> ── Downloading files ───────────────────────────────────────────────────────────
#>
#> 1 of 3: "answers.csv"
#> 2 of 3: "color_ranks.csv"
#> 3 of 3: "users.csv"
The downloaded object is a list, but with a few additional attributes.
str(tt_data)
#> List of 3
#> $ answers : spc_tbl_ [1,058,211 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> ..$ user_id: num [1:1058211] 1 2 2 2 2 2 2 2 4 4 ...
#> ..$ hex : chr [1:1058211] "#8240EA" "#4B31EA" "#584601" "#DA239C" ...
#> ..$ rank : num [1:1058211] 1 3 5 4 1 2 3 3 1 2 ...
#> ..- attr(*, "spec")=
#> .. .. cols(
#> .. .. user_id = col_double(),
#> .. .. hex = col_character(),
#> .. .. rank = col_double()
#> .. .. )
#> ..- attr(*, "problems")=<externalptr>
#> $ color_ranks: spc_tbl_ [949 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> ..$ color: chr [1:949] "purple" "green" "blue" "pink" ...
#> ..$ rank : num [1:949] 1 2 3 4 5 6 7 8 9 10 ...
#> ..$ hex : chr [1:949] "#7e1e9c" "#15b01a" "#0343df" "#ff81c0" ...
#> ..- attr(*, "spec")=
#> .. .. cols(
#> .. .. color = col_character(),
#> .. .. rank = col_double(),
#> .. .. hex = col_character()
#> .. .. )
#> ..- attr(*, "problems")=<externalptr>
#> $ users : spc_tbl_ [152,401 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> ..$ user_id : num [1:152401] 1 2 3 4 5 6 7 8 9 10 ...
#> ..$ monitor : chr [1:152401] "LCD" "LCD" "LCD" "LCD" ...
#> ..$ y_chromosome: num [1:152401] 1 1 1 0 1 1 1 1 1 1 ...
#> ..$ colorblind : num [1:152401] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ spam_prob : num [1:152401] 0.00209 0.07458 0.0164 0.00156 0.00238 ...
#> ..- attr(*, "spec")=
#> .. .. cols(
#> .. .. user_id = col_double(),
#> .. .. monitor = col_character(),
#> .. .. y_chromosome = col_double(),
#> .. .. colorblind = col_double(),
#> .. .. spam_prob = col_double()
#> .. .. )
#> ..- attr(*, "problems")=<externalptr>
#> - attr(*, ".tt")= 'tt' chr [1:3] "answers.csv" "color_ranks.csv" "users.csv"
#> ..- attr(*, ".files")='data.frame': 3 obs. of 3 variables:
#> .. ..$ data_files: chr [1:3] "answers.csv" "color_ranks.csv" "users.csv"
#> .. ..$ data_type : chr [1:3] "csv" "csv" "csv"
#> .. ..$ delim : chr [1:3] "," "," ","
#> ..- attr(*, ".readme")=List of 2
#> .. ..$ node:<externalptr>
#> .. ..$ doc :<externalptr>
#> .. ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
#> ..- attr(*, ".date")= Date[1:1], format: "2025-07-08"
#> - attr(*, "class")= chr "tt_data"
Class
The tt_data
list has it’s own special class
("tt_data"
).
attr(tt_data, "class")
#> [1] "tt_data"
The .tt
attribute prints the available datasets for the
date
.
attr(tt_data, ".tt")
#> Available datasets in this TidyTuesday:
#> answers.csv
#> color_ranks.csv
#> users.csv
#>
The .tt
attribute also lists the available datasets, a
.files
dataset with file type and delim, the
.readme
(as html), and the .date
:
str(attr(tt_data, ".tt"))
#> 'tt' chr [1:3] "answers.csv" "color_ranks.csv" "users.csv"
#> - attr(*, ".files")='data.frame': 3 obs. of 3 variables:
#> ..$ data_files: chr [1:3] "answers.csv" "color_ranks.csv" "users.csv"
#> ..$ data_type : chr [1:3] "csv" "csv" "csv"
#> ..$ delim : chr [1:3] "," "," ","
#> - attr(*, ".readme")=List of 2
#> ..$ node:<externalptr>
#> ..$ doc :<externalptr>
#> ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
#> - attr(*, ".date")= Date[1:1], format: "2025-07-08"
ttmeta
The ttmeta
package provides, “a summary of each weekly TidyTuesday post,
information about the articles and data sources linked in those posts,
and details about the datasets themselves, including variable names and
classes.”
tt_meta <- get_tt_tbl(min_year = 2025L, max_year = this_year())
str(tt_meta)
#> tt_tbl [29 × 8] (S3: tt_tbl/tbl_df/tbl/data.frame)
#> $ year : int [1:29] 2025 2025 2025 2025 2025 2025 2025 2025 2025 2025 ...
#> $ week : int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
#> $ date : Date[1:29], format: "2025-01-07" "2025-01-14" ...
#> $ title : chr [1:29] "Bring your own data from 2024!" "posit::conf talks" "The History of Himalayan Mountaineering Expeditions" "Water Insecurity" ...
#> $ source_title : chr [1:29] "NA" "posit::conf attendee portal 2023, posit::conf attendee portal 2024" "The Himalayan Database" "US Census Data from tidycensus" ...
#> $ source_urls :List of 29
#> ..$ : chr(0)
#> ..$ : chr [1:2] "https://reg.conf.posit.co/flow/posit/positconf23/attendee-portal/page/sessioncatalog" "https://reg.conf.posit.co/flow/posit/positconf24/attendee-portal/page/sessioncatalog"
#> ..$ : chr "https://www.himalayandatabase.com/downloads.html"
#> ..$ : chr "https://cran.r-project.org/package=tidycensus"
#> ..$ : chr "https://www.kaggle.com/datasets/prashant111/the-simpsons-dataset"
#> ..$ : chr "https://archive.org/details/20250128-cdc-datasets"
#> ..$ : chr "https://cde.ucr.cjis.gov/LATEST/webapp/#/pages/docApi"
#> ..$ : chr "https://www.ajog.org/article/S0002-9378(24)00775-0/fulltext"
#> ..$ : chr "https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/"
#> ..$ : chr "https://erictleung.com/pixarfilms/index.html"
#> ..$ : chr "https://github.com/EmilHvitfeldt/palmtrees"
#> ..$ : chr "https://ir.aboutamazon.com/annual-reports-proxies-and-shareholder-letters/default.aspx"
#> ..$ : chr "https://github.com/williamorim/pokemon/"
#> ..$ : chr "https://data.cms.gov/provider-data/dataset/apyc-v239"
#> ..$ : chr "https://www.r-project.org/"
#> ..$ : chr "https://osf.io/qnrg6/"
#> ..$ : chr "https://user2025.r-project.org/"
#> ..$ : chr "https://grant-watch.us/nsf-data.html"
#> ..$ : chr "https://www.ingv.it/"
#> ..$ : chr [1:2] "https://www.beachwatch.nsw.gov.au/waterMonitoring/waterQualityData" "https://open-meteo.com/"
#> ..$ : chr "https://www.dndbeyond.com/srd"
#> ..$ : chr "https://docs.ropensci.org/gutenbergr/"
#> ..$ : chr "https://github.com/ropensci/historydata"
#> ..$ : chr "https://apis.guru"
#> ..$ : chr "https://immunizationdata.who.int/global?topic=Provisional-measles-and-rubella-data&location="
#> ..$ : chr "https://www.eia.gov/dnav/pet/xls/PET_PRI_GND_DCUS_NUS_W.xls"
#> ..$ : chr "https://xkcd.com/color/colorsurvey.tar.gz"
#> ..$ : chr "https://docs.google.com/spreadsheets/d/1uxjiuWYZrALF2mthmiYbUPieu1dEdEwv9GB8dEAizso/edit?gid=0#gid=0"
#> ..$ : chr "https://data.ny.gov/Transportation/MTA-Permanent-Art-Catalog-Beginning-1980/4y8j-9pkd/about_data"
#> $ article_title: chr [1:29] "NA" "posit::conf(2025) in-person registration is now open!" "The Expedition Archives of Elizabeth Hawley" "Mapping water insecurity in R with tidycensus" ...
#> $ article_urls :List of 29
#> ..$ : chr(0)
#> ..$ : chr "https://posit.co/blog/positconf2025-in-person-registration-is-now-open/"
#> ..$ : chr "https://www.himalayandatabase.com/index.html"
#> ..$ : chr "https://waterdata.usgs.gov/blog/acs-maps/"
#> ..$ : chr "https://toddwschneider.com/posts/the-simpsons-by-the-data/"
#> ..$ : chr "https://www.npr.org/sections/shots-health-news/2025/01/31/nx-s1-5282274/trump-administration-purges-health-websites"
#> ..$ : chr "https://le.fbi.gov/cjis-division/cjis-link/uniform-crime-reporting-program-still-vital-after-90-years-"
#> ..$ : chr "https://katcorr.github.io/this-art-is-HARD/"
#> ..$ : chr "https://www.longbeach.gov/press-releases/long-beach-animal-care-services-hits-highest-adoption-rate-ever-surpas"| __truncated__
#> ..$ : chr "https://erictleung.com/pixarfilms/articles/pixar_film_ratings.html"
#> ..$ : chr "https://www.nature.com/articles/s41597-019-0189-0"
#> ..$ : chr "https://gregoryvdvinne.github.io/Text-Mining-Amazon-Budgets.html"
#> ..$ : chr "https://medium.com/@hanahshih46/pokemon-data-visualization-and-analysis-with-r-60970c8e37f4"
#> ..$ : chr "https://www.visualcapitalist.com/mapped-emergency-room-visit-times-by-state/"
#> ..$ : chr "https://zenodo.org/records/14902740"
#> ..$ : chr "https://osf.io/preprints/osf/tzcsy_v1"
#> ..$ : chr "https://user2025.r-project.org/"
#> ..$ : chr "https://www.nytimes.com/2025/04/22/science/trump-national-science-foundation-grants.html"
#> ..$ : chr "https://www.ingv.it/somma-vesuvio"
#> ..$ : chr "https://www.abc.net.au/news/2025-01-10/pollution-risks-in-sydney-beaches-contaminated-waterways-rain/104790856"
#> ..$ : chr "https://www.dndbeyond.com/posts/1949-you-can-now-publish-your-own-creations-using-the"
#> ..$ : chr "https://www.gutenberg.org/about/background/50years.html"
#> ..$ : chr "https://ropensci.org/blog/2023/02/07/what-does-it-mean-to-maintain-a-package/"
#> ..$ : chr "https://dslc-io.github.io/club-wapir/slides/intro.html"
#> ..$ : chr "https://abcnews.go.com/Health/measles-cases-reach-1046-us-infections-confirmed-30/story?id=122108194"
#> ..$ : chr "https://www.eia.gov/petroleum/gasdiesel/"
#> ..$ : chr "https://blog.xkcd.com/2010/05/03/color-survey-results/"
#> ..$ : chr "https://anjackson.net/2024/11/29/british-library-funding-breakdown-trends/#income-streams"
#> ..$ : chr "https://www.mta.info/agency/arts-design/permanent-art"
GitHub URLs
The #TidyTuesday
repository has a
.csv file with the Week
, Date
,
year
, data_files
, data_type
, and
delim
:
This file is used to create the tt_github_urls
data:
tt_github_urls <- vroom::vroom(
file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/static/tt_data_type.csv",
delim = ",") |>
# create github_url column
dplyr::mutate(
github_url = glue::glue(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/{year}/{Date}/{data_files}"
)
) |>
# clean names
dplyr::rename(
week = Week,
date = Date
)
#> Rows: 771 Columns: 6
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (3): data_files, data_type, delim
#> dbl (2): Week, year
#> date (1): Date
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dplyr::glimpse(tt_github_urls)
#> Rows: 771
#> Columns: 7
#> $ week <dbl> 29, 29, 28, 27, 27, 27, 26, 25, 25, 24, 24, 24, 24, 24, 23,…
#> $ date <date> 2025-07-22, 2025-07-22, 2025-07-15, 2025-07-08, 2025-07-08…
#> $ year <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025,…
#> $ data_files <chr> "mta_art.csv", "station_lines.csv", "bl_funding.csv", "answ…
#> $ data_type <chr> "csv", "csv", "csv", "csv", "csv", "csv", "csv", "csv", "cs…
#> $ delim <chr> ",", ",", ",", ",", ",", ",", ",", ",", ",", ",", ",", ",",…
#> $ github_url <glue> "https://raw.githubusercontent.com/rfordatascience/tidytue…
The tt_github_urls
data comes from the .csv
file in TidyTuesday repo:
All TidyTuesday Data
The all_tt_data.rda
dataset contains all years (2018 -
current) from the ttmeta::get_tt_tbl()
function. This
datasets contain the following columns:
dplyr::glimpse(all_tt_data)
#> Rows: 378
#> Columns: 9
#> $ year <int> 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 20…
#> $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
#> $ date <date> 2025-01-07, 2025-01-14, 2025-01-21, 2025-01-28, 2025-02…
#> $ title <chr> "Bring your own data from 2024!", "posit::conf talks", "…
#> $ clean_title <chr> "bring_your_own_data_from_2024", "posit_conf_talks", "th…
#> $ source_title <chr> "NA", "posit::conf attendee portal 2023, posit::conf att…
#> $ source_urls <list> <>, <"https://reg.conf.posit.co/flow/posit/positconf23/…
#> $ article_title <chr> "NA", "posit::conf(2025) in-person registration is now o…
#> $ article_urls <list> <>, "https://posit.co/blog/positconf2025-in-person-regi…
All TidyTuesday Meta
The all_tt_meta.rda
dataset contains all years (2018 -
current) from the ttmeta::load_tt_datasets_metadata()
. This
dataset the following columns:
dplyr::glimpse(all_tt_meta)
#> Rows: 765
#> Columns: 6
#> $ year <int> 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025,…
#> $ week <int> 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 8, 8, 9,…
#> $ dataset_name <chr> NA, "conf2023", "conf2024", "exped_tidy", "peaks_tidy…
#> $ variables <int> NA, 9, 5, 69, 29, 7, 7, 4, 14, 3, 13, 27, 6, 6, 10, 6…
#> $ observations <int> NA, 116, 106, 882, 480, 848, 854, 6722, 151, 4459, 31…
#> $ variable_details <list> <NULL>, [<tbl_df[9 x 6]>], [<tbl_df[5 x 6]>], [<tbl_…
All TidyTuesday Variable Details
The all_tt_var_details.rda
dataset contains the
following columns:
dplyr::glimpse(all_tt_var_details)
#> Rows: 8,958
#> Columns: 11
#> $ year <int> 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 202…
#> $ week <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, …
#> $ dataset_name <chr> "conf2023", "conf2023", "conf2023", "conf2023", "conf2023…
#> $ variables <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5, 5, 69, 69, 69, 69,…
#> $ observations <int> 116, 116, 116, 116, 116, 116, 116, 116, 116, 106, 106, 10…
#> $ variable <chr> "speaker_name", "speaker_affiliation", "session_type", "s…
#> $ class <chr> "character", "character", "character", "character", "char…
#> $ n_unique <int> 115, 81, 3, 110, 28, 2, 39, 3, 110, 106, 105, 25, 102, 10…
#> $ min <list> "Aaron Chafetz", "A Plus Associates, Posit PBC(Contracto…
#> $ max <list> "Wyl Schuth", "Washington State Department of Agricultur…
#> $ description <chr> "The name of the speaker. The data is indexed by this fie…
All TidyTuesday Combined
The all_tt_combined.rda
dataset is
all_tt_meta
left-joined with all_tt_data
by
year
and week
.
dplyr::glimpse(all_tt_combined)
#> Rows: 24,450
#> Columns: 20
#> $ title <chr> "posit::conf talks", "posit::conf talks", "posit::conf t…
#> $ clean_title <chr> "posit_conf_talks", "posit_conf_talks", "posit_conf_talk…
#> $ dataset_name <chr> "conf2023", "conf2023", "conf2023", "conf2023", "conf202…
#> $ year <dbl> 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 2025, 20…
#> $ week <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
#> $ date <date> 2025-01-14, 2025-01-14, 2025-01-14, 2025-01-14, 2025-01…
#> $ variables <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5,…
#> $ observations <int> 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 1…
#> $ variable <chr> "speaker_name", "speaker_name", "speaker_affiliation", "…
#> $ class <chr> "character", "character", "character", "character", "cha…
#> $ n_unique <int> 115, 115, 81, 81, 3, 3, 110, 110, 28, 28, 2, 2, 39, 39, …
#> $ min <list> "Aaron Chafetz", "Aaron Chafetz", "A Plus Associates, P…
#> $ max <list> "Wyl Schuth", "Wyl Schuth", "Washington State Departmen…
#> $ description <chr> "The name of the speaker. The data is indexed by this fi…
#> $ source_title <chr> "posit::conf attendee portal 2023, posit::conf attendee …
#> $ article_title <chr> "posit::conf(2025) in-person registration is now open!",…
#> $ data_files <chr> "conf2023.csv", "conf2024.csv", "conf2023.csv", "conf202…
#> $ data_type <chr> "csv", "csv", "csv", "csv", "csv", "csv", "csv", "csv", …
#> $ delim <chr> ",", ",", ",", ",", ",", ",", ",", ",", ",", ",", ",", "…
#> $ github_url <glue> "https://raw.githubusercontent.com/rfordatascience/tidy…
The clean_title
variable has been added to create an
attribute we can use to join to the all_tt_var_details
data.
load_tt_data()
The load_tt_data()
function uses the title
from all_tt_combined
to the return the datasets from the
GitHub repo.
A single dataset
If the tile contains a single dataset, the list returns
ttd
with a single element:
ttd <- load_tt_data("Netflix Titles")
#> INFO [2025-07-24 06:20:37] Starting import for netflix_titles.csv from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2021/2021-04-20/netflix_titles.csv
#> SUCCESS [2025-07-24 06:20:37] Successfully loaded netflix_titles.csv
ttd_nms <- names(ttd)
ttd_nms
#> [1] "netflix_titles.csv"
The clean_title
attribute can be used to join each
dataset back to all_tt_combined
or
all_tt_data
:
attr(ttd[["netflix_titles.csv"]], "clean_title")
#> [1] "netflix_titles"
Two datasets
If ttd
has two datasets, we can subset the list with the
name position:
ttd2 <- load_tt_data("Space Launches")
#> INFO [2025-07-24 06:20:38] Starting import for agencies.csv from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2019/2019-01-15/agencies.csv
#> SUCCESS [2025-07-24 06:20:38] Successfully loaded agencies.csv
#> INFO [2025-07-24 06:20:38] Starting import for launches.csv from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2019/2019-01-15/launches.csv
#> SUCCESS [2025-07-24 06:20:38] Successfully loaded launches.csv
# store names
ttd2_nms <- names(ttd2)
# check attr
purrr::map(.x = ttd2, .f = attr, "clean_title")
#> $agencies.csv
#> [1] "space_launches"
#>
#> $launches.csv
#> [1] "space_launches"
# check first dataset
dplyr::glimpse(ttd2[[ttd2_nms[1]]])
#> Rows: 74
#> Columns: 19
#> $ agency <chr> "RVSN", "UNKS", "NASA", "USAF", "AE", "AFSC", "VKSR…
#> $ count <dbl> 1528, 904, 469, 388, 258, 247, 200, 181, 128, 105, …
#> $ ucode <chr> "RVSN", "GUKOS", "NASA", "USAF", "AE", "AFSC", "GUK…
#> $ state_code <chr> "SU", "SU", "US", "US", "F", "US", "RU", "CN", "RU"…
#> $ type <chr> "O/LA", "O/LA", "O/LA/LV/PL/S", "O/LA/S", "O/LA", "…
#> $ class <chr> "D", "D", "C", "D", "B", "D", "D", "C", "C", "B", "…
#> $ tstart <chr> "1960", "1986 Apr 24", "1958 Oct 1", "1947 Sep 18"…
#> $ tstop <chr> "1991 Dec", "1991", "-", "-", "*", "1992 Jul 1", "…
#> $ short_name <chr> "RVSN", "UNKS", "NASA", "USAF", "Arianespace", "AFS…
#> $ name <chr> "Rakentiye Voiska Strategicheskogo Naznacheniye", "…
#> $ location <chr> "Mosvka?", "Moskva", "Washington, D.C.", "Washingto…
#> $ longitude <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "…
#> $ latitude <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "…
#> $ error <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "…
#> $ parent <chr> "-", "MO", "-", "-", "-", "USAF", "RVSN", "CASC", "…
#> $ short_english_name <chr> "-", "-", "-", "-", "Arianespace", "-", "-", "CALT"…
#> $ english_name <chr> "Strategic Rocket Forces", "-", "-", "-", "-", "-",…
#> $ unicode_name <chr> "Ракетные войска стратегического назначения", "Упра…
#> $ agency_type <chr> "state", "state", "state", "state", "private", "sta…
# check second dataset
dplyr::glimpse(ttd2[[ttd2_nms[2]]])
#> Rows: 5,726
#> Columns: 11
#> $ tag <chr> "1967-065", "1967-080", "1967-096", "1968-042", "1968-092"…
#> $ JD <dbl> 2439671, 2439726, 2439775, 2440000, 2440153, 2440426, 2440…
#> $ launch_date <date> 1967-06-29, 1967-08-23, 1967-10-11, 1968-05-23, 1968-10-2…
#> $ launch_year <dbl> 1967, 1967, 1967, 1968, 1968, 1969, 1970, 1970, 1971, 1971…
#> $ type <chr> "Thor Burner 2", "Thor Burner 2", "Thor Burner 2", "Thor B…
#> $ variant <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
#> $ mission <chr> "Secor Type II S/N 10", "DAPP 3419", "DAPP 4417", "DAPP 54…
#> $ agency <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US"…
#> $ state_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US"…
#> $ category <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"…
#> $ agency_type <chr> "state", "state", "state", "state", "state", "state", "sta…
Three datasets
If there are three datasets, we can see the names are assigned to each element in the list:
ttd3 <- load_tt_data("Moore’s Law")
#> INFO [2025-07-24 06:20:38] Starting import for cpu.csv from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2019/2019-09-03/cpu.csv
#> SUCCESS [2025-07-24 06:20:38] Successfully loaded cpu.csv
#> INFO [2025-07-24 06:20:38] Starting import for gpu.csv from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2019/2019-09-03/gpu.csv
#> SUCCESS [2025-07-24 06:20:38] Successfully loaded gpu.csv
#> INFO [2025-07-24 06:20:38] Starting import for ram.csv from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2019/2019-09-03/ram.csv
#> SUCCESS [2025-07-24 06:20:38] Successfully loaded ram.csv
ttd3_nms <- names(ttd3)
# attr
purrr::map(.x = ttd3, .f = attr, "clean_title")
#> $cpu.csv
#> [1] "moores_law"
#>
#> $gpu.csv
#> [1] "moores_law"
#>
#> $ram.csv
#> [1] "moores_law"
dplyr::glimpse(ttd3[[ttd3_nms[1]]])
#> Rows: 176
#> Columns: 6
#> $ processor <chr> "MP944 (20-bit, 6-chip)", "Intel 4004 (4-bit, 16-…
#> $ transistor_count <dbl> NA, 2250, 3500, 2500, 2800, 3000, 4100, 6000, 800…
#> $ date_of_introduction <dbl> 1970, 1971, 1972, 1973, 1973, 1974, 1974, 1974, 1…
#> $ designer <chr> "Garrett AiResearch", "Intel", "Intel", "NEC", "T…
#> $ process <dbl> NA, 10000, 10000, 7500, 6000, 10000, 6000, 6000, …
#> $ area <dbl> NA, 12, 14, NA, 32, 12, 16, 20, 11, 21, NA, NA, 2…
dplyr::glimpse(ttd3[[ttd3_nms[2]]])
#> Rows: 112
#> Columns: 8
#> $ processor <chr> "µPD7220 GDC", "ARTC HD63484", "YM7101 VDP", "Tom…
#> $ transistor_count <dbl> 4.0e+04, 6.0e+04, 1.0e+05, 7.5e+05, 1.0e+06, 1.0e…
#> $ date_of_introduction <dbl> 1982, 1984, 1988, 1993, 1994, 1994, 1995, 1996, 1…
#> $ designer_s <chr> "NEC", "Hitachi", "Sega", "Flare", "Sega", "Toshi…
#> $ manufacturer_s <chr> "NEC", "Hitachi", "Yamaha", "IBM", "Hitachi", "LS…
#> $ process <dbl> 5000, NA, NA, NA, 500, 500, 500, 350, 350, 500, 5…
#> $ area <dbl> NA, NA, NA, NA, NA, NA, 90, 81, NA, NA, NA, 90, 1…
#> $ ref <chr> "[107]", "[108]", "[109]", "[109]", "[110][111]",…
dplyr::glimpse(ttd3[[ttd3_nms[3]]])
#> Rows: 47
#> Columns: 10
#> $ chip_name <chr> "N/A", "N/A", "?", "SP95", "TMC3162", "?", "?", "…
#> $ capacity_bits <dbl> 1, 1, 8, 16, 16, NA, 256, 64, 144, 256, 1, 1, 1, …
#> $ bit_units <chr> "Bits", "Bits", "Bits", "Bits", "Bits", NA, "Bits…
#> $ ram_type <chr> "SRAM (cell)", "DRAM (cell)", "SRAM (bipolar)", "…
#> $ transistor_count <dbl> 6, 1, 48, 80, 96, NA, 256, 384, 864, 1536, 768, 3…
#> $ date_of_introduction <dbl> 1963, 1965, 1965, 1965, 1966, 1966, 1968, 1968, 1…
#> $ manufacturer_s <chr> "Fairchild", "Toshiba", "SDS, Signetics", "IBM", …
#> $ process <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 12000, NA, 80…
#> $ area <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, N…
#> $ ref <chr> "[162]", "[163][164]", "[162]", "[165]", "[160]",…
Many datasets
If there are more than two datasets, the same rules apply.
ttd_many <- load_tt_data("LEGO database")
#> INFO [2025-07-24 06:20:38] Starting import for colors.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/colors.csv.gz
#> SUCCESS [2025-07-24 06:20:39] Successfully loaded colors.csv.gz
#> INFO [2025-07-24 06:20:39] Starting import for elements.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/elements.csv.gz
#> SUCCESS [2025-07-24 06:20:39] Successfully loaded elements.csv.gz
#> INFO [2025-07-24 06:20:39] Starting import for inventories.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/inventories.csv.gz
#> SUCCESS [2025-07-24 06:20:39] Successfully loaded inventories.csv.gz
#> INFO [2025-07-24 06:20:39] Starting import for inventory_minifigs.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/inventory_minifigs.csv.gz
#> SUCCESS [2025-07-24 06:20:39] Successfully loaded inventory_minifigs.csv.gz
#> INFO [2025-07-24 06:20:39] Starting import for inventory_parts.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/inventory_parts.csv.gz
#> SUCCESS [2025-07-24 06:20:40] Successfully loaded inventory_parts.csv.gz
#> INFO [2025-07-24 06:20:40] Starting import for inventory_sets.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/inventory_sets.csv.gz
#> SUCCESS [2025-07-24 06:20:40] Successfully loaded inventory_sets.csv.gz
#> INFO [2025-07-24 06:20:40] Starting import for minifigs.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/minifigs.csv.gz
#> SUCCESS [2025-07-24 06:20:40] Successfully loaded minifigs.csv.gz
#> INFO [2025-07-24 06:20:40] Starting import for part_categories.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/part_categories.csv.gz
#> SUCCESS [2025-07-24 06:20:40] Successfully loaded part_categories.csv.gz
#> INFO [2025-07-24 06:20:40] Starting import for part_relationships.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/part_relationships.csv.gz
#> SUCCESS [2025-07-24 06:20:41] Successfully loaded part_relationships.csv.gz
#> INFO [2025-07-24 06:20:41] Starting import for parts.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/parts.csv.gz
#> SUCCESS [2025-07-24 06:20:41] Successfully loaded parts.csv.gz
#> INFO [2025-07-24 06:20:41] Starting import for sets.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/sets.csv.gz
#> SUCCESS [2025-07-24 06:20:41] Successfully loaded sets.csv.gz
#> INFO [2025-07-24 06:20:41] Starting import for themes.csv.gz from https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/main/data/2022/2022-09-06/themes.csv.gz
#> SUCCESS [2025-07-24 06:20:41] Successfully loaded themes.csv.gz
ttd_many_nms <- names(ttd_many)
# check attr
purrr::map(.x = ttd_many, .f = attr, "clean_title")
#> $colors.csv.gz
#> [1] "lego_database"
#>
#> $elements.csv.gz
#> [1] "lego_database"
#>
#> $inventories.csv.gz
#> [1] "lego_database"
#>
#> $inventory_minifigs.csv.gz
#> [1] "lego_database"
#>
#> $inventory_parts.csv.gz
#> [1] "lego_database"
#>
#> $inventory_sets.csv.gz
#> [1] "lego_database"
#>
#> $minifigs.csv.gz
#> [1] "lego_database"
#>
#> $part_categories.csv.gz
#> [1] "lego_database"
#>
#> $part_relationships.csv.gz
#> [1] "lego_database"
#>
#> $parts.csv.gz
#> [1] "lego_database"
#>
#> $sets.csv.gz
#> [1] "lego_database"
#>
#> $themes.csv.gz
#> [1] "lego_database"
Meta data
The get_tt_title_meta()
function returns info on the
columns in each dataset in a ttd
list.
ttd_meta <- get_tt_title_meta(ttd = ttd)
#> INFO [2025-07-24 06:20:41] Datasets in list: netflix_titles.csv
#> INFO [2025-07-24 06:20:41] Created metadata tibble with 14 rows covering 1 datasets
ttd_meta
#> # A tibble: 14 × 4
#> clean_title dataset col col_type
#> <chr> <chr> <chr> <chr>
#> 1 netflix_titles netflix_titles.csv release_year numeric
#> 2 netflix_titles netflix_titles.csv NA logical
#> 3 netflix_titles netflix_titles.csv show_id character
#> 4 netflix_titles netflix_titles.csv type character
#> 5 netflix_titles netflix_titles.csv title character
#> 6 netflix_titles netflix_titles.csv director character
#> 7 netflix_titles netflix_titles.csv cast character
#> 8 netflix_titles netflix_titles.csv country character
#> 9 netflix_titles netflix_titles.csv date_added character
#> 10 netflix_titles netflix_titles.csv rating character
#> 11 netflix_titles netflix_titles.csv duration character
#> 12 netflix_titles netflix_titles.csv listed_in character
#> 13 netflix_titles netflix_titles.csv description character
#> 14 netflix_titles netflix_titles.csv NA list
ttd_meta |>
dplyr::filter(col_type == "character") |>
dplyr::count(clean_title, dataset, col_type, name = "chr_cols") |>
dplyr::arrange(desc(chr_cols))
#> # A tibble: 1 × 4
#> clean_title dataset col_type chr_cols
#> <chr> <chr> <chr> <int>
#> 1 netflix_titles netflix_titles.csv character 11
ttd2_meta <- get_tt_title_meta(ttd = ttd2)
#> INFO [2025-07-24 06:20:41] Datasets in list: agencies.csv, launches.csv
#> INFO [2025-07-24 06:20:41] Created metadata tibble with 33 rows covering 2 datasets
head(ttd2_meta)
#> # A tibble: 6 × 4
#> clean_title dataset col col_type
#> <chr> <chr> <chr> <chr>
#> 1 space_launches agencies.csv count numeric
#> 2 space_launches agencies.csv NA logical
#> 3 space_launches agencies.csv agency character
#> 4 space_launches agencies.csv ucode character
#> 5 space_launches agencies.csv state_code character
#> 6 space_launches agencies.csv type character
ttd3_meta <- get_tt_title_meta(ttd = ttd3)
#> INFO [2025-07-24 06:20:42] Datasets in list: cpu.csv, gpu.csv, ram.csv
#> INFO [2025-07-24 06:20:42] Created metadata tibble with 30 rows covering 3 datasets
head(ttd3_meta)
#> # A tibble: 6 × 4
#> clean_title dataset col col_type
#> <chr> <chr> <chr> <chr>
#> 1 moores_law cpu.csv transistor_count numeric
#> 2 moores_law cpu.csv date_of_introduction numeric
#> 3 moores_law cpu.csv process numeric
#> 4 moores_law cpu.csv area numeric
#> 5 moores_law cpu.csv NA logical
#> 6 moores_law cpu.csv processor character
ttd_many_meta <- get_tt_title_meta(ttd = ttd_many)
#> INFO [2025-07-24 06:20:42] Datasets in list: colors.csv.gz, elements.csv.gz, inventories.csv.gz, inventory_minifigs.csv.gz, inventory_parts.csv.gz, inventory_sets.csv.gz, minifigs.csv.gz, part_categories.csv.gz, part_relationships.csv.gz, parts.csv.gz, sets.csv.gz, themes.csv.gz
#> INFO [2025-07-24 06:20:42] Created metadata tibble with 67 rows covering 12 datasets
head(ttd_many_meta)
#> # A tibble: 6 × 4
#> clean_title dataset col col_type
#> <chr> <chr> <chr> <chr>
#> 1 lego_database colors.csv.gz id numeric
#> 2 lego_database colors.csv.gz is_trans logical
#> 3 lego_database colors.csv.gz name character
#> 4 lego_database colors.csv.gz rgb character
#> 5 lego_database colors.csv.gz NA list
#> 6 lego_database elements.csv.gz element_id numeric
Below is the number of numeric columns per dataset
and
clean_title
:
ttd_many_meta |>
dplyr::filter(col_type == "numeric") |>
dplyr::count(clean_title, dataset, col_type, name = "num_cols") |>
dplyr::arrange(desc(num_cols))
#> # A tibble: 12 × 4
#> clean_title dataset col_type num_cols
#> <chr> <chr> <chr> <int>
#> 1 lego_database inventory_parts.csv.gz numeric 3
#> 2 lego_database sets.csv.gz numeric 3
#> 3 lego_database elements.csv.gz numeric 2
#> 4 lego_database inventories.csv.gz numeric 2
#> 5 lego_database inventory_minifigs.csv.gz numeric 2
#> 6 lego_database inventory_sets.csv.gz numeric 2
#> 7 lego_database themes.csv.gz numeric 2
#> 8 lego_database colors.csv.gz numeric 1
#> 9 lego_database minifigs.csv.gz numeric 1
#> 10 lego_database part_categories.csv.gz numeric 1
#> 11 lego_database part_relationships.csv.gz numeric 1
#> 12 lego_database parts.csv.gz numeric 1