Creating a table with the {gt} package

Author

Matthew Harris

Published

May 29, 2022

Abstract
Summarizing real estate data with the {gt} package.

Intro

The gt package is great for creating clean and informative tables. In this post I’ll being using more data from the Arlington Country Open Data Portal to create my own table.

Packages

Code
library(dplyr)
library(tidyr)
library(httr2)
library(jsonlite)
library(readr)
library(purrr)
library(stringr)
library(lubridate)
library(glue)
library(scales)
library(gt)
library(blsR)
library(janitor)
library(webshot)

Load Custom Palette

Using a custom ggplot2 color palette that I created based on the Death Stranding color scheme.

Code
raw_source <- "?raw=true"
source(glue("https://github.com/mhdemo/custom_palette_collection/blob/main/palettes/deathstranding.R{raw_source}"))

API Calls

Function

Creating a function to create calls to the Arlington County Open Data API.

Code
# The order by argument is necessary to ensure that duplicate rows aren't
# introduced when using pagination
arl_odp <- function(resource, ..., skip = 0, top = 10, orderby = NULL, f_select = NULL, r_filter = NULL) {
  params <- list(
    skip = skip,
    top = top,
    orderby = orderby,
    select = f_select,
    filter = r_filter
  ) %>%
    set_names(nm = paste0("$", names(.)))
  
  arl_resp <- request("https://datahub-v2.arlingtonva.us/api/") %>%
    req_url_path_append(resource) %>%
    req_url_query(!!!params) %>%
    req_perform() %>%
    resp_body_string() %>%
    fromJSON()
  
  if(is.data.frame(arl_resp)) {
    return(arl_resp %>% clean_names())
  } else {
    return()
  }

}

Property Data

Retrieving the property and sale history records.

Code
property_df <- map2_df(
  .x = seq(0, 75000, 10000),
  .y = 10000,
  .f = ~ arl_odp(
    resource = "RealEstate/Property",
    skip = .x,
    top = .y,
    orderby = "realEstatePropertyCode")
  )

Sale Data

Code
sale_hist_df <- map2_df(
  .x = seq(0, 300000, 10000),
  .y = 10000,
  .f = ~ arl_odp(
    resource = "RealEstate/SalesHistory",
    skip = .x,
    top = .y,
    orderby = "salesHistoryKey")
  )

CPI Data

Using the blsR package to create calls to the Bureau of Labor Statistics API.

Code
cpi_monthly <- blsR::get_series_table(
  series_id = "CUUR0000SA0",
  api_key = keyring::key_get("bls_reg_key"), # Enter your own API key here
  start_year = 1970,
  end_year = year(Sys.Date())
)
Code
cpi_avg_annual <- cpi_monthly %>%
  arrange(year) %>%
  group_by(year) %>%
  summarize(avg_cpi = mean(value, na.rm = TRUE))

curr_cpi <- cpi_avg_annual %>%
  tail(1) %>%
  pull(avg_cpi)

cpi_avg_annual <- cpi_avg_annual %>%
  mutate(inf_rate = curr_cpi / avg_cpi)

Data Wrangling

Uses the property data to filter only for property classes and sale types that could be associated with non-commercial sales. I’m also performing some categorization of property classes using case_when() and additional filtering to make the data nice and tidy.

Code
resid_sales <- sale_hist_df %>%
  select(
    real_estate_property_code,
    sale_amt, sale_date,
    sales_type_code,
    sales_type_dsc,
    property_street_nbr_name_text
  ) %>%
  inner_join(
    property_df,
    by = "real_estate_property_code"
  ) %>%
  filter(
    property_class_type_code %in% c(511, 512, 514, 513, 616, 612, 613),
    property_zip_code != 22101,
    sales_type_code %in% c("1", "L", "E", "3", NA),
    sale_amt > 0,
    !is.na(sale_amt)
  ) %>%
  mutate(
    sale_year = year(sale_date),
    sale_decade = sale_year - sale_year %% 10,
    sale_decade_str = glue("{sale_decade}s"),
    prop_class_clean = case_when(
      property_class_type_code == "511" ~ "Single Family Home",
      property_class_type_code %in% c("612", "613", "616") ~ "Condo",
      property_class_type_code == "514" ~ "Side by Side",
      property_class_type_code %in% c("512", "513") ~ "Townhouse"
    )
  ) %>%
  filter(
    sale_year >= 1970,
    sale_amt > 0,
    !is.na(sale_amt)
  ) %>%
  left_join(
    cpi_avg_annual,
    by = c("sale_year" = "year")
  ) %>%
  mutate(sale_amt_inf_adj = inf_rate * sale_amt)

Plots and Tables

Code
prop_class_sales_year <- resid_sales %>%
  group_by(sale_year, prop_class_clean) %>%
  summarize(
    `25th` = quantile(sale_amt_inf_adj, 0.25),
    `50th` = median(sale_amt_inf_adj),
    `75th` = quantile(sale_amt_inf_adj, 0.75),
    .groups = "drop"
  )

p1 <- prop_class_sales_year %>%
  filter(sale_year >= 2000) %>%
  pivot_longer(cols = matches("\\d"), names_to = "metric") %>%
  ggplot(aes(sale_year, value, col = metric)) +
  geom_line(size = 1, alpha = 0.8) +
  scale_y_continuous(labels = comma, breaks = breaks_extended(4)) +
  scale_x_continuous(breaks = breaks_width(5)) +
  scale_color_ds_d() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45)) +
  facet_wrap(~prop_class_clean, scale = "free") +
  labs(x = "Year", y = "Sale Amount", col = "Percentile")

p1

Code
prop_class_sales_dec <- resid_sales %>%
  group_by(sale_decade, sale_decade_str, prop_class_clean) %>%
  summarize(
    `25th` = quantile(sale_amt_inf_adj, 0.25),
    `50th` = median(sale_amt_inf_adj),
    `75th` = quantile(sale_amt_inf_adj, 0.75),
    sale_sd = sd(sale_amt_inf_adj, na.rm = TRUE),
    avg_sale_cnt_per_year = n() / n_distinct(sale_year),
    .groups = "drop"
  )

prop_50_order <- prop_class_sales_dec %>%
  filter(sale_decade == 2020) %>%
  select(prop_class_clean, `50th`) %>%
  arrange(desc(`50th`)) %>%
  pull(prop_class_clean)

housing_gt <- prop_class_sales_dec %>%
  filter(sale_decade %in% c(1990, 2000, 2010, 2020)) %>%
  select(-sale_decade) %>%
  gt(rowname_col = "sale_decade_str", groupname_col = "prop_class_clean") %>%
  row_group_order(groups = prop_50_order) %>%
  tab_header(
    title = md("**Arlington County Real Estate Sale Metrics**"),
    subtitle = "By Property Class and Decade"
  ) %>%
  tab_style(
    style = list(cell_text(color = "#265BB2", size = "x-large")),
    locations = cells_title("title")
  ) %>%
  tab_spanner(
    label = md("**Percentiles**"),
    columns = matches("\\d")
  ) %>%
  cols_label(
    avg_sale_cnt_per_year = md("**Avg # of <br>Sales/Year**"),
    sale_sd = md("**Std Dev**")
  ) %>%
  fmt_currency(
    columns = c(matches("\\d"), sale_sd),
    decimals = 0
  ) %>%
  fmt_number(
    columns = avg_sale_cnt_per_year,
    decimals = 0
  ) %>%
  # Fill cell color based on value
  # data_color(
  #   columns = `50th`,
  #   colors = scales::col_numeric(
  #     palette = ds_palettes[[2]],
  #     domain = NULL  #c(50, 1500)
  #   )
  # ) %>%
  tab_options(
    row_group.font.weight = "bold",
    stub.border.width = 15,
    row_group.padding.horizontal = 2
  ) %>%
  tab_source_note(source_note = "Source: Arlington County Data API was used
                      for this purpose, but it is not endorsed
                      or certified by Arlington County.") %>%
  tab_footnote(
    footnote = "Inflation adjusted to 2022.",
    locations = cells_title(groups = "title")
  )

housing_gt
Arlington County Real Estate Sale Metrics1
By Property Class and Decade
Percentiles Std Dev Avg # of
Sales/Year
25th 50th 75th
Single Family Home
1990s $362,489 $437,128 $557,023 $199,135 1,184
2000s $605,185 $788,944 $998,183 $375,107 1,304
2010s $781,718 $932,970 $1,158,460 $389,199 1,042
2020s $927,177 $1,105,702 $1,450,000 $449,643 902
Townhouse
1990s $347,182 $453,732 $594,645 $214,455 220
2000s $568,958 $764,865 $1,022,599 $391,268 275
2010s $699,599 $883,795 $1,064,975 $460,079 234
2020s $767,047 $945,132 $1,118,499 $281,411 223
Side by Side
1990s $203,852 $236,558 $280,447 $66,920 91
2000s $308,415 $421,461 $546,126 $159,904 125
2010s $411,816 $494,230 $577,893 $134,171 80
2020s $564,458 $619,086 $693,334 $133,062 77
Condo
1990s $162,774 $227,967 $287,185 $91,829 738
2000s $268,421 $387,712 $504,221 $202,637 1,128
2010s $335,870 $440,207 $542,033 $210,058 867
2020s $329,957 $447,400 $590,000 $204,304 837
Source: Arlington County Data API was used for this purpose, but it is not endorsed or certified by Arlington County.
1 Inflation adjusted to 2022.