Code
library(dplyr)
library(tidyr)
library(httr2)
library(jsonlite)
library(readr)
library(purrr)
library(stringr)
library(lubridate)
library(glue)
library(scales)
library(gt)
library(blsR)
library(janitor)
library(webshot)
Matthew Harris
May 29, 2022
The gt
package is great for creating clean and informative tables. In this post I’ll being using more data from the Arlington Country Open Data Portal to create my own table.
Using a custom ggplot2 color palette that I created based on the Death Stranding color scheme.
Creating a function to create calls to the Arlington County Open Data API.
# The order by argument is necessary to ensure that duplicate rows aren't
# introduced when using pagination
arl_odp <- function(resource, ..., skip = 0, top = 10, orderby = NULL, f_select = NULL, r_filter = NULL) {
params <- list(
skip = skip,
top = top,
orderby = orderby,
select = f_select,
filter = r_filter
) %>%
set_names(nm = paste0("$", names(.)))
arl_resp <- request("https://datahub-v2.arlingtonva.us/api/") %>%
req_url_path_append(resource) %>%
req_url_query(!!!params) %>%
req_perform() %>%
resp_body_string() %>%
fromJSON()
if(is.data.frame(arl_resp)) {
return(arl_resp %>% clean_names())
} else {
return()
}
}
Retrieving the property and sale history records.
Using the blsR
package to create calls to the Bureau of Labor Statistics API.
Uses the property data to filter only for property classes and sale types that could be associated with non-commercial sales. I’m also performing some categorization of property classes using case_when()
and additional filtering to make the data nice and tidy.
resid_sales <- sale_hist_df %>%
select(
real_estate_property_code,
sale_amt, sale_date,
sales_type_code,
sales_type_dsc,
property_street_nbr_name_text
) %>%
inner_join(
property_df,
by = "real_estate_property_code"
) %>%
filter(
property_class_type_code %in% c(511, 512, 514, 513, 616, 612, 613),
property_zip_code != 22101,
sales_type_code %in% c("1", "L", "E", "3", NA),
sale_amt > 0,
!is.na(sale_amt)
) %>%
mutate(
sale_year = year(sale_date),
sale_decade = sale_year - sale_year %% 10,
sale_decade_str = glue("{sale_decade}s"),
prop_class_clean = case_when(
property_class_type_code == "511" ~ "Single Family Home",
property_class_type_code %in% c("612", "613", "616") ~ "Condo",
property_class_type_code == "514" ~ "Side by Side",
property_class_type_code %in% c("512", "513") ~ "Townhouse"
)
) %>%
filter(
sale_year >= 1970,
sale_amt > 0,
!is.na(sale_amt)
) %>%
left_join(
cpi_avg_annual,
by = c("sale_year" = "year")
) %>%
mutate(sale_amt_inf_adj = inf_rate * sale_amt)
prop_class_sales_year <- resid_sales %>%
group_by(sale_year, prop_class_clean) %>%
summarize(
`25th` = quantile(sale_amt_inf_adj, 0.25),
`50th` = median(sale_amt_inf_adj),
`75th` = quantile(sale_amt_inf_adj, 0.75),
.groups = "drop"
)
p1 <- prop_class_sales_year %>%
filter(sale_year >= 2000) %>%
pivot_longer(cols = matches("\\d"), names_to = "metric") %>%
ggplot(aes(sale_year, value, col = metric)) +
geom_line(size = 1, alpha = 0.8) +
scale_y_continuous(labels = comma, breaks = breaks_extended(4)) +
scale_x_continuous(breaks = breaks_width(5)) +
scale_color_ds_d() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45)) +
facet_wrap(~prop_class_clean, scale = "free") +
labs(x = "Year", y = "Sale Amount", col = "Percentile")
p1
prop_class_sales_dec <- resid_sales %>%
group_by(sale_decade, sale_decade_str, prop_class_clean) %>%
summarize(
`25th` = quantile(sale_amt_inf_adj, 0.25),
`50th` = median(sale_amt_inf_adj),
`75th` = quantile(sale_amt_inf_adj, 0.75),
sale_sd = sd(sale_amt_inf_adj, na.rm = TRUE),
avg_sale_cnt_per_year = n() / n_distinct(sale_year),
.groups = "drop"
)
prop_50_order <- prop_class_sales_dec %>%
filter(sale_decade == 2020) %>%
select(prop_class_clean, `50th`) %>%
arrange(desc(`50th`)) %>%
pull(prop_class_clean)
housing_gt <- prop_class_sales_dec %>%
filter(sale_decade %in% c(1990, 2000, 2010, 2020)) %>%
select(-sale_decade) %>%
gt(rowname_col = "sale_decade_str", groupname_col = "prop_class_clean") %>%
row_group_order(groups = prop_50_order) %>%
tab_header(
title = md("**Arlington County Real Estate Sale Metrics**"),
subtitle = "By Property Class and Decade"
) %>%
tab_style(
style = list(cell_text(color = "#265BB2", size = "x-large")),
locations = cells_title("title")
) %>%
tab_spanner(
label = md("**Percentiles**"),
columns = matches("\\d")
) %>%
cols_label(
avg_sale_cnt_per_year = md("**Avg # of <br>Sales/Year**"),
sale_sd = md("**Std Dev**")
) %>%
fmt_currency(
columns = c(matches("\\d"), sale_sd),
decimals = 0
) %>%
fmt_number(
columns = avg_sale_cnt_per_year,
decimals = 0
) %>%
# Fill cell color based on value
# data_color(
# columns = `50th`,
# colors = scales::col_numeric(
# palette = ds_palettes[[2]],
# domain = NULL #c(50, 1500)
# )
# ) %>%
tab_options(
row_group.font.weight = "bold",
stub.border.width = 15,
row_group.padding.horizontal = 2
) %>%
tab_source_note(source_note = "Source: Arlington County Data API was used
for this purpose, but it is not endorsed
or certified by Arlington County.") %>%
tab_footnote(
footnote = "Inflation adjusted to 2022.",
locations = cells_title(groups = "title")
)
housing_gt
Arlington County Real Estate Sale Metrics1 | |||||
---|---|---|---|---|---|
By Property Class and Decade | |||||
Percentiles | Std Dev | Avg # of Sales/Year |
|||
25th | 50th | 75th | |||
Single Family Home | |||||
1990s | $362,489 | $437,128 | $557,023 | $199,135 | 1,184 |
2000s | $605,185 | $788,944 | $998,183 | $375,107 | 1,304 |
2010s | $781,718 | $932,970 | $1,158,460 | $389,199 | 1,042 |
2020s | $927,177 | $1,105,702 | $1,450,000 | $449,643 | 902 |
Townhouse | |||||
1990s | $347,182 | $453,732 | $594,645 | $214,455 | 220 |
2000s | $568,958 | $764,865 | $1,022,599 | $391,268 | 275 |
2010s | $699,599 | $883,795 | $1,064,975 | $460,079 | 234 |
2020s | $767,047 | $945,132 | $1,118,499 | $281,411 | 223 |
Side by Side | |||||
1990s | $203,852 | $236,558 | $280,447 | $66,920 | 91 |
2000s | $308,415 | $421,461 | $546,126 | $159,904 | 125 |
2010s | $411,816 | $494,230 | $577,893 | $134,171 | 80 |
2020s | $564,458 | $619,086 | $693,334 | $133,062 | 77 |
Condo | |||||
1990s | $162,774 | $227,967 | $287,185 | $91,829 | 738 |
2000s | $268,421 | $387,712 | $504,221 | $202,637 | 1,128 |
2010s | $335,870 | $440,207 | $542,033 | $210,058 | 867 |
2020s | $329,957 | $447,400 | $590,000 | $204,304 | 837 |
Source: Arlington County Data API was used for this purpose, but it is not endorsed or certified by Arlington County. | |||||
1 Inflation adjusted to 2022. |