Code
library(readr)
library(dplyr)
library(tidyr)
library(duckdb)
library(ggplot2)
library(scales)
library(purrr)
library(glue)
library(here)
library(stringr)
library(lubridate)
library(ggsci)
library(ggmap)
Matthew Harris
December 29, 2022
I love using R but more often then not I also need to use SQL when working in a professional capacity. This post will demonstrate some of the features of {duckdb}. I’ll be revisiting an analysis that I completed a couple of years ago on Formula 1 drivers.
Formula 1 World Championship (1950 - 2022) Data
I’m choosing to import the csv files into R first using the read_csv()
function before loading all of the data into {duckdb}. The read_csv()
function tends to provide better data type inference than the {duckdb} option.
#Creates a vector of the csv file names
proj_files <- dir(path = here("posts", "revisiting-f1", "local_data"), pattern = "*.csv")
#Maps over the files names to load each csv file into a combined list
proj_df <- proj_files |>
map(~read_csv(file = here("posts", "revisiting-f1", "local_data", .x), na = c("", "NA", "\\N")))
names(proj_df) <- str_remove(proj_files, ".csv")
#Loads the list components to the global environment
list2env(proj_df, .GlobalEnv)
<environment: R_GlobalEnv>
Next I’ll load the data frames into the {duckdb} database.
Joining table and computing simply summary metrics in SQL has always been easy. I’ve decided to complete this process in SQL and create a new table that can be referenced throughout the rest of the analysis.
create table f1_summary as
with milli_fl_summ as
(
select
raceId
, driverId
, milliseconds as fl_time
, position as fl_position
, row_number() over(partition by raceId, driverId order by milliseconds) as l_time
from
lap_times
qualify
l_time = 1
),
pit_stop_summ as
(
select
raceId
, driverId
, sum(stop) as total_pit_stops
, sum(milliseconds) as total_ps_duration
from
pit_stops
group by
1,2
)
select
r.resultId as result_id
, r.raceId as race_id
, r.driverId as driver_id
, r.constructorId as constructor_id
, r.grid
, r.position
, r.positionOrder as position_order
, r.fastestLapSpeed as fl_speed
, r.milliseconds as f_time
, d.forename
, d.surname
, d.dob
, d.nationality as driver_nationality
, rc.date as race_date
, rc.name as race_name
, rc.time as race_time
, cr.name as constructor_name
, cr.nationality as constructor_nationality
, ci.name as circuit_name
, ci.location as circuit_location
, ci.country as circuit_country
, ci.lat
, ci.lng
, s.status
, fl.fl_time
, fl.fl_position
, pt.total_pit_stops
, pt.total_ps_duration
from
results as r
left join
drivers as d
on d.driverId = r.driverId
left join
races as rc
on rc.raceId = r.raceId
left join
constructors as cr
on cr.constructorId = r.constructorId
left join
circuits as ci
on ci.circuitId = rc.circuitId
left join
status as s
on s.statusId = r.statusId
left join
milli_fl_summ as fl
on fl.raceId = r.raceId
and fl.driverId = r.driverId
left join
pit_stop_summ as pt
on pt.raceId = r.raceId
and pt.driverId = r.driverId;
Now that I have a tidy table to use for analysis I can start digging into the data to answer some interesting questions. The goal is to complete as much of the analysis as I can in SQL and pull the summarized data into an R data frame. This is a common practice for me professionally as I’m often tasked with analyzing data that can contain hundreds of millions of rows across multiple tables.
podiums_by_decade <- dbGetQuery(con_dd,
'
select
*
, row_number() over(partition by race_decade order by total_podiums desc) as podium_rank_decade
from
(
select
constructor_name
, year(race_date) - year(race_date) % 10 as race_decade
, sum(case when position <= 3 then 1 else 0 end) as total_podiums
from
f1_summary
group by
1,2
)
qualify
podium_rank_decade <= 3
')
podiums_by_decade |>
filter(race_decade >= 1960) |>
ggplot(aes(constructor_name, total_podiums, fill = factor(race_decade))) +
geom_col(position = position_dodge2(width = 0.8, preserve = "single")) +
scale_y_continuous(limits = c(0, 210)) +
scale_fill_npg() +
facet_wrap(~race_decade, scales = "free_x") +
geom_text(aes(label = total_podiums, group = constructor_name, y = total_podiums), position = position_dodge(width = 0), vjust = -0.1, size = 3, angle = 0) +
labs(x = "", y = "", title = "Top 3 Constructors by Decade and Podium Counts") +
theme_light() +
theme(legend.position = "none",
axis.ticks.y = element_blank(),
axis.text.y = element_blank())
races_by_loc <- dbGetQuery(con_dd,
'
select
circuit_name
, lat
, lng
, count(*) as race_cnt
from
f1_summary
group by
1,2,3
')
world_data <- map_data("world") %>%
filter(region != "Antarctica")
ggplot() +
geom_map(data = world_data, map = world_data,
aes(x = long, y = lat, map_id = region),
fill = "#a8a8a8", color = "#ffffff", size = 0.3) +
geom_point(data = races_by_loc, aes(x = lng, y = lat, color = race_cnt),
alpha = 0.75, size = 1.3) +
scale_color_gradient(low = pal_npg()(5)[4],
high = pal_npg()(8)[8]) +
labs(size = "Races", title = "Race Locations", color = "Races") +
theme_void() +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
plot.title = element_text(hjust = 0.5),
legend.position = "bottom") +
coord_fixed()
select
forename
, surname
, count(*) as total_races
, sum(case when position = 1 then 1 else 0 end) as total_wins
, sum(case when position <= 3 then 1 else 0 end) as total_podiums
, round(sum(case when position = 1 then 1 else 0 end)::float / count(*)::float, 2) as win_perc
, round(sum(case when position <= 3 then 1 else 0 end)::float / count(*)::float, 2) as podium_perc
, count(distinct year(race_date)) as career_length
from
f1_summary
group by
1,2
order by
total_wins desc
limit
5
forename | surname | total_races | total_wins | total_podiums | win_perc | podium_perc | career_length |
---|---|---|---|---|---|---|---|
Lewis | Hamilton | 310 | 103 | 191 | 0.33 | 0.62 | 16 |
Michael | Schumacher | 308 | 91 | 155 | 0.30 | 0.50 | 19 |
Sebastian | Vettel | 300 | 53 | 122 | 0.18 | 0.41 | 16 |
Alain | Prost | 202 | 51 | 106 | 0.25 | 0.52 | 13 |
Ayrton | Senna | 162 | 41 | 80 | 0.25 | 0.49 | 11 |
This analysis is based on data as of 2022-11-20. The charts below demonstrate the time difference by lap for each of the next 4 drivers that finished after Mr. Hamilton. A positive difference means that a driver was behind Hamilton as they completed that lap and vice versa.
First I need to identify the race_id
for Lewis Hamilton’s most recent win.
forename | surname | race_id | race_name | race_date | circuit_name | circuit_country | f_time |
---|---|---|---|---|---|---|---|
Lewis | Hamilton | 1072 | Saudi Arabian Grand Prix | 2021-12-05 | Jeddah Corniche Circuit | Saudi Arabia | 7575118 |
Now I know which race to filter for when creating the query against the lap_times
table.
race_1072_laps <- dbGetQuery(con_dd,
'
with driver_info as
(
select distinct
f.forename
, f.surname
, f.race_name
, f.race_date
, f.driver_id
, f.race_id
, f.position as f_position
from
f1_summary as f
)
select
d.*
, l.lap
, l.milliseconds as l_time
from
lap_times as l
left join
driver_info as d
on d.driver_id = l.driverId
and d.race_id = l.raceId
where
d.race_id = 1072
and d.f_position <= 5
')
plt_race_name <- unique(race_1072_laps$race_name)
plt_race_date <- unique(race_1072_laps$race_date)
race_lap_diffs <- race_1072_laps |>
mutate(driver_name = glue('{f_position}: {forename} {surname}')) |>
group_by(driver_name) |>
arrange(lap) |>
mutate(cum_l_time = cumsum(l_time) / 1000) |>
ungroup() |>
group_by(lap) |>
mutate(winner_cum_l_time = max(ifelse(f_position == 1, cum_l_time, 0), na.rm = TRUE),
cum_l_diff = cum_l_time - winner_cum_l_time) |>
ungroup()
race_lap_diffs |>
ggplot(aes(lap, cum_l_diff, col = driver_name, group = driver_name)) +
geom_line(size = 0.8) +
geom_hline(yintercept = 0, col = "black", linetype = 2) +
geom_text(data = race_lap_diffs |> filter(lap == last(lap)), col = "black", size = 3,
aes(label = glue('+{label_number(accuracy = 0.1)(cum_l_diff)}S'), x = lap - 7, y = cum_l_diff + 7)) +
scale_y_continuous(labels = label_number(accuracy = 0.1), breaks = breaks_width(10)) +
scale_color_npg() +
facet_wrap(~driver_name) +
theme_light() +
theme(strip.text = element_text(color = "black")) +
labs(x = "Lap",
y = "Cumulative Lap Time Diff",
color = "Driver",
title = "Lap Times: Winner vs 2nd to 5th Place",
subtitle = glue("{plt_race_name}: {plt_race_date}"))
Hamilton beat the second place driver by a little less than 7 seconds. What’s also interesting is that the driver that finished in 4th managed to have a lap ahead of Hamilton while the 3rd place driver didn’t.