SQL Analysis with {duckdb}

Author

Matthew Harris

Published

December 29, 2022

Abstract
Revisiting an old analysis using {duckdb}.
Code
library(readr)
library(dplyr)
library(tidyr)
library(duckdb)
library(ggplot2)
library(scales)
library(purrr)
library(glue)
library(here)
library(stringr)
library(lubridate)
library(ggsci)
library(ggmap)

Intro

I love using R but more often then not I also need to use SQL when working in a professional capacity. This post will demonstrate some of the features of {duckdb}. I’ll be revisiting an analysis that I completed a couple of years ago on Formula 1 drivers.

Code
con_dd <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")

Loading the Data

Formula 1 World Championship (1950 - 2022) Data

I’m choosing to import the csv files into R first using the read_csv() function before loading all of the data into {duckdb}. The read_csv() function tends to provide better data type inference than the {duckdb} option.

Code
#Creates a vector of the csv file names
proj_files <- dir(path = here("posts", "revisiting-f1", "local_data"), pattern = "*.csv")

#Maps over the files names to load each csv file into a combined list
proj_df <- proj_files |>
  map(~read_csv(file = here("posts", "revisiting-f1", "local_data", .x), na = c("", "NA", "\\N")))


names(proj_df) <- str_remove(proj_files, ".csv")

#Loads the list components to the global environment
list2env(proj_df, .GlobalEnv)
<environment: R_GlobalEnv>

Next I’ll load the data frames into the {duckdb} database.

Code
map(.x = str_remove(proj_files, ".csv"),
    .f = ~duckdb::duckdb_register(con_dd, .x, get(.x)
                                 ))

SQL Table Creation

Joining table and computing simply summary metrics in SQL has always been easy. I’ve decided to complete this process in SQL and create a new table that can be referenced throughout the rest of the analysis.

Code
drop table if exists f1_summary;
Code
create table f1_summary as

with milli_fl_summ as
(
select
    raceId
    , driverId
    , milliseconds as fl_time
    , position as fl_position
  , row_number() over(partition by raceId, driverId order by milliseconds) as l_time 
from 
  lap_times
qualify 
  l_time = 1
),

pit_stop_summ as
(
select
    raceId
    , driverId
    , sum(stop) as total_pit_stops
    , sum(milliseconds) as total_ps_duration
from
    pit_stops
group by
    1,2
)

select
    r.resultId as result_id
  , r.raceId as race_id
  , r.driverId as driver_id
  , r.constructorId as constructor_id
  , r.grid
  , r.position
  , r.positionOrder as position_order
  , r.fastestLapSpeed as fl_speed
  , r.milliseconds as f_time
  , d.forename
  , d.surname
  , d.dob
  , d.nationality as driver_nationality
  , rc.date as race_date
  , rc.name as race_name
  , rc.time as race_time
  , cr.name as constructor_name
  , cr.nationality as constructor_nationality
  , ci.name as circuit_name
  , ci.location as circuit_location
  , ci.country as circuit_country
  , ci.lat
  , ci.lng
    , s.status
    , fl.fl_time
  , fl.fl_position
  , pt.total_pit_stops
  , pt.total_ps_duration
from
    results as r
left join
    drivers as d
    on d.driverId = r.driverId
left join
    races as rc
    on rc.raceId = r.raceId
left join
    constructors as cr
    on cr.constructorId = r.constructorId
left join
    circuits as ci
    on ci.circuitId = rc.circuitId
left join
    status as s
    on s.statusId = r.statusId
left join
    milli_fl_summ as fl
    on fl.raceId = r.raceId
    and fl.driverId = r.driverId
left join
    pit_stop_summ as pt
    on pt.raceId = r.raceId
    and pt.driverId = r.driverId;

Analysis

Now that I have a tidy table to use for analysis I can start digging into the data to answer some interesting questions. The goal is to complete as much of the analysis as I can in SQL and pull the summarized data into an R data frame. This is a common practice for me professionally as I’m often tasked with analyzing data that can contain hundreds of millions of rows across multiple tables.

What were the top 3 constructors by wins?

Code
podiums_by_decade <- dbGetQuery(con_dd,
'           
select
  *
  , row_number() over(partition by race_decade order by total_podiums desc) as podium_rank_decade
from
(
select
  constructor_name
  , year(race_date) - year(race_date) % 10 as race_decade
  , sum(case when position <= 3 then 1 else 0 end) as total_podiums
from
  f1_summary
group by
  1,2
)
qualify 
  podium_rank_decade <= 3
')

podiums_by_decade |>
  filter(race_decade >= 1960) |>
  ggplot(aes(constructor_name, total_podiums, fill = factor(race_decade))) +
  geom_col(position = position_dodge2(width = 0.8, preserve = "single")) + 
  scale_y_continuous(limits = c(0, 210)) +
  scale_fill_npg() +
  facet_wrap(~race_decade, scales = "free_x") +
  geom_text(aes(label = total_podiums, group = constructor_name, y = total_podiums), position = position_dodge(width = 0), vjust = -0.1, size = 3, angle = 0) +
  labs(x = "", y = "", title = "Top 3 Constructors by Decade and Podium Counts") +
  theme_light() +
  theme(legend.position = "none", 
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())

Where have races occurred across the world?

Code
races_by_loc <- dbGetQuery(con_dd,
'
select
  circuit_name
  , lat
  , lng
  , count(*) as race_cnt
from
  f1_summary
group by
  1,2,3
')

world_data <- map_data("world") %>%
    filter(region != "Antarctica")

ggplot() +
  geom_map(data = world_data, map = world_data,
           aes(x = long, y = lat, map_id = region),
           fill = "#a8a8a8", color = "#ffffff", size = 0.3) +
  geom_point(data = races_by_loc, aes(x = lng, y = lat, color = race_cnt),
             alpha = 0.75, size = 1.3) +
  scale_color_gradient(low = pal_npg()(5)[4],
                       high = pal_npg()(8)[8]) +
  labs(size = "Races", title = "Race Locations", color = "Races") +
  theme_void() +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank(),
        plot.title = element_text(hjust = 0.5),
        legend.position = "bottom") +
  coord_fixed()

Top 5 Drivers of All Time (Wins)

Code
select
  forename
  , surname
  , count(*) as total_races
  , sum(case when position = 1 then 1 else 0 end) as total_wins
  , sum(case when position <= 3 then 1 else 0 end) as total_podiums
  , round(sum(case when position = 1 then 1 else 0 end)::float / count(*)::float, 2) as win_perc
  , round(sum(case when position <= 3 then 1 else 0 end)::float / count(*)::float, 2) as podium_perc
  , count(distinct year(race_date)) as career_length
from
  f1_summary
group by
  1,2
order by
  total_wins desc
limit 
  5
5 records
forename surname total_races total_wins total_podiums win_perc podium_perc career_length
Lewis Hamilton 310 103 191 0.33 0.62 16
Michael Schumacher 308 91 155 0.30 0.50 19
Sebastian Vettel 300 53 122 0.18 0.41 16
Alain Prost 202 51 106 0.25 0.52 13
Ayrton Senna 162 41 80 0.25 0.49 11

How well did Lewis Hamilton do in the last race that he won?

This analysis is based on data as of 2022-11-20. The charts below demonstrate the time difference by lap for each of the next 4 drivers that finished after Mr. Hamilton. A positive difference means that a driver was behind Hamilton as they completed that lap and vice versa.

First I need to identify the race_id for Lewis Hamilton’s most recent win.

Code
select
  forename
  , surname
  , race_id
  , race_name
  , race_date
  , circuit_name
  , circuit_country
  , f_time
from
  f1_summary
where
  forename||surname = 'LewisHamilton'
  and position = 1
order by
  race_date desc
limit 
  1
1 records
forename surname race_id race_name race_date circuit_name circuit_country f_time
Lewis Hamilton 1072 Saudi Arabian Grand Prix 2021-12-05 Jeddah Corniche Circuit Saudi Arabia 7575118

Now I know which race to filter for when creating the query against the lap_times table.

Code
race_1072_laps <- dbGetQuery(con_dd,
'
with driver_info as 
(
select distinct
  f.forename
  , f.surname
  , f.race_name
  , f.race_date
  , f.driver_id
  , f.race_id
  , f.position as f_position
from
  f1_summary as f
)

select
  d.*
  , l.lap
  , l.milliseconds as l_time
from
  lap_times as l
left join
  driver_info as d
  on d.driver_id = l.driverId
  and d.race_id = l.raceId
where
  d.race_id = 1072
  and d.f_position <= 5
')

plt_race_name <- unique(race_1072_laps$race_name)
plt_race_date <- unique(race_1072_laps$race_date)

race_lap_diffs <- race_1072_laps |>
  mutate(driver_name = glue('{f_position}: {forename} {surname}')) |>
  group_by(driver_name) |>
  arrange(lap) |>
  mutate(cum_l_time = cumsum(l_time) / 1000) |>
  ungroup() |>
  group_by(lap) |>
  mutate(winner_cum_l_time = max(ifelse(f_position == 1, cum_l_time, 0), na.rm = TRUE), 
         cum_l_diff = cum_l_time - winner_cum_l_time) |>
  ungroup()

race_lap_diffs |>
  ggplot(aes(lap, cum_l_diff, col = driver_name, group = driver_name)) +
  geom_line(size = 0.8) +
  geom_hline(yintercept = 0, col = "black", linetype = 2) +
  geom_text(data = race_lap_diffs |> filter(lap == last(lap)), col = "black", size = 3,
            aes(label = glue('+{label_number(accuracy = 0.1)(cum_l_diff)}S'), x = lap - 7, y = cum_l_diff + 7)) +
  scale_y_continuous(labels = label_number(accuracy = 0.1), breaks = breaks_width(10)) +
  scale_color_npg() +
  facet_wrap(~driver_name) +
  theme_light() +
  theme(strip.text = element_text(color = "black")) +
  labs(x = "Lap", 
       y = "Cumulative Lap Time Diff", 
       color = "Driver", 
       title = "Lap Times: Winner vs 2nd to 5th Place", 
       subtitle = glue("{plt_race_name}: {plt_race_date}"))

Hamilton beat the second place driver by a little less than 7 seconds. What’s also interesting is that the driver that finished in 4th managed to have a lap ahead of Hamilton while the 3rd place driver didn’t.

Code
dbDisconnect(con_dd, shutdown = TRUE)