Packages

Code

library(dplyr)
library(tidyr)
library(ggplot2)
library(xml2)
library(lubridate)
library(janitor)
library(scales)
library(ggsci)

XML to RDS

The XML file format is great for storing data, but I want to extract the information I need and transform it into separate data frames. This can be accomplished easily by creating a function that finds every node with a given name and collects the attributes of those nodes into a data frame.

Code

#' Save the attributes of every matching XML node as an RDS data frame
#'
#' Finds all nodes named `attr_select` anywhere in `xml_file`, turns each
#' node's attributes into one row (one column per attribute name), and
#' writes the result to `<out_path>/<attr_select>.RDS`.
#'
#' @param xml_file A parsed XML document, e.g. from `xml2::read_xml()`.
#' @param attr_select Name of the nodes to extract (a single string).
#' @param out_path Directory that receives the `.RDS` file (a single string).
#' @return The path of the file written, invisibly.
xml_to_rds <- function(xml_file, attr_select, out_path) {
  stopifnot(
    is.character(attr_select), length(attr_select) == 1L,
    is.character(out_path), length(out_path) == 1L
  )

  nodes <- xml2::xml_find_all(xml_file, paste0("//", attr_select))

  # xml_attrs() is vectorised over a node set: it returns one named
  # character vector per node, so no explicit per-node map is needed.
  node_attrs <- xml2::xml_attrs(nodes)

  # as.list() turns each vector into a one-row record; bind_rows() stacks
  # the records and fills attributes missing from a node with NA.
  attr_table <- dplyr::bind_rows(lapply(node_attrs, as.list))

  rds_file <- file.path(out_path, paste0(attr_select, ".RDS"))
  saveRDS(attr_table, file = rds_file)

  invisible(rds_file)
}

The top of the export.xml file contains some documentation on what elements are present within the file. I’ll focus primarily on the Workout nodes for this post.

Code

# Parse the Apple Health export, then save every Workout node's attributes
# to an RDS file in local_data.
apple_health_export <- xml2::read_xml("local_data/export.xml")

xml_to_rds(
  xml_file    = apple_health_export,
  attr_select = "Workout",
  out_path    = "local_data"
)

Data Wrangling

Great. So now I have a data frame that contains all of my workout data. Before I can begin any type of analysis, I’ll need to perform a couple of cleaning operations. I’ll start by cleaning up the column names.

Code

# Load the workout table that xml_to_rds() saved as
# <out_path>/<attr_select>.RDS, i.e. local_data/Workout.RDS.
workouts <- readRDS("local_data/Workout.RDS")

names(workouts)

 [1] "workoutActivityType"   "duration"              "durationUnit"         
 [4] "totalDistance"         "totalDistanceUnit"     "totalEnergyBurned"    
 [7] "totalEnergyBurnedUnit" "sourceName"            "sourceVersion"        
[10] "creationDate"          "startDate"             "endDate"              
[13] "device"

Code

# Convert the camelCase column names to snake_case, then list them.
workouts <- janitor::clean_names(workouts)

names(workouts)

 [1] "workout_activity_type"    "duration"                
 [3] "duration_unit"            "total_distance"          
 [5] "total_distance_unit"      "total_energy_burned"     
 [7] "total_energy_burned_unit" "source_name"             
 [9] "source_version"           "creation_date"           
[11] "start_date"               "end_date"                
[13] "device"

Next I want to make sure that each variable has the correct data type. I can inspect the current column type with glimpse().

Code

# Show the type and first values of every column except `device`.
glimpse(select(workouts, -device))

Rows: 159
Columns: 12
$ workout_activity_type    <chr> "HKWorkoutActivityTypeRunning", "HKWorkoutAct…
$ duration                 <chr> "30.19625363349915", "31.56661983331044", "38…
$ duration_unit            <chr> "min", "min", "min", "min", "min", "min", "mi…
$ total_distance           <chr> "3.02360045806866", "3.07297265789395", "3.13…
$ total_distance_unit      <chr> "mi", "mi", "mi", "mi", "mi", "mi", "mi", "mi…
$ total_energy_burned      <chr> "296.1570000000004", "294.944", "347.71600000…
$ total_energy_burned_unit <chr> "Cal", "Cal", "Cal", "Cal", "Cal", "Cal", "Ca…
$ source_name              <chr> "Matthew’s Apple Watch", "Matthew’s Apple Wat…
$ source_version           <chr> "3.1", "3.2.2", "3.2.2", "3.2.2", "3.2.2", "3…
$ creation_date            <chr> "2017-05-15 20:13:44 -0400", "2017-05-29 10:3…
$ start_date               <chr> "2017-05-15 19:43:28 -0400", "2017-05-29 10:0…
$ end_date                 <chr> "2017-05-15 20:13:40 -0400", "2017-05-29 10:3…

It looks as though all of the variables are currently stored as character strings. I can fix this using a combination of the mutate() and across() functions. I’m also adjusting the time to match my current time zone. Most of the data that I will be analyzing was captured during 2020, so I’m pretty sure I didn’t travel to any other time zones.

Code

# Every column arrives as character. Convert the date columns to date-times
# and the three measurement columns to doubles.
#
# Each timestamp looks like "2017-05-15 20:13:44 -0400". Dropping the trailing
# UTC offset before parsing keeps the wall-clock time recorded with the
# workout. Parsing the offset and then subtracting a fixed 18000 seconds
# (5 hours) is one hour off whenever the recorded offset is -0400.
# The resulting date-times are labelled UTC but hold local wall-clock time.
workouts_clean <- workouts %>%
  select(-device) %>%
  mutate(across(.cols = contains("date"),
                .fns = ~as_datetime(sub(" [+-][0-9]{4}$", "", .x))),
         across(.cols = c(duration, total_distance,
                          total_energy_burned),
                .fns = ~as.double(.x)))

workouts_clean %>%
  glimpse()

Rows: 159
Columns: 12
$ workout_activity_type    <chr> "HKWorkoutActivityTypeRunning", "HKWorkoutAct…
$ duration                 <dbl> 30.19625, 31.56662, 38.16131, 30.21184, 31.31…
$ duration_unit            <chr> "min", "min", "min", "min", "min", "min", "mi…
$ total_distance           <dbl> 3.023600, 3.072973, 3.137175, 3.098805, 2.941…
$ total_distance_unit      <chr> "mi", "mi", "mi", "mi", "mi", "mi", "mi", "mi…
$ total_energy_burned      <dbl> 296.1570, 294.9440, 347.7160, 287.5090, 272.5…
$ total_energy_burned_unit <chr> "Cal", "Cal", "Cal", "Cal", "Cal", "Cal", "Ca…
$ source_name              <chr> "Matthew’s Apple Watch", "Matthew’s Apple Wat…
$ source_version           <chr> "3.1", "3.2.2", "3.2.2", "3.2.2", "3.2.2", "3…
$ creation_date            <dttm> 2017-05-15 19:13:44, 2017-05-29 09:37:41, 20…
$ start_date               <dttm> 2017-05-15 18:43:28, 2017-05-29 09:06:03, 20…
$ end_date                 <dttm> 2017-05-15 19:13:40, 2017-05-29 09:37:37, 20…

Analysis and Visualization

On to the fun stuff. With my cleaned and formatted data, I can finally start to answer some questions. Let’s determine what workout activity I’ve done the most, on which days, and at what times.

Code

# Number of recorded workouts per activity type.
count(workouts_clean, workout_activity_type)

# A tibble: 2 × 2
  workout_activity_type            n
  <chr>                        <int>
1 HKWorkoutActivityTypeRunning   138
2 HKWorkoutActivityTypeWalking    21

Code

# Keep only the runs, then derive the weekday and hour at which each started.
workouts_clean <- workouts_clean %>%
  filter(workout_activity_type == "HKWorkoutActivityTypeRunning") %>%
  mutate(workout_day = weekdays(start_date),
         # Take the level names from weekdays() itself (2024-01-01 is a
         # Monday) so they match in any locale; hard-coded English names
         # turn every value into NA under a non-English LC_TIME.
         workout_day = factor(workout_day,
                              levels = weekdays(as.Date("2024-01-01") + 0:6)),
         workout_hour = hour(start_date))

# Bar chart: number of runs on each weekday.
ggplot(workouts_clean, aes(x = workout_day)) +
  geom_bar(fill = "#4789CC") +
  scale_y_continuous(breaks = breaks_width(5)) +
  labs(x = "Weekday", y = "Count") +
  theme_minimal(base_size = 14)

Code

# Bar chart: number of runs by the hour of day at which they started.
ggplot(workouts_clean, aes(x = workout_hour)) +
  geom_bar(fill = "#4789CC") +
  scale_x_continuous(breaks = breaks_width(2)) +
  labs(x = "Hour", y = "Count") +
  theme_minimal(base_size = 14)

So far I have 138 runs recorded. I’ve tried to stick to a running schedule of Monday, Wednesday, Friday, and Sunday. I also tend to go on most of my runs before 12:00 PM.

I’ve tried to use running as a good way to stay active during the pandemic and haven’t really focused on increasing my performance. It would still be interesting to see if I have either gotten faster or increased the distance that I’m running.

Code

# Pace (duration per mile) over time for runs of at least 1.5 miles created
# after 2020-01-01, coloured by distance, with a dashed linear trend line.
workouts_clean %>%
  filter(workout_activity_type == "HKWorkoutActivityTypeRunning") %>%
  mutate(
    min_per_mile = duration(duration / total_distance, "minute"),
    creation_date = date(creation_date)
  ) %>%
  filter(creation_date > "2020-01-01") %>%
  filter(total_distance >= 1.5) %>%
  ggplot(aes(x = creation_date,
             y = min_per_mile,
             colour = total_distance)) +
  # Points are drawn first so the trend line sits on top of them.
  geom_point(size = 3.5, alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE,
              colour = "black", linetype = 2) +
  scale_x_date(breaks = breaks_width("2 month"),
               date_labels = "%b %Y") +
  scale_y_time(labels = time_format("%M:%S"),
               breaks = breaks_width(30)) +
  scale_colour_gradient(low = "blue", high = "red",
                        breaks = seq(1.5, 4, 0.5)) +
  labs(x = "Date", y = "Avg. Minutes / Mile",
       colour = "Miles") +
  theme_minimal(base_size = 14)

`geom_smooth()` using formula 'y ~ x'