Visualising Race Results

    By: Alfie Chadwick
    Date: February 17, 2026

    As an amateur age group triathlete – and someone who’s a bit obsessed with data – I often find myself scouring the results section of my races to see how I went compared to my peers and, importantly, how far off I am from doing better.

    So I wanted to take a quick look to see if I could make some pretty graphs from a sprint triathlon I did last weekend.

    Data

    I can grab the data off the race website, pulling the individual results for every athlete. This is super slow and inefficient since I need to work around the anti-scraping rules, so I set it up to cache locally. I’m also only grabbing results from my age group to keep it small.

    Code
    import requests
    from bs4 import BeautifulSoup
    import re
    import pandas as pd
    import time, random, os, json
    
    DATA_FILE = "race_results_cache.json"
    
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36",
        "Accept-Language": "en-AU,en;q=0.9",
        "Referer": "https://www.multisportaustralia.com.au/"
    }
    
    session = requests.Session()
    session.headers.update(HEADERS)
    
    def safe_get(url, retries=5):
        for i in range(retries):
            r = session.get(url, timeout=20)
            if r.status_code == 429:
                time.sleep(10 + i * 5)
                continue
            r.raise_for_status()
            return r
        raise Exception(f"429 block not resolved: {url}")
    
    
    def extract_tables_to_dfs(link):
        response = safe_get(link)
        soup = BeautifulSoup(response.text, "html.parser")
    
        tables = soup.find_all("table")
        dfs = []
    
        for table in tables:
            rows = table.find_all("tr")
    
            table_data = []
            max_cols = 0
    
            for row in rows:
                cells = row.find_all(["th", "td"])
                row_data = [cell.get_text(strip=True) for cell in cells]
                max_cols = max(max_cols, len(row_data))
                table_data.append(row_data)
    
            if not table_data:
                continue
    
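            # pad ragged rows so every row has max_cols cells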
            normalized_data = [
                row + [""] * (max_cols - len(row)) for row in table_data
            ]
    
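            # treat the first row as a header only when every cell is non-empty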
            if all(cell != "" for cell in normalized_data[0]):
                header = normalized_data[0]
                data = normalized_data[1:]
            else:
                header = [f"col_{i}" for i in range(max_cols)]
                data = normalized_data
    
            df = pd.DataFrame(data, columns=header)
            dfs.append(df)
    
        return dfs
    
    
    def scrape_and_cache_json():
        url = 'https://www.multisportaustralia.com.au/races/2xu-triathlon-series-2526-race-4-sandringham-2026/events/3/category/Male/16'
        path = '/races/2xu-triathlon-series-2526-race-4-sandringham-2026/events/3/results/individuals/'
        base_url = 'https://www.multisportaustralia.com.au'
    
        response = safe_get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
    
        links = []
        pattern = re.compile(r'^' + re.escape(path) + r'\d+$')
    
        for a in soup.find_all('a', href=True):
            if pattern.match(a['href']):
                links.append(base_url + a['href'])
    
        results_json = {}
    
        for link in links:
            time.sleep(random.uniform(2.5, 6.0))
    
            athlete_id = link.split('/')[-1]
            dfs = extract_tables_to_dfs(link)
    
        race_dfs = [
            df[["Location", "Race Time"]]
            for df in dfs
            if "Location" in df.columns and "Race Time" in df.columns
        ]
    
            if not race_dfs:
                continue
    
            full_df = pd.concat(race_dfs, ignore_index=True)
    
            # ---- DataFrame -> JSON ----
            race_json = full_df.to_dict(orient="records")
    
            results_json[f"athlete_id_{athlete_id}"] = race_json
    
        if not results_json:
            raise Exception("No data scraped")
    
        with open(DATA_FILE, "w", encoding="utf-8") as f:
            json.dump(results_json, f, indent=2, ensure_ascii=False)
    
        return results_json
    
    
    # ---------- ENTRY POINT ----------
    
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            results = json.load(f)
    else:
        results = scrape_and_cache_json()
    
    print(f'{len(results)} results loaded')
    40 results loaded

    And if we want to see my result:

    Code
    bib_number = 735
    
    result = results[f'athlete_id_{bib_number}']
    
    
    print("| Location     | Race Time |")
    print("|--------------|-----------|")
    for entry in result:
        print(f"| {entry['Location']:<12} | {entry['Race Time']} |")
    | Location     | Race Time |
    |--------------|-----------|
    | Start        | 00:00:00 |
    | Swim         | 00:15:51 |
    | T1           | 00:20:04 |
    | Bike 10.1km  | 00:36:31 |
    | Cycle        | 00:56:35 |
    | T2           | 00:58:48 |
    | Run 2.3km    | 01:10:11 |
    | Run          | 01:23:23 |
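
    Note that the ‘Race Time’ column is the cumulative elapsed time at each checkpoint, not the duration of each leg – my T1, for example, took 20:04 - 15:51 = 4:13. As a minimal sketch (hms_to_seconds is a hypothetical helper here, doing the same job as the to_seconds function defined further down), here’s one way to turn the checkpoints into per-leg durations:

    Code
    # The 'Race Time' values are cumulative, so each leg's duration is the
    # difference between consecutive checkpoints.
    def hms_to_seconds(t):  # hypothetical helper; same idea as to_seconds below
        h, m, s = map(int, t.split(":"))
        return h * 3600 + m * 60 + s
    
    cumulative = [hms_to_seconds(entry["Race Time"]) for entry in result]
    for i in range(1, len(result)):
        leg = cumulative[i] - cumulative[i - 1]
        print(f"{result[i]['Location']:<12} {leg // 60}:{leg % 60:02d}")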

    Visualisations

    Let’s start by looking at the average result.

    Code
    from datetime import timedelta
    from collections import defaultdict
    
    def to_seconds(t):
        if not t or not isinstance(t, str):
            return None
        t = t.strip()
        parts = t.split(":")
        
        # handle malformed times safely
        if len(parts) != 3:
            return None
        
        try:
            h, m, s = map(int, parts)
            return h*3600 + m*60 + s
        except ValueError:
            return None
    
    def to_hms(seconds):
        return str(timedelta(seconds=int(seconds)))
    
    location_times = defaultdict(list)
    
    # collect times per location
    for athlete, splits in results.items():
        for entry in splits:
            location = entry.get("Location")
            time_str = entry.get("Race Time")
            time_sec = to_seconds(time_str)
            
            if time_sec is not None:   # only keep valid times
                location_times[location].append(time_sec)
    
    # compute averages
    average_times = {}
    for location, times in location_times.items():
        if times:  # avoid division by zero
            avg_sec = sum(times) / len(times)
            average_times[location] = to_hms(avg_sec)
    
    ave_result = [
        {"Location": location, "Race Time": race_time}
        for location, race_time in average_times.items()
    ]
    
    print("| Location     | Race Time |")
    print("|--------------|-----------|")
    for entry in ave_result:
        print(f"| {entry['Location']:<12} | {entry['Race Time']} |")
    | Location     | Race Time |
    |--------------|-----------|
    | Start        | 0:00:00 |
    | Swim         | 0:16:39 |
    | T1           | 0:22:53 |
    | Bike 10.1km  | 0:39:16 |
    | Cycle        | 1:01:38 |
    | T2           | 1:04:01 |
    | Run 2.3km    | 1:14:25 |
    | Run          | 1:27:47 |
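
    To put a number on the gap, here’s a quick sketch comparing my cumulative checkpoint times against those averages (reusing result, average_times, and to_seconds from the cells above):

    Code
    # Gap between my cumulative time and the field average at each checkpoint
    # (negative means I'm ahead of the average)
    for entry in result:
        loc = entry["Location"]
        mine = to_seconds(entry["Race Time"])
        avg = to_seconds(average_times.get(loc))
        if mine is None or avg is None:
            continue
        diff = mine - avg
        sign = "-" if diff < 0 else "+"
        print(f"{loc:<12} {sign}{abs(diff) // 60}:{abs(diff) % 60:02d}")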

    And I’m happy to say, I’m slightly above average!!! Massive win, we can end here. Besides this great news, I’m also a bit interested in how people are splitting their time between the disciplines. I’ve always seen myself as an average swimmer, a good cyclist, a poor runner, and good in the transitions, so let’s see if that plays out in the data.

    We can first look at this through the cumulative race times, with my time in red:

    Code
    from datetime import timedelta
    
    import matplotlib.pyplot as plt
    import catppuccin
    import numpy as np
    
    import matplotlib as mpl
    mpl.style.use(catppuccin.PALETTE.macchiato.identifier)
    
    
    # robust conversion: HH:MM:SS → minutes
    def to_mins(t):
        if not t or not isinstance(t, str):
            return None
        t = t.strip()
        parts = t.split(":")
        if len(parts) != 3:
            return None
        try:
            h, m, s = map(int, parts)
            return (h*3600 + m*60 + s) / 60
        except ValueError:
            return None
    
    # target athlete
    target_id = f"athlete_id_{bib_number}"
    
    # X-axis order (from first athlete)
    locations = [entry["Location"] for entry in next(iter(results.values()))]
    
    plt.figure()
    
    for athlete_id, splits in results.items():
        # convert to dict for convenience
        loc_time_dict = {entry["Location"]: to_mins(entry["Race Time"]) for entry in splits}
        
        # skip incomplete athletes
        if any(loc not in loc_time_dict or loc_time_dict[loc] is None for loc in locations):
            continue
    
        times_min = [loc_time_dict[loc] for loc in locations]
    
        # highlight target athlete
        if athlete_id == target_id:
            plt.plot(locations, times_min, marker='o', linewidth=3, color='red')
        else:
            plt.plot(locations, times_min, marker='o', linewidth=1.5)
    
    # formatting
    plt.xlabel("Location")
    plt.ylabel("Cumulative Race Time (minutes)")
    plt.title("Cumulative Race Times by Athlete")
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    This doesn’t tell us much, so maybe we can look at the normalised times compared to me:

    Code
    target_id = f"athlete_id_{bib_number}"
    target_splits = results.get(target_id)
    
    if not target_splits:
        raise ValueError(f"Athlete {target_id} not found")
    
    # convert target athlete times to seconds
    target_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in target_splits}
    if any(loc not in target_dict or target_dict[loc] is None for loc in locations):
        raise ValueError(f"Target athlete {target_id} does not have complete data")
    
    target_times = [target_dict[loc] for loc in locations]
    
    plt.figure()
    
    for athlete_id, splits in results.items():
        # convert athlete times to seconds
        loc_time_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in splits}
    
        # skip athletes with missing times
        if any(loc not in loc_time_dict or loc_time_dict[loc] is None for loc in locations):
            continue
    
        times_sec = [loc_time_dict[loc] for loc in locations]
    
        # normalize: subtract target athlete's times
        times_norm = [(t - t_target)/60 for t, t_target in zip(times_sec, target_times)]  # minutes
    
        plt.plot(locations, times_norm, marker='o', label=athlete_id,
                 linewidth=1.5 if athlete_id != target_id else 2.5,
                 color='red' if athlete_id == target_id else None)
    
    plt.axhline(0, color='black', linestyle='--', alpha=0.7)  # baseline at target
    
    plt.xlabel("Location")
    plt.ylabel("Time Difference vs Athlete {} (minutes)".format(bib_number))
    plt.title("Normalized Cumulative Race Times")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

    This is better, as we can see a lot of curves where I’m faster than the athlete on the bike before they gain on me during the run, and similarly times when athletes get off the bike and take off out of T2.

    Something I think could be useful is looking at how I ranked in each section, because that is what it means to be ‘average’, after all.

    Code
    # Split names (exclude 'Start')
    locations = [entry["Location"] for entry in next(iter(results.values())) if entry["Location"] != "Start"]
    
    # Compute split times for each athlete, **only keep complete athletes**
    athlete_splits = {}
    for athlete_id, splits in results.items():
        times_sec = [to_seconds(entry["Race Time"]) for entry in splits]
        
        # skip incomplete or malformed data
        if None in times_sec or len(times_sec) != len(splits):
            continue
        
        # compute split times (diff of consecutive times)
        splits_sec = [times_sec[i] - times_sec[i-1] for i in range(1, len(times_sec))]
        
        # skip athletes without full set of splits
        if len(splits_sec) != len(locations):
            continue
    
        athlete_splits[athlete_id] = splits_sec
    
    # Convert to numpy array for ranking
    all_splits = np.array([splits for splits in athlete_splits.values()])  # shape: athletes x splits
    
    # compute ranks per split (1 = fastest)
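    # (double argsort trick: the inner argsort finds each column's sort order and
    # the outer argsort inverts it, giving every athlete's position in that order)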
    ranks = np.argsort(np.argsort(all_splits, axis=0), axis=0) + 1
    
    athlete_ids = list(athlete_splits.keys())
    
    # plot rank per split
    plt.figure()
    for i, athlete_id in enumerate(athlete_ids):
        plt.plot(locations, ranks[i], marker='o', label=athlete_id,
                 linewidth=2.5 if athlete_id == target_id else 1.5,
                 color='red' if athlete_id == target_id else None)
    
    plt.gca().invert_yaxis()  # rank 1 at top
    plt.xlabel("Split")
    plt.ylabel("Rank (1 = fastest)")
    plt.title("Athlete Split Rank Comparison")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    And here we can see that, as I expected, I’m an average swimmer, a good cyclist, and a bad runner, though not as bad as I thought I was (I was getting passed a lot). My T1 performance is crazy to me, but maybe I’m just good at putting on shoes real quick.

    Because the chart is noisy, I’ll remove T1 and T2 to see if there are any cool athletes that just killed it in one discipline.

    Code
    # Split names (exclude 'Start', 'T1', 'T2')
    locations = [
        entry["Location"] 
        for entry in next(iter(results.values())) 
        if entry["Location"] not in ("Start", "T1", "T2")
    ]
    
    # Compute split times for each athlete, only keep complete athletes
    athlete_splits = {}
    for athlete_id, splits in results.items():
        times_sec = [to_seconds(entry["Race Time"]) for entry in splits]
        
        # skip incomplete or malformed data
        if None in times_sec or len(times_sec) != len(splits):
            continue
        
        # compute split times (diff of consecutive times)
        splits_sec = [times_sec[i] - times_sec[i-1] for i in range(1, len(times_sec))]
        
        # keep only splits corresponding to locations (exclude T1 and T2)
        split_names = [entry["Location"] for entry in splits][1:]  # first diff corresponds to index 1+
        filtered_splits = [t for t, loc in zip(splits_sec, split_names) if loc in locations]
        
        # skip athletes without full set of splits
        if len(filtered_splits) != len(locations):
            continue
    
        athlete_splits[athlete_id] = filtered_splits
    
    # Convert to numpy array for ranking
    all_splits = np.array([splits for splits in athlete_splits.values()])  # athletes x splits
    
    # compute ranks per split (1 = fastest)
    ranks = np.argsort(np.argsort(all_splits, axis=0), axis=0) + 1
    
    athlete_ids = list(athlete_splits.keys())
    
    # plot rank per split
    plt.figure()
    for i, athlete_id in enumerate(athlete_ids):
        plt.plot(
            locations,
            ranks[i],
            marker='o',
            linewidth=2.5 if athlete_id == target_id else 1.5,
            color='red' if athlete_id == target_id else None,
            label=athlete_id if athlete_id == target_id else None  # only label target
        )
    
    plt.gca().invert_yaxis()  # rank 1 at top
    plt.xlabel("Split")
    plt.ylabel("Rank (1 = fastest)")
    plt.title("Athlete Split Rank Comparison (T1 & T2 Excluded)")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    Finally, because this is a race, let’s see where I gained and lost positions:

    Code
    locations = [entry["Location"] for entry in next(iter(results.values()))]
    
    # Collect cumulative times, **only complete athletes**
    athlete_times = {}
    for athlete_id, splits in results.items():
        times_sec = [to_seconds(entry["Race Time"]) for entry in splits]
        if None in times_sec or len(times_sec) != len(locations):
            continue  # skip incomplete athletes
        athlete_times[athlete_id] = times_sec
    
    # Convert to numpy array
    athlete_ids = list(athlete_times.keys())
    all_times = np.array([athlete_times[aid] for aid in athlete_ids])  # now all rows same length
    
    # Compute ranks at each checkpoint (1 = fastest)
    ranks = np.argsort(np.argsort(all_times, axis=0), axis=0) + 1
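    # note: every athlete records 00:00:00 at Start, so the Start ranks (and the
    # first diff below) come down to arbitrary tie-breaking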
    
    # Target athlete index
    if target_id not in athlete_ids:
        raise ValueError(f"Target athlete {target_id} not in complete data")
    target_idx = athlete_ids.index(target_id)
    target_ranks = ranks[target_idx]
    
    # Compute overtakes (change in position vs previous checkpoint)
    overtakes = np.diff(target_ranks)  # +ve = lost, -ve = gained
    
    # Plot overtakes per checkpoint
    checkpoint_labels = locations[1:]  # diff reduces one element
    plt.figure()
    plt.bar(checkpoint_labels, -overtakes)  # invert so positive = positions gained
    plt.axhline(0, color='black', linestyle='--')
    plt.ylabel("Positions Gained (+) / Lost (-)")
    plt.title(f"Positions Gained/Lost per Checkpoint for Athlete {bib_number}")
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    So after coming out of the water middle of the pack, I gained a heap of spots in T1 and then just held on for the rest of the race? The data says so, but it didn’t feel like that.

    So let’s compare against athletes who finished around the same time as me (±5 minutes):

    Code
    # split locations (exclude T1/T2 if desired)
    locations = [entry["Location"] for entry in target_splits]
    
    # target athlete cumulative times
    target_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in target_splits}
    if any(loc not in target_dict or target_dict[loc] is None for loc in locations):
        raise ValueError(f"Target athlete {target_id} does not have complete data")
    target_times = [target_dict[loc] for loc in locations]
    target_final_time = target_times[-1]
    
    plt.figure()
    
    for athlete_id, splits in results.items():
        # skip target athlete (we’ll plot separately)
        if athlete_id == target_id:
            continue
    
        # convert athlete cumulative times
        loc_time_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in splits}
        if any(loc not in loc_time_dict or loc_time_dict[loc] is None for loc in locations):
            continue
    
        times_sec = [loc_time_dict[loc] for loc in locations]
    
        # filter: only within ±5 minutes of target
        if abs(times_sec[-1] - target_final_time) > 5*60:
            continue
    
        # normalize: difference vs target in minutes
        times_norm = [(t - t_target)/60 for t, t_target in zip(times_sec, target_times)]
    
        # plot with Catppuccin color
        plt.plot(locations, times_norm, marker='o', linewidth=1.5, alpha=0.8)
    
    # plot target athlete in red
    plt.plot(locations, [0]*len(locations), marker='o', color='red', linewidth=2.5, label=f"Target {bib_number}")
    
    plt.axhline(0, color='black', linestyle='--', alpha=0.7)  # baseline at target
    plt.xlabel("Location")
    plt.ylabel(f"Time Difference vs Athlete {bib_number} (minutes)")
    plt.title("Normalized Cumulative Race Times (±5 min from Target)")
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.legend()
    plt.tight_layout()
    plt.show()

    And this tells a different story. Against my closest competitors, I did in fact swim slightly below average, then had a ripper of a T1 and bike to gain places on them, before all of them bar one ran it in faster than me.