Visualising Race Results

    By: Alfie Chadwick
    Date: February 17, 2026

    As an amateur age group triathlete – and someone who’s a bit obsessed with data – I often find myself scouring the results section of my races to see how I went compared to my peers and, importantly, how far off I am from doing better.

    So I wanted to take a quick look to see if I could make some pretty graphs from a sprint triathlon I did last weekend.

    Data

    I can grab the data off the race website, pulling the individual results for every athlete. This is super slow and inefficient since I need to work around the anti-scraping rules, so I set it up to cache locally. I’m also only grabbing results from my age group to keep it small.

    Code
    import requests
    from bs4 import BeautifulSoup
    import re
    import pandas as pd
    import time, random, os, json
    
    DATA_FILE = "race_results_cache.json"
    
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36",
        "Accept-Language": "en-AU,en;q=0.9",
        "Referer": "https://www.multisportaustralia.com.au/"
    }
    
    session = requests.Session()
    session.headers.update(HEADERS)
    
    def safe_get(url, retries=5):
        for i in range(retries):
            r = session.get(url, timeout=20)
            if r.status_code == 429:
                time.sleep(10 + i * 5)
                continue
            r.raise_for_status()
            return r
        raise Exception(f"429 block not resolved: {url}")
    
    
    def extract_tables_to_dfs(link):
        response = safe_get(link)
        soup = BeautifulSoup(response.text, "html.parser")
    
        tables = soup.find_all("table")
        dfs = []
    
        for table in tables:
            rows = table.find_all("tr")
    
            table_data = []
            max_cols = 0
    
            for row in rows:
                cells = row.find_all(["th", "td"])
                row_data = [cell.get_text(strip=True) for cell in cells]
                max_cols = max(max_cols, len(row_data))
                table_data.append(row_data)
    
            if not table_data:
                continue
    
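            # pad ragged rows so every row has max_cols cells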
            normalized_data = [
                row + [""] * (max_cols - len(row)) for row in table_data
            ]
    
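            # treat the first row as a header only when every cell is non-empty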
            if all(cell != "" for cell in normalized_data[0]):
                header = normalized_data[0]
                data = normalized_data[1:]
            else:
                header = [f"col_{i}" for i in range(max_cols)]
                data = normalized_data
    
            df = pd.DataFrame(data, columns=header)
            dfs.append(df)
    
        return dfs
    
    
    def scrape_and_cache_json():
        url = 'https://www.multisportaustralia.com.au/races/2xu-triathlon-series-2526-race-4-sandringham-2026/events/3/category/Male/16'
        path = '/races/2xu-triathlon-series-2526-race-4-sandringham-2026/events/3/results/individuals/'
        base_url = 'https://www.multisportaustralia.com.au'
    
        response = safe_get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
    
        links = []
        pattern = re.compile(r'^' + re.escape(path) + r'\d+$')
    
        for a in soup.find_all('a', href=True):
            if pattern.match(a['href']):
                links.append(base_url + a['href'])
    
        results_json = {}
    
        for link in links:
            time.sleep(random.uniform(2.5, 6.0))
    
            athlete_id = link.split('/')[-1]
            dfs = extract_tables_to_dfs(link)
    
        race_dfs = [
            df[["Location", "Race Time"]]
            for df in dfs
            if "Location" in df.columns and "Race Time" in df.columns
        ]
    
            if not race_dfs:
                continue
    
            full_df = pd.concat(race_dfs, ignore_index=True)
    
            # ---- DataFrame -> JSON ----
            race_json = full_df.to_dict(orient="records")
    
            results_json[f"athlete_id_{athlete_id}"] = race_json
    
        if not results_json:
            raise Exception("No data scraped")
    
        with open(DATA_FILE, "w", encoding="utf-8") as f:
            json.dump(results_json, f, indent=2, ensure_ascii=False)
    
        return results_json
    
    
    # ---------- ENTRY POINT ----------
    
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            results = json.load(f)
    else:
        results = scrape_and_cache_json()
    
    print(f'{len(results)} results loaded')
    40 results loaded

    And if we want to see my result:

    Code
    bib_number = 735
    
    result = results[f'athlete_id_{bib_number}']
    
    
    print("| Location     | Race Time |")
    print("|--------------|-----------|")
    for entry in result:
        print(f"| {entry['Location']:<12} | {entry['Race Time']} |")
    | Location     | Race Time |
    |--------------|-----------|
    | Start        | 00:00:00 |
    | Swim         | 00:15:51 |
    | T1           | 00:20:04 |
    | Bike 10.1km  | 00:36:31 |
    | Cycle        | 00:56:35 |
    | T2           | 00:58:48 |
    | Run 2.3km    | 01:10:11 |
    | Run          | 01:23:23 |
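
    Note that the ‘Race Time’ column is the cumulative elapsed time at each checkpoint, not the duration of each leg – my T1, for example, took 20:04 - 15:51 = 4:13. As a minimal sketch (hms_to_seconds is a hypothetical helper here, doing the same job as the to_seconds function defined further down), here’s one way to turn the checkpoints into per-leg durations:

    Code
    # The 'Race Time' values are cumulative, so each leg's duration is the
    # difference between consecutive checkpoints.
    def hms_to_seconds(t):  # hypothetical helper; same idea as to_seconds below
        h, m, s = map(int, t.split(":"))
        return h * 3600 + m * 60 + s
    
    cumulative = [hms_to_seconds(entry["Race Time"]) for entry in result]
    for i in range(1, len(result)):
        leg = cumulative[i] - cumulative[i - 1]
        print(f"{result[i]['Location']:<12} {leg // 60}:{leg % 60:02d}")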

    Visualisations

    Let’s start by looking at the average result.

    Code
    from datetime import timedelta
    from collections import defaultdict
    
    def to_seconds(t):
        if not t or not isinstance(t, str):
            return None
        t = t.strip()
        parts = t.split(":")
        
        # handle malformed times safely
        if len(parts) != 3:
            return None
        
        try:
            h, m, s = map(int, parts)
            return h*3600 + m*60 + s
        except ValueError:
            return None
    
    def to_hms(seconds):
        return str(timedelta(seconds=int(seconds)))
    
    location_times = defaultdict(list)
    
    # collect times per location
    for athlete, splits in results.items():
        for entry in splits:
            location = entry.get("Location")
            time_str = entry.get("Race Time")
            time_sec = to_seconds(time_str)
            
            if time_sec is not None:   # only keep valid times
                location_times[location].append(time_sec)
    
    # compute averages
    average_times = {}
    for location, times in location_times.items():
        if times:  # avoid division by zero
            avg_sec = sum(times) / len(times)
            average_times[location] = to_hms(avg_sec)
    
    ave_result = [
        {"Location": location, "Race Time": race_time}
        for location, race_time in average_times.items()
    ]
    
    print("| Location     | Race Time |")
    print("|--------------|-----------|")
    for entry in ave_result:
        print(f"| {entry['Location']:<12} | {entry['Race Time']} |")
    | Location     | Race Time |
    |--------------|-----------|
    | Start        | 0:00:00 |
    | Swim         | 0:16:39 |
    | T1           | 0:22:53 |
    | Bike 10.1km  | 0:39:16 |
    | Cycle        | 1:01:38 |
    | T2           | 1:04:01 |
    | Run 2.3km    | 1:14:25 |
    | Run          | 1:27:47 |
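
    To put a number on the gap, here’s a quick sketch comparing my cumulative checkpoint times against those averages (reusing result, average_times, and to_seconds from the cells above):

    Code
    # Gap between my cumulative time and the field average at each checkpoint
    # (negative means I'm ahead of the average)
    for entry in result:
        loc = entry["Location"]
        mine = to_seconds(entry["Race Time"])
        avg = to_seconds(average_times.get(loc))
        if mine is None or avg is None:
            continue
        diff = mine - avg
        sign = "-" if diff < 0 else "+"
        print(f"{loc:<12} {sign}{abs(diff) // 60}:{abs(diff) % 60:02d}")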

    And I’m happy to say, I’m slightly above average!!! Massive win, we can end here. Besides this great news, I’m also a bit interested in how people are splitting their time between the disciplines. I’ve always seen myself as an average swimmer, a good cyclist, a poor runner, and good in the transitions, so let’s see if that plays out in the data.

    We can first look at this through the cumulative race times, with my time in red:

    Code
    from datetime import timedelta
    
    import matplotlib.pyplot as plt
    import catppuccin
    import numpy as np
    
    import matplotlib as mpl
    mpl.style.use(catppuccin.PALETTE.macchiato.identifier)
    
    
    # robust conversion: HH:MM:SS → minutes
    def to_mins(t):
        if not t or not isinstance(t, str):
            return None
        t = t.strip()
        parts = t.split(":")
        if len(parts) != 3:
            return None
        try:
            h, m, s = map(int, parts)
            return (h*3600 + m*60 + s) / 60
        except ValueError:
            return None
    
    # target athlete
    target_id = f"athlete_id_{bib_number}"
    
    # X-axis order (from first athlete)
    locations = [entry["Location"] for entry in next(iter(results.values()))]
    
    plt.figure()
    
    for athlete_id, splits in results.items():
        # convert to dict for convenience
        loc_time_dict = {entry["Location"]: to_mins(entry["Race Time"]) for entry in splits}
        
        # skip incomplete athletes
        if any(loc not in loc_time_dict or loc_time_dict[loc] is None for loc in locations):
            continue
    
        times_min = [loc_time_dict[loc] for loc in locations]
    
        # highlight target athlete
        if athlete_id == target_id:
            plt.plot(locations, times_min, marker='o', linewidth=3, color='red')
        else:
            plt.plot(locations, times_min, marker='o', linewidth=1.5)
    
    # formatting
    plt.xlabel("Location")
    plt.ylabel("Cumulative Race Time (minutes)")
    plt.title("Cumulative Race Times by Athlete")
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    This doesn’t tell us much, so maybe we can look at the normalised times compared to me:

    Code
    target_id = f"athlete_id_{bib_number}"
    target_splits = results.get(target_id)
    
    if not target_splits:
        raise ValueError(f"Athlete {target_id} not found")
    
    # convert target athlete times to seconds
    target_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in target_splits}
    if any(loc not in target_dict or target_dict[loc] is None for loc in locations):
        raise ValueError(f"Target athlete {target_id} does not have complete data")
    
    target_times = [target_dict[loc] for loc in locations]
    
    plt.figure()
    
    for athlete_id, splits in results.items():
        # convert athlete times to seconds
        loc_time_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in splits}
    
        # skip athletes with missing times
        if any(loc not in loc_time_dict or loc_time_dict[loc] is None for loc in locations):
            continue
    
        times_sec = [loc_time_dict[loc] for loc in locations]
    
        # normalize: subtract target athlete's times
        times_norm = [(t - t_target)/60 for t, t_target in zip(times_sec, target_times)]  # minutes
    
        plt.plot(locations, times_norm, marker='o', label=athlete_id,
                 linewidth=1.5 if athlete_id != target_id else 2.5,
                 color='red' if athlete_id == target_id else None)
    
    plt.axhline(0, color='black', linestyle='--', alpha=0.7)  # baseline at target
    
    plt.xlabel("Location")
    plt.ylabel("Time Difference vs Athlete {} (minutes)".format(bib_number))
    plt.title("Normalized Cumulative Race Times")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

    This is better, as we can see a lot of curves where I’m faster than the athlete on the bike before they gain on me during the run, and similarly times when athletes get off the bike and take off out of T2.

    Something I think could be useful is looking at how I ranked in each section, because that is what it means to be ‘average’, after all.

    Code
    # Split names (exclude 'Start')
    locations = [entry["Location"] for entry in next(iter(results.values())) if entry["Location"] != "Start"]
    
    # Compute split times for each athlete, **only keep complete athletes**
    athlete_splits = {}
    for athlete_id, splits in results.items():
        times_sec = [to_seconds(entry["Race Time"]) for entry in splits]
        
        # skip incomplete or malformed data
        if None in times_sec or len(times_sec) != len(splits):
            continue
        
        # compute split times (diff of consecutive times)
        splits_sec = [times_sec[i] - times_sec[i-1] for i in range(1, len(times_sec))]
        
        # skip athletes without full set of splits
        if len(splits_sec) != len(locations):
            continue
    
        athlete_splits[athlete_id] = splits_sec
    
    # Convert to numpy array for ranking
    all_splits = np.array([splits for splits in athlete_splits.values()])  # shape: athletes x splits
    
    # compute ranks per split (1 = fastest)
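    # (double argsort trick: the inner argsort finds each column's sort order and
    # the outer argsort inverts it, giving every athlete's position in that order)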
    ranks = np.argsort(np.argsort(all_splits, axis=0), axis=0) + 1
    
    athlete_ids = list(athlete_splits.keys())
    
    # plot rank per split
    plt.figure()
    for i, athlete_id in enumerate(athlete_ids):
        plt.plot(locations, ranks[i], marker='o', label=athlete_id,
                 linewidth=2.5 if athlete_id == target_id else 1.5,
                 color='red' if athlete_id == target_id else None)
    
    plt.gca().invert_yaxis()  # rank 1 at top
    plt.xlabel("Split")
    plt.ylabel("Rank (1 = fastest)")
    plt.title("Athlete Split Rank Comparison")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    And here we can see that, as I expected, I’m an average swimmer, a good cyclist, and a bad runner, though not as bad as I thought I was (I was getting passed a lot). My T1 performance is crazy to me, but maybe I’m just good at putting on shoes real quick.

    Because the chart is noisy, I’ll remove T1 and T2 to see if there are any cool athletes that just killed it in one discipline.

    Code
    # Split names (exclude 'Start', 'T1', 'T2')
    locations = [
        entry["Location"] 
        for entry in next(iter(results.values())) 
        if entry["Location"] not in ("Start", "T1", "T2")
    ]
    
    # Compute split times for each athlete, only keep complete athletes
    athlete_splits = {}
    for athlete_id, splits in results.items():
        times_sec = [to_seconds(entry["Race Time"]) for entry in splits]
        
        # skip incomplete or malformed data
        if None in times_sec or len(times_sec) != len(splits):
            continue
        
        # compute split times (diff of consecutive times)
        splits_sec = [times_sec[i] - times_sec[i-1] for i in range(1, len(times_sec))]
        
        # keep only splits corresponding to locations (exclude T1 and T2)
        split_names = [entry["Location"] for entry in splits][1:]  # first diff corresponds to index 1+
        filtered_splits = [t for t, loc in zip(splits_sec, split_names) if loc in locations]
        
        # skip athletes without full set of splits
        if len(filtered_splits) != len(locations):
            continue
    
        athlete_splits[athlete_id] = filtered_splits
    
    # Convert to numpy array for ranking
    all_splits = np.array([splits for splits in athlete_splits.values()])  # athletes x splits
    
    # compute ranks per split (1 = fastest)
    ranks = np.argsort(np.argsort(all_splits, axis=0), axis=0) + 1
    
    athlete_ids = list(athlete_splits.keys())
    
    # plot rank per split
    plt.figure()
    for i, athlete_id in enumerate(athlete_ids):
        plt.plot(
            locations,
            ranks[i],
            marker='o',
            linewidth=2.5 if athlete_id == target_id else 1.5,
            color='red' if athlete_id == target_id else None,
            label=athlete_id if athlete_id == target_id else None  # only label target
        )
    
    plt.gca().invert_yaxis()  # rank 1 at top
    plt.xlabel("Split")
    plt.ylabel("Rank (1 = fastest)")
    plt.title("Athlete Split Rank Comparison (T1 & T2 Excluded)")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    Finally, because this is a race, let’s see where I gained and lost positions:

    Code
    locations = [entry["Location"] for entry in next(iter(results.values()))]
    
    # Collect cumulative times, **only complete athletes**
    athlete_times = {}
    for athlete_id, splits in results.items():
        times_sec = [to_seconds(entry["Race Time"]) for entry in splits]
        if None in times_sec or len(times_sec) != len(locations):
            continue  # skip incomplete athletes
        athlete_times[athlete_id] = times_sec
    
    # Convert to numpy array
    athlete_ids = list(athlete_times.keys())
    all_times = np.array([athlete_times[aid] for aid in athlete_ids])  # now all rows same length
    
    # Compute ranks at each checkpoint (1 = fastest)
    ranks = np.argsort(np.argsort(all_times, axis=0), axis=0) + 1
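    # note: every athlete records 00:00:00 at Start, so the Start ranks (and the
    # first diff below) come down to arbitrary tie-breaking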
    
    # Target athlete index
    if target_id not in athlete_ids:
        raise ValueError(f"Target athlete {target_id} not in complete data")
    target_idx = athlete_ids.index(target_id)
    target_ranks = ranks[target_idx]
    
    # Compute overtakes (change in position vs previous checkpoint)
    overtakes = np.diff(target_ranks)  # +ve = lost, -ve = gained
    
    # Plot overtakes per checkpoint
    checkpoint_labels = locations[1:]  # diff reduces one element
    plt.figure()
    plt.bar(checkpoint_labels, -overtakes)  # invert so positive = positions gained
    plt.axhline(0, color='black', linestyle='--')
    plt.ylabel("Positions Gained (+) / Lost (-)")
    plt.title(f"Positions Gained/Lost per Checkpoint for Athlete {bib_number}")
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    So after coming out of the water middle of the pack, I gained a heap of spots in T1 and then just held on for the rest of the race? The data says so, but it didn’t feel like that.

    So let’s compare against athletes who finished around the same time as me (±5 minutes):

    Code
    # split locations (exclude T1/T2 if desired)
    locations = [entry["Location"] for entry in target_splits]
    
    # target athlete cumulative times
    target_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in target_splits}
    if any(loc not in target_dict or target_dict[loc] is None for loc in locations):
        raise ValueError(f"Target athlete {target_id} does not have complete data")
    target_times = [target_dict[loc] for loc in locations]
    target_final_time = target_times[-1]
    
    plt.figure()
    
    for athlete_id, splits in results.items():
        # skip target athlete (we’ll plot separately)
        if athlete_id == target_id:
            continue
    
        # convert athlete cumulative times
        loc_time_dict = {entry["Location"]: to_seconds(entry["Race Time"]) for entry in splits}
        if any(loc not in loc_time_dict or loc_time_dict[loc] is None for loc in locations):
            continue
    
        times_sec = [loc_time_dict[loc] for loc in locations]
    
        # filter: only within ±5 minutes of target
        if abs(times_sec[-1] - target_final_time) > 5*60:
            continue
    
        # normalize: difference vs target in minutes
        times_norm = [(t - t_target)/60 for t, t_target in zip(times_sec, target_times)]
    
        # plot with Catppuccin color
        plt.plot(locations, times_norm, marker='o', linewidth=1.5, alpha=0.8)
    
    # plot target athlete in red
    plt.plot(locations, [0]*len(locations), marker='o', color='red', linewidth=2.5, label=f"Target {bib_number}")
    
    plt.axhline(0, color='black', linestyle='--', alpha=0.7)  # baseline at target
    plt.xlabel("Location")
    plt.ylabel(f"Time Difference vs Athlete {bib_number} (minutes)")
    plt.title("Normalized Cumulative Race Times (±5 min from Target)")
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.legend()
    plt.tight_layout()
    plt.show()

    And this tells a different story. Against my closest competitors, I did in fact swim slightly below average, then had a ripper of a T1 and bike to gain places on them, before all of them bar one ran it in faster than me.