From 9d096729fae0d72eb165ef0660f60c78b8f1a600 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 18 Feb 2025 14:45:15 -0800
Subject: [PATCH 001/132] first commit

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 00000000..a272e6f3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# discord-cluster-manager
+# reference-kernels

From 0f27472dd5eb7a2443865badc2e462d3884deeb8 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 19 Feb 2025 17:39:29 -0800
Subject: [PATCH 002/132] Main Practice Problems

---
 README.md                                     |  25 +-
 problems/beta.yaml                            |  16 +
 problems/pmpp/conv2d_py/reference.py          |  62 ++++
 problems/pmpp/conv2d_py/task.py               |  18 ++
 problems/pmpp/conv2d_py/task.yml              |  31 ++
 problems/pmpp/eval.py                         | 274 ++++++++++++++++++
 problems/pmpp/grayscale_py/reference.py       |  44 +++
 problems/pmpp/grayscale_py/task.py            |   9 +
 problems/pmpp/grayscale_py/task.yml           |  33 +++
 problems/pmpp/histogram_py/reference.py       |  47 +++
 problems/pmpp/histogram_py/task.py            |  16 +
 problems/pmpp/histogram_py/task.yml           |  31 ++
 problems/pmpp/matmul_py/reference.py          |  26 ++
 problems/pmpp/matmul_py/submission.py         |   5 +
 problems/pmpp/matmul_py/task.py               |  11 +
 problems/pmpp/matmul_py/task.yml              |  36 +++
 problems/pmpp/prefixsum_py/reference.py       |  35 +++
 problems/pmpp/prefixsum_py/task.py            |   9 +
 problems/pmpp/prefixsum_py/task.yml           |  31 ++
 problems/pmpp/sort_py/reference.py            |  35 +++
 problems/pmpp/sort_py/submission.py           |  14 +
 problems/pmpp/sort_py/task.py                 |   9 +
 problems/pmpp/sort_py/task.yml                |  31 ++
 problems/pmpp/utils.py                        |  94 ++++++
 problems/pmpp/vectoradd_py/reference.py       |  38 +++
 .../vectoradd_py/submission_cuda_inline.py    | 122 ++++++++
 .../pmpp/vectoradd_py/submission_triton.py    |  38 +++
 problems/pmpp/vectoradd_py/task.py            |  11 +
 problems/pmpp/vectoradd_py/task.yml           |  31 ++
 problems/pmpp/vectorsum_py/reference.py       |  35 +++
 problems/pmpp/vectorsum_py/submission.py      |  57 ++++
 problems/pmpp/vectorsum_py/task.py            |   9 +
 problems/pmpp/vectorsum_py/task.yml           |  31 ++
 33 files changed, 1312 insertions(+), 2 deletions(-)
 create mode 100644 problems/beta.yaml
 create mode 100644 problems/pmpp/conv2d_py/reference.py
 create mode 100644 problems/pmpp/conv2d_py/task.py
 create mode 100644 problems/pmpp/conv2d_py/task.yml
 create mode 100644 problems/pmpp/eval.py
 create mode 100644 problems/pmpp/grayscale_py/reference.py
 create mode 100644 problems/pmpp/grayscale_py/task.py
 create mode 100644 problems/pmpp/grayscale_py/task.yml
 create mode 100644 problems/pmpp/histogram_py/reference.py
 create mode 100644 problems/pmpp/histogram_py/task.py
 create mode 100644 problems/pmpp/histogram_py/task.yml
 create mode 100644 problems/pmpp/matmul_py/reference.py
 create mode 100644 problems/pmpp/matmul_py/submission.py
 create mode 100644 problems/pmpp/matmul_py/task.py
 create mode 100644 problems/pmpp/matmul_py/task.yml
 create mode 100644 problems/pmpp/prefixsum_py/reference.py
 create mode 100644 problems/pmpp/prefixsum_py/task.py
 create mode 100644 problems/pmpp/prefixsum_py/task.yml
 create mode 100644 problems/pmpp/sort_py/reference.py
 create mode 100644 problems/pmpp/sort_py/submission.py
 create mode 100644 problems/pmpp/sort_py/task.py
 create mode 100644 problems/pmpp/sort_py/task.yml
 create mode 100644 problems/pmpp/utils.py
 create mode 100644 problems/pmpp/vectoradd_py/reference.py
 create mode 100644 problems/pmpp/vectoradd_py/submission_cuda_inline.py
 create mode 100644 problems/pmpp/vectoradd_py/submission_triton.py
 create mode 100644 problems/pmpp/vectoradd_py/task.py
 create mode 100644 problems/pmpp/vectoradd_py/task.yml
 create mode 100644 problems/pmpp/vectorsum_py/reference.py
 create mode 100644 problems/pmpp/vectorsum_py/submission.py
 create mode 100644 problems/pmpp/vectorsum_py/task.py
 create mode 100644 problems/pmpp/vectorsum_py/task.yml

diff --git a/README.md b/README.md
index a272e6f3..19d5463b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,23 @@
-# discord-cluster-manager
-# reference-kernels
+## Reference Kernels
+
+This repo holds reference kernels for the KernelBot which hosts regular competitions on discord.gg/gpumode
+
+## Competition
+1. PMPP practice problems: Starting on Sunday Feb 21, 2025
+2. LLM competition: Coming soon!
+
+## Making a submission
+
+Please take a look at `vectoradd_py` to see multiple examples of expected submissions ranging from PyTorch code to Triton to inline CUDA.
+
+
+## Contributing New Problems
+
+To add a new problem, create a new folder in the `problems/glory` directory where you need to add the following files:
+- `reference.py` - This is the PyTorch reference implementation of the problem.
+- `task.yml` - This is the problem specification that will be used to generate test cases for different shapes
+- `task.py` - Specifies the schema of the inputs and outputs for the problem
+
+
+
+
diff --git a/problems/beta.yaml b/problems/beta.yaml
new file mode 100644
index 00000000..67cad93c
--- /dev/null
+++ b/problems/beta.yaml
@@ -0,0 +1,16 @@
+name: PMPP Practice Problems
+# when does this end (individual problems might close earlier)
+deadline: ""
+# A description for this particular competition
+description: ""
+# the list of problems
+problems:
+  - directory: pmpp/conv2d_py
+    name: conv2d
+    deadline: "2025-12-31"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+    # Scoring rule/weights etc
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
new file mode 100644
index 00000000..0eb48e6f
--- /dev/null
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -0,0 +1,62 @@
+from utils import verbose_allclose
+import torch
+import torch.nn.functional as F
+from task import input_t, output_t, KernelSpec
+
+def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
+    """
+    Reference implementation of 2D convolution using PyTorch.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+        spec: Convolution specifications (stride, padding)
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+        stride=spec.stride,
+        padding=spec.padding
+    )
+
+def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed: int) -> input_t:
+    """
+    Generates random input and kernel tensors.
+    Returns:
+        Tuple of (input tensor, kernel tensor)
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate input tensor: [batch, in_channels, height, width]
+    input_tensor = torch.randn(
+        batch, channels, size, size,
+        device='cuda', 
+        dtype=torch.float32, 
+        generator=gen
+    ).contiguous()
+    
+    # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
+    # Here we use same number of output channels as input channels for simplicity
+    kernel = torch.randn(
+        channels, channels, kernel_size, kernel_size,
+        device='cuda',
+        dtype=torch.float32,
+        generator=gen
+    ).contiguous()
+    
+    return (input_tensor, kernel)
+
+def check_implementation(
+    data: input_t,
+    spec: KernelSpec,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data, spec)
+    reasons = verbose_allclose(output, expected, rtol=1e-3, atol=1e-3)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.py b/problems/pmpp/conv2d_py/task.py
new file mode 100644
index 00000000..6cce0e6e
--- /dev/null
+++ b/problems/pmpp/conv2d_py/task.py
@@ -0,0 +1,18 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+from dataclasses import dataclass
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+@dataclass
+class KernelSpec:
+    stride: int
+    padding: int
+
+class TestSpec(TypedDict):
+    size: int
+    kernel_size: int
+    channels: int
+    batch: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
new file mode 100644
index 00000000..5b0955ed
--- /dev/null
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -0,0 +1,31 @@
+# name: conv2d-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a 2D convolution kernel that matches the reference implementation.
+  The kernel should perform 2D convolution with the given specifications (stride and padding).
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 32, "kernel_size": 3, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernel_size": 5, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernel_size": 3, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernel_size": 5, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 1, "seed": 9173}
+
+benchmarks:
+  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernel_size": 5, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernel_size": 3, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernel_size": 5, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 512, "kernel_size": 3, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/eval.py b/problems/pmpp/eval.py
new file mode 100644
index 00000000..abdaee81
--- /dev/null
+++ b/problems/pmpp/eval.py
@@ -0,0 +1,274 @@
+import dataclasses
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from submission import custom_kernel
+from reference import check_implementation, generate_input
+
+WARMUP_RUNS = 10
+TIMED_RUNS = 100
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def get_test_cases(file_name: str) -> list[TestCase]:
+    """
+    Parse a test file holding one `key: value; key: value` case per line.
+    Keys are identifiers (underscores allowed, e.g. `kernel_size`); values are words or integers.
+    Exits with status 113 if the file cannot be read or a line is malformed.
+    """
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    pattern = re.compile(r"\s*([a-zA-Z_][a-zA-Z0-9_]*):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*")
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            matched = pattern.fullmatch(part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass  # non-numeric values stay as strings
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+    return tests
+
+
+def warm_up(test: TestCase):
+    data = generate_input(**test.args)
+    start = time.perf_counter()
+    while time.perf_counter() - start < 0.2:
+        custom_kernel(data)
+        torch.cuda.synchronize()
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A non-empty list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, sample standard deviation, standard error of the mean, best, and worst durations.
+    """
+    runs = len(durations)
+    if runs == 0:
+        raise ValueError("calculate_stats requires at least one duration")
+
+    avg = sum(durations) / runs
+    variance = sum((x - avg) ** 2 for x in durations)
+    # Bessel's correction needs two samples; a single run has no measurable spread.
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(min(durations)),
+                 worst=float(max(durations)))
+
+
+def run_testing(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+        submission_output = custom_kernel(data)
+        torch.cuda.synchronize()
+        error = check_implementation(data, submission_output)
+        if error:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", error)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    error = check_implementation(data, output)
+    if error:
+        return error
+
+    # now, do multiple timing runs without further correctness testing
+    # there is an upper bound of 100 runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until we either measure at least 10 full seconds,
+    # or the relative error of the mean is below 1%.
+
+    for i in range(max_repeats):
+        if recheck:
+            data = generate_input(**test.args)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel(data)
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            error = check_implementation(data, output)
+            if error:
+                return error
+
+        del output
+        durations.append(end-start)
+
+        if i > 1:
+            stats = calculate_stats(durations)
+            if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    warm_up(tests[0])
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = benchmark(test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def main():
+    """
+    Entry point for `eval.py <mode> <test-file>`; results are logged to the descriptor named by POPCORN_FD.
+
+    @return: Exit status: 0 on success, 111 if POPCORN_FD is unset, 2 for bad usage or an unsupported mode, 112 on a failed check.
+    """
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    tests = get_test_cases(sys.argv[2])
+
+    with PopcornOutput(int(fd)) as logger:
+        seed = os.getenv("POPCORN_SEED")
+        set_seed(int(seed) if seed else 42)
+        if mode == "test":
+            return run_testing(logger, tests)
+        if mode == "benchmark":
+            return run_benchmarking(logger, tests)
+        if mode != "leaderboard":
+            # TODO: Implement script and profile mode
+            return 2
+
+        warm_up(tests[0])
+        result = benchmark(tests[-1], True, 100, 30e9)
+        if not isinstance(result, Stats):
+            logger.log("test-count", 1)
+            logger.log("test.0.status", "fail")
+            logger.log("test.0.error", str(result))
+            logger.log("check", "fail")
+            return 112  # same failure status as the test and benchmark modes
+        logger.log("benchmark-count", 1)
+        logger.log("benchmark.0.spec", tests[-1].spec)
+        for name in ("runs", "mean", "std", "err"):
+            logger.log(f"benchmark.0.{name}", getattr(result, name))
+        logger.log("check", "pass")
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/pmpp/grayscale_py/reference.py b/problems/pmpp/grayscale_py/reference.py
new file mode 100644
index 00000000..264e733e
--- /dev/null
+++ b/problems/pmpp/grayscale_py/reference.py
@@ -0,0 +1,44 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of RGB to grayscale conversion using PyTorch.
+    Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
+    
+    Args:
+        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
+    Returns:
+        Grayscale tensor of shape (H, W) with values in [0, 1]
+    """
+    # Standard RGB to Grayscale coefficients
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random RGB image tensor of specified size.
+    Returns:
+        Tensor of shape (size, size, 3) with values in [0, 1]
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.rand(size, size, 3, 
+                     device='cuda', 
+                     dtype=torch.float32, 
+                     generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected, rtol=1e-4, atol=1e-4)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/grayscale_py/task.py b/problems/pmpp/grayscale_py/task.py
new file mode 100644
index 00000000..4a717fcc
--- /dev/null
+++ b/problems/pmpp/grayscale_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
+output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+
+class TestSpec(TypedDict):
+    size: int  # Size of the square image (H=W)
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
new file mode 100644
index 00000000..b14a81b5
--- /dev/null
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -0,0 +1,33 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}  # shared file in problems/pmpp/
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}  # shared file in problems/pmpp/
+
+lang: "py"
+
+description: |
+  Implement an RGB to grayscale conversion kernel that matches the reference implementation.
+  The kernel should convert RGB images to grayscale using the standard coefficients:
+  Y = 0.2989 R + 0.5870 G + 0.1140 B
+  
+  Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
+  Output: Grayscale tensor of shape (H, W) with values in [0, 1]
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
new file mode 100644
index 00000000..8fb766b8
--- /dev/null
+++ b/problems/pmpp/histogram_py/reference.py
@@ -0,0 +1,47 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t, HistogramSpec
+
+def ref_kernel(data: input_t, spec: HistogramSpec) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: Input tensor to compute histogram on
+        spec: Histogram specifications (num_bins, min_val, max_val)
+    Returns:
+        Tensor containing bin counts
+    """
+    # Clip values to range
+    clipped = torch.clamp(data, spec.min_val, spec.max_val)
+    
+    # Scale to bin indices
+    bin_width = (spec.max_val - spec.min_val) / spec.num_bins
+    indices = ((clipped - spec.min_val) / bin_width).long()
+    indices = torch.clamp(indices, 0, spec.num_bins - 1)
+    
+    # Count values in each bin
+    return torch.bincount(indices, minlength=spec.num_bins).to(torch.float32)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates a 1-D float32 CUDA tensor of `size` standard-normal samples (not limited to [0, 1]).
+    Returns:
+        Tensor to compute histogram on
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    # Generate values with normal distribution for interesting histograms
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    spec: HistogramSpec,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data, spec)
+    reasons = verbose_allclose(output, expected)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
new file mode 100644
index 00000000..e9d7fadf
--- /dev/null
+++ b/problems/pmpp/histogram_py/task.py
@@ -0,0 +1,16 @@
+from typing import TypedDict, TypeVar
+import torch
+from dataclasses import dataclass
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+@dataclass
+class HistogramSpec:
+    num_bins: int
+    min_val: float
+    max_val: float
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
new file mode 100644
index 00000000..a1bfeb31
--- /dev/null
+++ b/problems/pmpp/histogram_py/task.yml
@@ -0,0 +1,31 @@
+# name: histogram-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}  # shared file in problems/pmpp/
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}  # shared file in problems/pmpp/
+
+lang: "py"
+
+description: |
+  Implement a histogram kernel using CUDA inline function that matches the reference implementation.
+  The kernel should count the number of elements falling into each bin across the specified range.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/matmul_py/reference.py b/problems/pmpp/matmul_py/reference.py
new file mode 100644
index 00000000..76da5c6a
--- /dev/null
+++ b/problems/pmpp/matmul_py/reference.py
@@ -0,0 +1,26 @@
+import torch
+from task import input_t, output_t
+from utils import verbose_allclose
+
+def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    a = torch.empty(m, k, device='cuda', dtype=torch.float16)
+    a.uniform_(0, 1, generator=gen)
+    b = torch.empty(k, n, device='cuda', dtype=torch.float16)
+    b.uniform_(0, 1, generator=gen)
+    return (a, b)
+
+def ref_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
+def check_implementation(data: input_t, output: output_t) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+    if len(reasons) > 0:
+        # TODO better processing of reasons
+        return "mismatch found! custom implementation doesn't match reference.: " + reasons[0]
+
+    return ''
+
diff --git a/problems/pmpp/matmul_py/submission.py b/problems/pmpp/matmul_py/submission.py
new file mode 100644
index 00000000..97d17433
--- /dev/null
+++ b/problems/pmpp/matmul_py/submission.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
diff --git a/problems/pmpp/matmul_py/task.py b/problems/pmpp/matmul_py/task.py
new file mode 100644
index 00000000..1c72c782
--- /dev/null
+++ b/problems/pmpp/matmul_py/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    seed: int
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
new file mode 100644
index 00000000..9e19eb4f
--- /dev/null
+++ b/problems/pmpp/matmul_py/task.yml
@@ -0,0 +1,36 @@
+# name: matmul-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}  # shared file in problems/pmpp/
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}  # shared file in problems/pmpp/
+
+lang: "py"
+
+description: |
+  Implement a custom matmul function that matches the reference implementation.
+  The function should handle a tuple of input tensors and apply matmul
+
+config:
+  main: "eval.py"
+
+tests:
+  - {"m": 64, "n": 64, "k": 64, "seed": 53124}
+  - {"m": 128, "n": 128, "k": 128, "seed": 3321}
+  - {"m": 256, "n": 256, "k": 256, "seed": 1200}
+
+  - {"m": 32, "n": 512, "k": 32, "seed": 32523}
+  - {"m": 64, "n": 1024, "k": 64, "seed": 4327}
+
+benchmarks:
+  - {"m": 128, "n": 128, "k": 128, "seed": 43214}
+  - {"m": 256, "n": 256, "k": 256, "seed": 423011}
+  - {"m": 512, "n": 512, "k": 512, "seed": 123456}
+  - {"m": 1024, "n": 1024, "k": 1024, "seed": 1029}
+  - {"m": 2048, "n": 2048, "k": 2048, "seed": 75342}
+
+  - {"m": 1024, "n": 1536, "k": 1024, "seed": 321}
+  - {"m": 2048, "n": 3072, "k": 2048, "seed": 32412}
+  - {"m": 4096, "n": 5120, "k": 4096, "seed": 123456}
diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
new file mode 100644
index 00000000..bce90273
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data, dim=0)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor.
+    Returns:
+        Tensor to compute prefix sum on
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected, rtol=1e-5, atol=1e-5)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/task.py b/problems/pmpp/prefixsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
new file mode 100644
index 00000000..36cfab42
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -0,0 +1,31 @@
+# name: prefixsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}  # shared file in problems/pmpp/
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}  # shared file in problems/pmpp/
+
+lang: "py"
+
+description: |
+  Implement an inclusive prefix sum (scan) kernel using CUDA inline function that matches the reference implementation.
+  The kernel should compute the cumulative sum of all elements up to each position.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
new file mode 100644
index 00000000..c8d05f77
--- /dev/null
+++ b/problems/pmpp/sort_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Produce the reference result by sorting ``data`` with ``torch.sort``.
+    Args:
+        data: Tensor whose elements are to be sorted
+    Returns:
+        The sorted values; the indices torch.sort also returns are dropped
+    """
+    return torch.sort(data).values
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Draw ``size`` float32 values from ``torch.randn`` on the CUDA device,
+    using a CUDA generator seeded with ``seed``.
+    """
+    rng = torch.Generator(device='cuda')
+    rng.manual_seed(seed)
+    samples = torch.randn(size, device='cuda', dtype=torch.float32, generator=rng)
+    return samples.contiguous()
+
+def check_implementation(data: input_t, output: output_t) -> str:
+    """
+    Compare ``output`` against the reference sort of ``data``.
+    Returns:
+        '' when verbose_allclose reports no mismatch, otherwise an error
+        message that carries the first reported reason.
+    """
+    problems = verbose_allclose(output, ref_kernel(data))
+    if not problems:
+        return ''
+    return "mismatch found! custom implementation doesn't match reference: " + problems[0]
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/submission.py b/problems/pmpp/sort_py/submission.py
new file mode 100644
index 00000000..5a4915c9
--- /dev/null
+++ b/problems/pmpp/sort_py/submission.py
@@ -0,0 +1,14 @@
+import torch
+from task import input_t, output_t
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Sort ``data`` with ``torch.sort`` and keep only the sorted values.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data).values
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/task.py b/problems/pmpp/sort_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp/sort_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):  # keys mirror the {"size", "seed"} entries under tests/benchmarks in task.yml
+    size: int  # presumably forwarded to reference.generate_input(size, seed) - TODO confirm in eval.py
+    seed: int  # presumably the RNG seed forwarded alongside size - TODO confirm in eval.py
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
new file mode 100644
index 00000000..17c99107
--- /dev/null
+++ b/problems/pmpp/sort_py/task.yml
@@ -0,0 +1,31 @@
+# name: mergesort-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a sort kernel that matches the reference implementation.
+  The kernel should sort the input array in ascending order using the merge sort algorithm.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
new file mode 100644
index 00000000..6b715d51
--- /dev/null
+++ b/problems/pmpp/utils.py
@@ -0,0 +1,94 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    """Seed Python's `random`, NumPy and PyTorch (CPU and, when available, CUDA) RNGs."""
+    for seeder in (random.seed, np.random.seed, torch.manual_seed):
+        seeder(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Pick the torch device: CUDA first, then MPS, otherwise (or if not requested) CPU."""
+    if not use_cuda:
+        return torch.device("cpu")
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+def verbose_allclose(
+        tensor1: torch.Tensor,
+        tensor2: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Compare two tensors element-wise within a tolerance and describe every mismatch found.
+
+    An element is a mismatch when |tensor1 - tensor2| > atol + rtol * |tensor2|, or when
+    only one of the two values is NaN, +inf or -inf. Nothing is raised: the findings are
+    returned so that the caller decides how to report them.
+
+    Parameters:
+    tensor1 (torch.Tensor): First tensor to compare.
+    tensor2 (torch.Tensor): Second tensor to compare; the relative tolerance scales with it.
+    rtol (float): Relative tolerance.
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to describe individually.
+
+    Returns:
+    list[str]: Empty when the tensors match and ["SIZE MISMATCH"] when their shapes differ.
+        Otherwise the mismatch count, the mismatched indices, one "ERROR AT" line for each
+        of the first max_print mismatches and, if any were left out, a summary of the rest.
+    """
+    # Check if the shapes of the tensors match
+    if tensor1.shape != tensor2.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(tensor1 - tensor2)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(tensor2)
+
+    # Find tolerance mismatched elements (a NaN difference compares False; handled below)
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    # Count the number of mismatched elements; an empty list signals that all are close
+    num_mismatched = mismatched.sum().item()
+    if num_mismatched == 0:
+        return []
+
+    mismatched_indices = torch.nonzero(mismatched)
+    mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
+                        f"Mismatched elements: {mismatched_indices}"]
+
+    for index in mismatched_indices[:max_print]:
+        i = tuple(index.tolist())
+        mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+    if num_mismatched > max_print:
+        mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+    return mismatch_details
\ No newline at end of file
diff --git a/problems/pmpp/vectoradd_py/reference.py b/problems/pmpp/vectoradd_py/reference.py
new file mode 100644
index 00000000..06677423
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/reference.py
@@ -0,0 +1,38 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Add the two tensors in ``data`` with PyTorch to obtain the reference result.
+    Args:
+        data: Pair (A, B) of tensors to be added.
+    Returns:
+        The element-wise sum A + B.
+    """
+    lhs, rhs = data
+    return torch.add(lhs, rhs)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Build two random (size, size) float16 CUDA tensors from one generator seeded with ``seed``.
+    Returns:
+        Tuple (A, B) of tensors to be added; A is drawn before B.
+    """
+    rng = torch.Generator(device='cuda')
+    rng.manual_seed(seed)
+    A = torch.randn((size, size), device='cuda', dtype=torch.float16, generator=rng).contiguous()
+    B = torch.randn((size, size), device='cuda', dtype=torch.float16, generator=rng).contiguous()
+    return A, B
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    """Return '' if output matches ref_kernel(data), else a message with the first mismatch reason."""
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    return ''
diff --git a/problems/pmpp/vectoradd_py/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/submission_cuda_inline.py
new file mode 100644
index 00000000..abdd10f3
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/submission_cuda_inline.py
@@ -0,0 +1,94 @@
+import torch
+from torch.utils.cpp_extension import load_inline
+from typing import List
+from task import input_t, output_t
+
+add_cuda_source = """
+template <typename scalar_t>
+__global__ void add_kernel(const scalar_t* __restrict__ A, 
+                           const scalar_t* __restrict__ B, 
+                           scalar_t* __restrict__ C, 
+                           int N) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < N) {
+        C[idx] = A[idx] + B[idx];
+    }
+}
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+    TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
+    TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
+    
+    int N = A.numel();  
+    auto C = torch::empty_like(A); 
+
+    const int threads = 256; 
+    const int blocks = (N + threads - 1) / threads;  
+    
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(A.scalar_type(), "add_kernel", ([&] {
+        add_kernel<scalar_t><<<blocks, threads>>>(
+            A.data_ptr<scalar_t>(),
+            B.data_ptr<scalar_t>(),
+            C.data_ptr<scalar_t>(),
+            N
+        );
+    }));
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    return C;
+}
+"""
+
+add_cpp_source = """
+#include <torch/extension.h>
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B);
+"""
+
+
+# Built once, when this module is imported; custom_kernel reuses it on every call.
+add_module = load_inline(
+    name='add_cuda',
+    cpp_sources=add_cpp_source,
+    cuda_sources=add_cuda_source,
+    functions=['add_cuda'],
+    verbose=True,
+)
+
+def add(A, B):
+    """Run the compiled add_cuda extension on two CUDA tensors."""
+    if not A.is_cuda or not B.is_cuda:
+        raise RuntimeError("Both tensors must be on GPU")
+    return add_module.add_cuda(A, B)
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Custom implementation of vector addition using CUDA inline function.
+
+    Forwards to the `add_cuda` extension that `load_inline` builds once, when
+    this module is imported; nothing is compiled per call.
+
+    Args:
+        data: Pair of tensors (A, B) to be added; both must be float16 CUDA
+            tensors of the same shape.
+    Returns:
+        Tensor shaped like A containing the element-wise sums.
+    Raises:
+        AssertionError: if an input is not on the GPU, the shapes differ, or
+            an input is not float16.
+    """
+    A, B = data
+
+    assert A.is_cuda and B.is_cuda, "Input tensors must be on GPU"
+    assert A.shape == B.shape, "Input tensors must have the same shape"
+    assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
+
+    # add_kernel walks the raw data pointers linearly, so hand it densely packed
+    # buffers; contiguous() returns the tensor itself when it is already contiguous.
+    return add(A.contiguous(), B.contiguous())
diff --git a/problems/pmpp/vectoradd_py/submission_triton.py b/problems/pmpp/vectoradd_py/submission_triton.py
new file mode 100644
index 00000000..75cda2c9
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/submission_triton.py
@@ -0,0 +1,38 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+@triton.jit
+def add_kernel(
+    A_ptr, B_ptr, C_ptr, M, N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Each program handles one BLOCK_SIZE x BLOCK_SIZE tile; elements are addressed as row * N + col.
+    rows = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    cols = tl.program_id(1) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    # Linear offsets and the in-bounds mask are shared by both loads and the store.
+    offsets = rows[:, None] * N + cols[None, :]
+    in_bounds = (rows < M)[:, None] & (cols < N)[None, :]
+
+    lhs = tl.load(A_ptr + offsets, mask=in_bounds, other=0.0)
+    rhs = tl.load(B_ptr + offsets, mask=in_bounds, other=0.0)
+    tl.store(C_ptr + offsets, lhs + rhs, mask=in_bounds)
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Add the two 2-D tensors in ``data`` with the Triton ``add_kernel``.
+    Args:
+        data: Pair (A, B); the (M, N) shape is read from A.
+    Returns:
+        New tensor allocated like A, written by the kernel with A + B.
+    """
+    lhs, rhs = data
+    n_rows, n_cols = lhs.shape
+    result = torch.empty_like(lhs)
+    tile = 32  # one program per tile x tile patch; cdiv rounds the grid up to cover the edges
+    launch_grid = (triton.cdiv(n_rows, tile), triton.cdiv(n_cols, tile))
+    add_kernel[launch_grid](lhs, rhs, result, n_rows, n_cols, BLOCK_SIZE=tile)
+    return result
diff --git a/problems/pmpp/vectoradd_py/task.py b/problems/pmpp/vectoradd_py/task.py
new file mode 100644
index 00000000..0596f28f
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/task.py
@@ -0,0 +1,11 @@
+from typing import TypedDict, TypeVar
+import torch
+
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):  # keys mirror the {"size", "seed"} entries under tests/benchmarks in task.yml
+    size: int  # presumably forwarded to reference.generate_input(size, seed) - TODO confirm in eval.py
+    seed: int  # presumably the RNG seed forwarded alongside size - TODO confirm in eval.py
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
new file mode 100644
index 00000000..7e63a421
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -0,0 +1,31 @@
+# name: vectoradd-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a vector addition kernel, using an inline CUDA function, that matches the reference implementation.
+  The kernel should add pairs of tensors element-wise.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252}
diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
new file mode 100644
index 00000000..2b662f81
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reduce ``data`` with ``torch.sum`` to obtain the reference result.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    return torch.sum(data)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Draw ``size`` float32 values from ``torch.randn`` on the CUDA device,
+    using a CUDA generator seeded with ``seed``.
+    """
+    rng = torch.Generator(device='cuda')
+    rng.manual_seed(seed)
+    samples = torch.randn(size, device='cuda', dtype=torch.float32, generator=rng)
+    return samples.contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    """Return '' if output matches ref_kernel(data), else a message with the first mismatch reason."""
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    return ''
\ No newline at end of file
diff --git a/problems/pmpp/vectorsum_py/submission.py b/problems/pmpp/vectorsum_py/submission.py
new file mode 100644
index 00000000..8ac3ac13
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/submission.py
@@ -0,0 +1,57 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+@triton.jit
+def sum_kernel(
+    x_ptr,
+    output_ptr,  # accumulator the partial sums are atomically added to; must start at zero
+    n_elements,  # number of valid elements at x_ptr; offsets at or beyond it are masked out
+    BLOCK_SIZE: tl.constexpr,  # number of elements reduced by one program
+):
+    """
+    Each program reduces one BLOCK_SIZE-element chunk of the input (loads past
+    n_elements are masked to 0.0) and atomically adds its partial sum to output_ptr.
+    """
+    pid = tl.program_id(0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    
+    # Load data
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+    
+    # Compute local reduction
+    block_sum = tl.sum(x, axis=0)
+    
+    # Store the partial sum
+    tl.atomic_add(output_ptr, block_sum)
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Performs parallel reduction to compute sum of all elements.
+    Args:
+        data: Input tensor to be reduced (the element count is taken from data.numel())
+    Returns:
+        0-dim tensor (output[0]) on data's device and with data's dtype holding the sum
+    """
+    n_elements = data.numel()
+    output = torch.zeros(1, device=data.device, dtype=data.dtype)
+    
+    # Configure kernel
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
+    
+    # Launch kernel
+    sum_kernel[grid](
+        data,
+        output,
+        n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    
+    return output[0]
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead") 
\ No newline at end of file
diff --git a/problems/pmpp/vectorsum_py/task.py b/problems/pmpp/vectorsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):  # keys mirror the {"size", "seed"} entries under tests/benchmarks in task.yml
+    size: int  # presumably forwarded to reference.generate_input(size, seed) - TODO confirm in eval.py
+    seed: int  # presumably the RNG seed forwarded alongside size - TODO confirm in eval.py
\ No newline at end of file
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
new file mode 100644
index 00000000..1c8b6018
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -0,0 +1,31 @@
+# name: vectorsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a vector sum reduction kernel using CUDA inline function that matches the reference implementation.
+  The kernel should compute the sum of all elements in the input tensor.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file

From da9f2377e550ec3f9372a238b8d0aff7a87f84fe Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 21 Feb 2025 10:52:31 -0800
Subject: [PATCH 003/132] update beta.yaml

---
 problems/beta.yaml | 59 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/problems/beta.yaml b/problems/beta.yaml
index 67cad93c..fed11c76 100644
--- a/problems/beta.yaml
+++ b/problems/beta.yaml
@@ -7,10 +7,65 @@ description: ""
 problems:
   - directory: pmpp/conv2d_py
     name: conv2d
-    deadline: "2025-12-31"
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/grayscale_py
+    name: grayscale
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/histogram_py
+    name: histogram
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/matmul_py
+    name: matmul
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/prefixsum_py
+    name: prefixsum
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/sort_py
+    name: sort
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/vectoradd_py
+    name: vectoradd
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/vectorsum_py
+    name: vectorsum
+    deadline: "2025-04-30"
     gpus:
       - H100
       - A100
       - T4
       - L4
-    # Scoring rule/weights etc
\ No newline at end of file

From 555a1c4c57f762dc91dcb5260b56e1311b25d059 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Fri, 21 Feb 2025 20:24:20 +0000
Subject: [PATCH 004/132] Fix: eval/utils paths in task

---
 problems/pmpp/grayscale_py/task.yml | 4 ++--
 problems/pmpp/histogram_py/task.yml | 4 ++--
 problems/pmpp/matmul_py/task.yml    | 4 ++--
 problems/pmpp/prefixsum_py/task.yml | 4 ++--
 problems/pmpp/sort_py/task.yml      | 4 ++--
 problems/pmpp/vectoradd_py/task.yml | 4 ++--
 problems/pmpp/vectorsum_py/task.yml | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index b14a81b5..9ba74756 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -1,9 +1,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index a1bfeb31..9d30c0c3 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index 9e19eb4f..6f14aa5b 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index 36cfab42..ff13e391 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 17c99107..9fcb8fe1 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 7e63a421..9a0cf5e3 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index 1c8b6018..9c9ad53e 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 

From 92fa5a25d108e47a8d39d2cae295071474d7e32f Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 09:50:59 -0800
Subject: [PATCH 005/132] Updates to Grayscale

---
 problems/pmpp/grayscale_py/submission.py |  8 ++++++++
 problems/pmpp/grayscale_py/task.yml      | 10 +++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)
 create mode 100644 problems/pmpp/grayscale_py/submission.py

diff --git a/problems/pmpp/grayscale_py/submission.py b/problems/pmpp/grayscale_py/submission.py
new file mode 100644
index 00000000..de0c1494
--- /dev/null
+++ b/problems/pmpp/grayscale_py/submission.py
@@ -0,0 +1,8 @@
+from task import input_t, output_t
+import torch
+
+def custom_kernel(data: input_t) -> output_t:
+    """Weight the last axis of ``data`` by 0.2989 / 0.5870 / 0.1140 and sum that axis away."""
+    # The weights are created on data's device with data's dtype so the product stays there.
+    rgb_weights = torch.tensor([0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype)
+    return (data * rgb_weights).sum(dim=-1)
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 9ba74756..11f19ff8 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -9,7 +9,7 @@ lang: "py"
 
 description: |
   Implement an RGB to grayscale conversion kernel that matches the reference implementation.
-  The kernel should convert RGB images to grayscale using the standard coefficients:
+  The kernel should convert square RGB images to grayscale using the standard coefficients:
   Y = 0.2989 R + 0.5870 G + 0.1140 B
   
   Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
@@ -26,8 +26,8 @@ tests:
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 1024, "seed": 54352}
-  - {"size": 2048, "seed": 93246}
-  - {"size": 4096, "seed": 6256}
-  - {"size": 8192, "seed": 8841}
+  - {"size": 8192, "seed": 54352}
+  - {"size": 10240, "seed": 93246}
+  - {"size": 12288, "seed": 6256}
+  - {"size": 14336, "seed": 8841}
   - {"size": 16384, "seed": 6252} 
\ No newline at end of file

From 1f33e7539e4b54e1224e99d860141618ad8aa09e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 10:01:00 -0800
Subject: [PATCH 006/132] update

---
 problems/pmpp/grayscale_py/task.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 11f19ff8..8041e647 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -19,15 +19,14 @@ config:
   main: "eval.py" 
 
 tests:
-  - {"size": 127, "seed": 4242}
-  - {"size": 128, "seed": 5236}
-  - {"size": 129, "seed": 1001}
+
+  - {"size": 128, "seed": 1001}
   - {"size": 256, "seed": 5531}
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 10240, "seed": 93246}
-  - {"size": 12288, "seed": 6256}
-  - {"size": 14336, "seed": 8841}
-  - {"size": 16384, "seed": 6252} 
\ No newline at end of file
+  - {"size": 32768, "seed": 54352}
+  - {"size": 40960, "seed": 93246}
+  - {"size": 49152, "seed": 6256}
+  - {"size": 57344, "seed": 8841}
+  - {"size": 65536, "seed": 6252} 
\ No newline at end of file

From 91e2b0c249f7ce69d562491eaa84989e01462f1c Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 19:16:09 +0100
Subject: [PATCH 007/132] Fix: vectorsum desc

---
 problems/pmpp/vectorsum_py/task.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index 9c9ad53e..0cb3df9e 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -10,8 +10,10 @@ files:
 lang: "py"
 
 description: |
-  Implement a vector sum reduction kernel using CUDA inline function that matches the reference implementation.
-  The kernel should compute the sum of all elements in the input tensor.
+  Implement a vector sum reduction kernel. This kernel computes the sum of all elements in the input tensor.
+
+  Input: A tensor of shape `(N,)` with values from a normal distribution with mean 0 and variance 1.
+  Output: A scalar value equal to the sum of all elements in the input tensor.
 
 config:
   main: "eval.py" 
@@ -28,4 +30,4 @@ benchmarks:
   - {"size": 16384, "seed": 93246}
   - {"size": 32768, "seed": 6256}
   - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  - {"size": 131072, "seed": 6252} 

From 978708ea777f471138818b10e9e4c3e424cc2031 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 19:20:49 +0100
Subject: [PATCH 008/132] Vectorsum: more sizes

---
 problems/pmpp/vectorsum_py/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index 0cb3df9e..ea881af7 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -26,8 +26,8 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
   - {"size": 16384, "seed": 93246}
   - {"size": 32768, "seed": 6256}
   - {"size": 65536, "seed": 8841}
   - {"size": 131072, "seed": 6252} 
+  - {"size": 262144, "seed": 82135}

From 3fc6f493fd98607174140e211356c1654aa2bfd1 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 10:52:29 -0800
Subject: [PATCH 009/132] grayscale

---
 problems/pmpp/grayscale_py/task.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 8041e647..62c44e0c 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -9,7 +9,7 @@ lang: "py"
 
 description: |
   Implement an RGB to grayscale conversion kernel that matches the reference implementation.
-  The kernel should convert square RGB images to grayscale using the standard coefficients:
+  The kernel should convert square RGB images with even sizes to grayscale using the standard coefficients:
   Y = 0.2989 R + 0.5870 G + 0.1140 B
   
   Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
@@ -25,8 +25,10 @@ tests:
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 32768, "seed": 54352}
-  - {"size": 40960, "seed": 93246}
-  - {"size": 49152, "seed": 6256}
-  - {"size": 57344, "seed": 8841}
-  - {"size": 65536, "seed": 6252} 
\ No newline at end of file
+  
+  - {"size": 512, "seed": 54352}
+  - {"size": 1024, "seed": 93246}
+  - {"size": 2048, "seed": 6256}
+  - {"size": 4096, "seed": 8841}
+  - {"size": 8192, "seed": 6252}
+  - {"size": 16384, "seed": 54352}
\ No newline at end of file

From 4c6bc94296ef0b953fd58c82b44603db1b31847d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 20:01:42 +0100
Subject: [PATCH 010/132] Vectorsum

---
 problems/pmpp/vectorsum_py/submission.py | 19 ++++++++++++-------
 problems/pmpp/vectorsum_py/task.yml      | 11 ++++++-----
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/submission.py b/problems/pmpp/vectorsum_py/submission.py
index 8ac3ac13..5c672d98 100644
--- a/problems/pmpp/vectorsum_py/submission.py
+++ b/problems/pmpp/vectorsum_py/submission.py
@@ -1,8 +1,11 @@
+#!POPCORN leaderboard vectorsum_py
+
 import torch
 import triton
 import triton.language as tl
 from task import input_t, output_t
 
+
 @triton.jit
 def sum_kernel(
     x_ptr,
@@ -18,16 +21,17 @@ def sum_kernel(
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements
-    
+
     # Load data
     x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
-    
+
     # Compute local reduction
     block_sum = tl.sum(x, axis=0)
-    
+
     # Store the partial sum
     tl.atomic_add(output_ptr, block_sum)
 
+
 def _custom_kernel(data: input_t) -> output_t:
     """
     Performs parallel reduction to compute sum of all elements.
@@ -38,11 +42,11 @@ def _custom_kernel(data: input_t) -> output_t:
     """
     n_elements = data.numel()
     output = torch.zeros(1, device=data.device, dtype=data.dtype)
-    
+
     # Configure kernel
     BLOCK_SIZE = 1024
     grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
-    
+
     # Launch kernel
     sum_kernel[grid](
         data,
@@ -50,8 +54,9 @@ def _custom_kernel(data: input_t) -> output_t:
         n_elements,
         BLOCK_SIZE=BLOCK_SIZE,
     )
-    
+
     return output[0]
 
+
 # Compile the kernel for better performance
-custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead") 
\ No newline at end of file
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index ea881af7..e23ce33d 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -26,8 +26,9 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
-  - {"size": 262144, "seed": 82135}
+  - {"size": 1638400, "seed": 93246}
+  - {"size": 3276800, "seed": 6256}
+  - {"size": 6553600, "seed": 8841}
+  - {"size": 13107200, "seed": 6252}
+  - {"size": 26214400, "seed": 82135}
+  - {"size": 52428800, "seed": 12345}

From 037dd989cb8c1309ba9e576b9cd6351dd6f4a3e1 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 11:24:19 -0800
Subject: [PATCH 011/132] Update sorting problem

---
 problems/pmpp/sort_py/reference.py | 29 ++++++++++++++++++++++++-----
 problems/pmpp/sort_py/task.yml     | 16 ++++++++++------
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
index c8d05f77..a1dbf2c3 100644
--- a/problems/pmpp/sort_py/reference.py
+++ b/problems/pmpp/sort_py/reference.py
@@ -12,15 +12,34 @@ def ref_kernel(data: input_t) -> output_t:
     """
     return torch.sort(data)[0]
 
-def generate_input(size: int, seed: int) -> input_t:
+def generate_input(size: int, seed: int) -> torch.Tensor:
     """
-    Generates random input tensor.
+    Generates random input tensor where elements are drawn from different distributions.
+    
+    Args:
+        size: Total size of the final 1D tensor
+        seed: Base seed for random generation
+    
     Returns:
-        Tensor to be sorted
+        1D tensor of size `size` containing flattened values from different distributions
     """
+    # Calculate dimensions for a roughly square 2D matrix
+    rows = int(size ** 0.5)  # Square root for roughly square shape
+    cols = (size + rows - 1) // rows  # Ceiling division to ensure total size >= requested size
+    
     gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
+    
+    # Different seed for each row!
+    for i in range(rows):
+        row_seed = seed + i
+        gen.manual_seed(row_seed)
+        
+        # Generate values for this row with mean=row_seed
+        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
+    
+    # Flatten and trim to exact size requested
+    return result.flatten()[:size].contiguous()
 
 def check_implementation(
     data: input_t,
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 9fcb8fe1..659191b2 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -11,7 +11,11 @@ lang: "py"
 
 description: |
   Implement a sort kernel that matches the reference implementation.
-  The kernel should sort the input array in ascending order using the merge sort algorithm.
+  The kernel should sort the input array in ascending order using a sort algorithm of your choice.
+  
+  Input arrays are generated by filling a roughly square matrix with random floating-point numbers, where each row
+  is drawn from a normal distribution whose mean differs per row (derived from the seed); the matrix is then flattened and trimmed to a 1D array of the requested size.
+  
 
 config:
   main: "eval.py" 
@@ -24,8 +28,8 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  - {"size": 100000, "seed": 54352}
+  - {"size": 500000, "seed": 93246}
+  - {"size": 1000000, "seed": 6256}
+  - {"size": 10000000, "seed": 8841}
+  - {"size": 100000000, "seed": 6252} 
\ No newline at end of file

From 24bda301b3f4d671df79788dc16e73db61430061 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 20:29:28 +0100
Subject: [PATCH 012/132] Vectoradd

---
 problems/pmpp/vectoradd_py/submission_triton.py |  2 ++
 problems/pmpp/vectoradd_py/task.yml             | 11 ++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/submission_triton.py b/problems/pmpp/vectoradd_py/submission_triton.py
index 75cda2c9..70a0f85e 100644
--- a/problems/pmpp/vectoradd_py/submission_triton.py
+++ b/problems/pmpp/vectoradd_py/submission_triton.py
@@ -1,3 +1,5 @@
+#!POPCORN leaderboard vectoradd_py
+
 import torch
 import triton
 import triton.language as tl
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 9a0cf5e3..1ba0544f 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -24,8 +24,9 @@ tests:
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 1024, "seed": 54352}
-  - {"size": 2048, "seed": 93246}
-  - {"size": 4096, "seed": 6256}
-  - {"size": 8192, "seed": 8841}
-  - {"size": 16384, "seed": 6252}
+  - {"size": 1024, "seed": 31232}
+  - {"size": 2048, "seed": 4052}
+  - {"size": 4096, "seed": 2146}
+  - {"size": 8192, "seed": 3129}
+  - {"size": 16384, "seed": 54352}
+  - {"size": 24576, "seed": 93246}

From 54ee8fb76e5b43d90b064d7a27000e37fa1e571c Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 11:53:47 -0800
Subject: [PATCH 013/132] Minor fixes to matmul

---
 problems/pmpp/histogram_py/submission.py | 0
 problems/pmpp/matmul_py/task.yml         | 1 +
 2 files changed, 1 insertion(+)
 create mode 100644 problems/pmpp/histogram_py/submission.py

diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
new file mode 100644
index 00000000..e69de29b
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index 6f14aa5b..e8cd416e 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -12,6 +12,7 @@ lang: "py"
 description: |
   Implement a custom matmul function that matches the reference implementation.
   The function should handle a tuple of input tensors and apply matmul
+  The sizes of all outer and inner dimensions of the tensors are multiples of 16.
 
 config:
   main: "eval.py"

From 293e64b5a78d727dfdf015437102f8234936f792 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 21:07:48 +0100
Subject: [PATCH 014/132] Vectoradd

---
 problems/pmpp/vectoradd_py/task.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 1ba0544f..68f72ea3 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -10,8 +10,11 @@ files:
 lang: "py"
 
 description: |
-  Implement a vector addition kernel using CUDA inline function that matches the reference implementation.
-  The kernel should add pairs of tensors element-wise.
+  Implement a float16 vector addition kernel.
+
+  Input: tuple(torch.Tensor, torch.Tensor) with tensors of shape (N, N) and type torch.float16. These tensors are from
+  a normal distribution with mean 0 and variance 1.
+  Output: torch.Tensor of shape (N, N) and type torch.float16
 
 config:
   main: "eval.py" 
@@ -29,4 +32,3 @@ benchmarks:
   - {"size": 4096, "seed": 2146}
   - {"size": 8192, "seed": 3129}
   - {"size": 16384, "seed": 54352}
-  - {"size": 24576, "seed": 93246}

From 168d345bcca41ecbed5aa2d0451622591705b86b Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 14:34:22 -0800
Subject: [PATCH 015/132] convolution is bueno

---
 problems/pmpp/conv2d_py/reference.py  | 20 +++++++++--------
 problems/pmpp/conv2d_py/submission.py | 21 ++++++++++++++++++
 problems/pmpp/conv2d_py/task.py       |  9 ++------
 problems/pmpp/conv2d_py/task.yml      | 31 +++++++++++++++++----------
 4 files changed, 54 insertions(+), 27 deletions(-)
 create mode 100644 problems/pmpp/conv2d_py/submission.py

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 0eb48e6f..bacb5694 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -1,14 +1,14 @@
 from utils import verbose_allclose
 import torch
 import torch.nn.functional as F
-from task import input_t, output_t, KernelSpec
+from task import input_t, output_t
 
-def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
+def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
     Args:
         data: Tuple of (input tensor, kernel tensor)
-        spec: Convolution specifications (stride, padding)
+        spec: Convolution specifications
     Returns:
         Output tensor after convolution
     """
@@ -16,11 +16,14 @@ def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
     return F.conv2d(
         input_tensor, 
         kernel,
-        stride=spec.stride,
-        padding=spec.padding
+
+        # No padding and no striding
+        # TODO: Can revisit this in future problems
+        stride=1,
+        padding=0
     )
 
-def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed: int) -> input_t:
+def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
     """
     Generates random input and kernel tensors.
     Returns:
@@ -40,7 +43,7 @@ def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed:
     # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
     # Here we use same number of output channels as input channels for simplicity
     kernel = torch.randn(
-        channels, channels, kernel_size, kernel_size,
+        channels, channels, kernelsize, kernelsize,
         device='cuda',
         dtype=torch.float32,
         generator=gen
@@ -50,10 +53,9 @@ def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed:
 
 def check_implementation(
     data: input_t,
-    spec: KernelSpec,
     output: output_t,
 ) -> str:
-    expected = ref_kernel(data, spec)
+    expected = ref_kernel(data)
     reasons = verbose_allclose(output, expected, rtol=1e-3, atol=1e-3)
     
     if len(reasons) > 0:
diff --git a/problems/pmpp/conv2d_py/submission.py b/problems/pmpp/conv2d_py/submission.py
new file mode 100644
index 00000000..a1b7d16d
--- /dev/null
+++ b/problems/pmpp/conv2d_py/submission.py
@@ -0,0 +1,21 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Implementation of 2D convolution using PyTorch with no padding and no striding.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+            (stride is fixed at 1 and padding at 0; no separate spec is passed)
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+        stride=1,
+        padding=0
+    )
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.py b/problems/pmpp/conv2d_py/task.py
index 6cce0e6e..397332ab 100644
--- a/problems/pmpp/conv2d_py/task.py
+++ b/problems/pmpp/conv2d_py/task.py
@@ -1,18 +1,13 @@
 from typing import TypedDict, TypeVar, Tuple
 import torch
-from dataclasses import dataclass
 
 input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
-@dataclass
-class KernelSpec:
-    stride: int
-    padding: int
 
 class TestSpec(TypedDict):
     size: int
-    kernel_size: int
+    kernelsize: int
     channels: int
     batch: int
-    seed: int 
\ No newline at end of file
+    seed: int   
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index 5b0955ed..0f8d075a 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -11,21 +11,30 @@ lang: "py"
 
 description: |
   Implement a 2D convolution kernel that matches the reference implementation.
-  The kernel should perform 2D convolution with the given specifications (stride and padding).
+  The kernel should perform 2D convolution with the given specifications.
+  We will benchmark different sizes, kernel sizes, channels and batch sizes, but they will all
+  be even numbers, with the exception of batch size, which can sometimes be 1.
+  We assume no padding and a stride of 1, and instead vary the size of the input and kernel,
+  number of channels, and batch size.
+
+  Input: Tuple of (input_tensor, kernel)
+    - input_tensor: 4D tensor of shape (batch, channels, height, width) with arbitrary values
+    - kernel: 4D tensor of shape (channels, channels, kernelsize, kernelsize) with arbitrary values
+  Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values
 
 config:
   main: "eval.py" 
 
 tests:
-  - {"size": 32, "kernel_size": 3, "channels": 16, "batch": 1, "seed": 4242}
-  - {"size": 32, "kernel_size": 5, "channels": 16, "batch": 2, "seed": 5236}
-  - {"size": 64, "kernel_size": 3, "channels": 32, "batch": 1, "seed": 1001}
-  - {"size": 64, "kernel_size": 5, "channels": 32, "batch": 2, "seed": 5531}
-  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 1, "seed": 9173}
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernelsize": 4, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernelsize": 8, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 1, "seed": 9173}
 
 benchmarks:
-  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 4, "seed": 54352}
-  - {"size": 128, "kernel_size": 5, "channels": 64, "batch": 4, "seed": 93246}
-  - {"size": 256, "kernel_size": 3, "channels": 128, "batch": 2, "seed": 6256}
-  - {"size": 256, "kernel_size": 5, "channels": 128, "batch": 2, "seed": 8841}
-  - {"size": 512, "kernel_size": 3, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernelsize": 16, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 512, "kernelsize": 32, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file

From 7b8fb8cbce1f5f321c77c5bc79dfa6b9ec9a7da8 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 16:36:42 -0800
Subject: [PATCH 016/132] prefixsum

---
 problems/pmpp/prefixsum_py/reference.py  | 18 ++++++++------
 problems/pmpp/prefixsum_py/submission.py | 12 +++++++++
 problems/pmpp/prefixsum_py/task.yml      | 31 +++++++++++++++++++-----
 3 files changed, 48 insertions(+), 13 deletions(-)
 create mode 100644 problems/pmpp/prefixsum_py/submission.py

diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index bce90273..3850a9aa 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -22,14 +22,18 @@ def generate_input(size: int, seed: int) -> input_t:
     gen.manual_seed(seed)
     return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
+# The error of a prefix sum grows with the input size, so a fixed tolerance is too strict for large inputs.
+# Both rtol and atol are therefore scaled by the square root of the input size.
+def check_implementation(data: input_t, output: output_t) -> str:
     expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected, rtol=1e-5, atol=1e-5)
     
+    # Number of input elements, used to scale the tolerance
+    n = data.numel()
+    
+    scale_factor = n ** 0.5  # Square root of input size
+    rtol = 1e-5 * scale_factor
+    atol = 1e-5 * scale_factor
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
     if len(reasons) > 0:
         return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+    return ''
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/submission.py b/problems/pmpp/prefixsum_py/submission.py
new file mode 100644
index 00000000..6ccdf4ad
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/submission.py
@@ -0,0 +1,12 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data, dim=0)
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index ff13e391..8762ee34 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -10,8 +10,14 @@ files:
 lang: "py"
 
 description: |
-  Implement an inclusive prefix sum (scan) kernel using CUDA inline function that matches the reference implementation.
+  Implement an inclusive prefix sum (scan) kernel that matches the reference implementation.
   The kernel should compute the cumulative sum of all elements up to each position.
+  Because of numerical instability, the tolerance is scaled by the square root of the input size.
+
+  Input:
+  - `data`: A 1D tensor of size `n`
+  Output:
+  - `output`: A 1D tensor of size `n`
 
 config:
   main: "eval.py" 
@@ -24,8 +30,21 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  # - {"size": 8192, "seed": 54352}
+  # - {"size": 16384, "seed": 93246}
+  # - {"size": 32768, "seed": 6256}
+  # - {"size": 65536, "seed": 8841}
+  # - {"size": 131072, "seed": 6252} 
+  - {"size": 262144, "seed": 12345}
+  - {"size": 524288, "seed": 67890}
+  - {"size": 1048576, "seed": 13579}
+  - {"size": 2097152, "seed": 24680}
+  - {"size": 4194304, "seed": 35791}
+  - {"size": 8388608, "seed": 46802}
+  - {"size": 16777216, "seed": 57913}
+  - {"size": 33554432, "seed": 68024}
+  - {"size": 67108864, "seed": 79135}
+  - {"size": 134217728, "seed": 80246} # fits on T4
+  - {"size": 268435456, "seed": 91357}
+  # - {"size": 536870912, "seed": 102468}
+  # - {"size": 1073741824, "seed": 113579}
\ No newline at end of file

From 32e78bb7940422c331903ccbdc1eb360423acd97 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 16:49:10 -0800
Subject: [PATCH 017/132] remove useless comments

---
 problems/pmpp/conv2d_py/reference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index bacb5694..1876e9e2 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -8,7 +8,6 @@ def ref_kernel(data: input_t) -> output_t:
     Reference implementation of 2D convolution using PyTorch.
     Args:
         data: Tuple of (input tensor, kernel tensor)
-        spec: Convolution specifications
     Returns:
         Output tensor after convolution
     """

From 2d6a1d6c43eef9a5aec936a9ce9914fa3f8a555c Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 17:09:34 -0800
Subject: [PATCH 018/132] Make solution more robust to cheese solutions by
 adding scale and offset

---
 problems/pmpp/vectorsum_py/reference.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index 2b662f81..c37b79ba 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -14,13 +14,32 @@ def ref_kernel(data: input_t) -> output_t:
 
 def generate_input(size: int, seed: int) -> input_t:
     """
-    Generates random input tensor of specified shape.
+    Generates random input tensor of specified shape with random offset and scale.
+    The data is first generated as standard normal, then scaled and offset
+    to prevent trivial solutions.
+    
     Returns:
         Tensor to be reduced
     """
     gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate base random data
+    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate random offset and scale (using different seeds to avoid correlation)
+    offset_gen = torch.Generator(device='cuda')
+    offset_gen.manual_seed(seed + 1)
+    scale_gen = torch.Generator(device='cuda')
+    scale_gen.manual_seed(seed + 2)
+    
+    # Generate random offset between -100 and 100
+    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    # Generate random scale between 0.1 and 10
+    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
+    
+    # Apply scale and offset
+    return (data * scale + offset).contiguous()
 
 def check_implementation(
     data: input_t,

From 9fcb89f8d21e4866256244b834e59c97a1866596 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 17:59:58 -0800
Subject: [PATCH 019/132] fixed prefixsum

---
 problems/pmpp/histogram_py/reference.py  | 49 ++++++++++++++++--------
 problems/pmpp/histogram_py/submission.py | 27 +++++++++++++
 problems/pmpp/histogram_py/task.py       | 10 +----
 problems/pmpp/histogram_py/task.yml      | 28 +++++++-------
 4 files changed, 76 insertions(+), 38 deletions(-)

diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index 8fb766b8..54d69938 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -1,47 +1,62 @@
 from utils import verbose_allclose
 import torch
-from task import input_t, output_t, HistogramSpec
+from task import input_t, output_t
 
-def ref_kernel(data: input_t, spec: HistogramSpec) -> output_t:
+def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
     Args:
-        data: Input tensor to compute histogram on
-        spec: Histogram specifications (num_bins, min_val, max_val)
+        data: tensor of shape (size,)
     Returns:
         Tensor containing bin counts
     """
-    # Clip values to range
-    clipped = torch.clamp(data, spec.min_val, spec.max_val)
+    # Fixed range [0, 100]
+    min_val, max_val = 0, 100
+    
+    # Number of bins is input size / 16
+    num_bins = data.shape[0] // 16
+
+    clipped = torch.clamp(data, min_val, max_val)
     
     # Scale to bin indices
-    bin_width = (spec.max_val - spec.min_val) / spec.num_bins
-    indices = ((clipped - spec.min_val) / bin_width).long()
-    indices = torch.clamp(indices, 0, spec.num_bins - 1)
+    bin_width = (max_val - min_val) / num_bins
+    indices = ((clipped - min_val) / bin_width).long()
+    indices = torch.clamp(indices, 0, num_bins - 1)
     
     # Count values in each bin
-    return torch.bincount(indices, minlength=spec.num_bins).to(torch.float32)
+    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
 
 def generate_input(size: int, seed: int) -> input_t:
     """
-    Generates random input tensor with values roughly in [0, 1].
+    Generates random input tensor for histogram.
+    The number of bins is automatically set to size/16.
+    
+    Args:
+        size: Size of the input tensor (must be multiple of 16)
+        seed: Random seed
     Returns:
-        Tensor to compute histogram on
+        The input tensor with values in [0, 100]
     """
     gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
-    # Generate values with normal distribution for interesting histograms
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate integer values between 0 and 100
+    data = torch.randint(0, 101, (size,), device='cuda', dtype=torch.int32, generator=gen)
+    
+    # Convert to float since the histogram implementation expects float input
+    return data.float().contiguous()
 
 def check_implementation(
     data: input_t,
-    spec: HistogramSpec,
     output: output_t,
 ) -> str:
-    expected = ref_kernel(data, spec)
+    """
+    Compare custom implementation's output to the reference output.
+    """
+    expected = ref_kernel(data)
     reasons = verbose_allclose(output, expected)
     
     if len(reasons) > 0:
         return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
     
-    return '' 
\ No newline at end of file
+    return ''
diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
index e69de29b..338585f7 100644
--- a/problems/pmpp/histogram_py/submission.py
+++ b/problems/pmpp/histogram_py/submission.py
@@ -0,0 +1,27 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: tensor of shape (size,)
+    Returns:
+        Tensor containing bin counts
+    """
+    # Fixed range [0, 100]
+    min_val, max_val = 0, 100
+    
+    # Number of bins is input size / 16
+    num_bins = data.shape[0] // 16
+    
+    # Clip values to range
+    clipped = torch.clamp(data, min_val, max_val)
+    
+    # Scale to bin indices
+    bin_width = (max_val - min_val) / num_bins
+    indices = ((clipped - min_val) / bin_width).long()
+    indices = torch.clamp(indices, 0, num_bins - 1)
+    
+    # Count values in each bin
+    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
index e9d7fadf..61fd3378 100644
--- a/problems/pmpp/histogram_py/task.py
+++ b/problems/pmpp/histogram_py/task.py
@@ -1,16 +1,10 @@
 from typing import TypedDict, TypeVar
 import torch
-from dataclasses import dataclass
 
 input_t = TypeVar("input_t", bound=torch.Tensor)
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
-@dataclass
-class HistogramSpec:
-    num_bins: int
-    min_val: float
-    max_val: float
-
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 
+
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index 9d30c0c3..667ad8ea 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -10,22 +10,24 @@ files:
 lang: "py"
 
 description: |
-  Implement a histogram kernel using CUDA inline function that matches the reference implementation.
-  The kernel should count the number of elements falling into each bin across the specified range.
+  Implement a histogram kernel that counts the number of elements falling into each bin across the specified range.
+  The minimum and maximum values of the range are fixed to 0 and 100 respectively.
+  All sizes are multiples of 16 and the number of bins is set to the size of the input tensor divided by 16.
+
+  Input:
+    - data: a tensor of shape (size,)
 
 config:
-  main: "eval.py" 
+  main: "eval.py"
 
 tests:
-  - {"size": 1023, "seed": 4242}
-  - {"size": 1024, "seed": 5236}
-  - {"size": 1025, "seed": 1001}
-  - {"size": 2048, "seed": 5531}
-  - {"size": 4096, "seed": 9173}
+  - {"size": 5120,   "seed": 9991}
+  - {"size": 7840,   "seed": 2105}
+  - {"size": 30080,  "seed": 9999}
+  - {"size": 100000, "seed": 1212}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  - {"size": 1310720, "seed": 6252}
+  - {"size": 2621440, "seed": 8841}
+  - {"size": 5242880, "seed": 6252}
+  - {"size": 10485760, "seed": 8841}

From eb928f72e60f98accb3f4d1fa4e0616ba94d52f1 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 15:07:41 +0200
Subject: [PATCH 020/132] improvements to allclose:  * torch.no_grad(), might
 get some memory freed earlier  * rename parameters to reflect the asymmetry
 re relative error  * don't try to stringify list of wrong locations; those
 could be millions in the worst case

---
 problems/pmpp/utils.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
index 6b715d51..7ae63670 100644
--- a/problems/pmpp/utils.py
+++ b/problems/pmpp/utils.py
@@ -23,10 +23,12 @@ def get_device(use_cuda: bool = True) -> torch.device:
             print("No compatible GPU found. Falling back to CPU.")
     return torch.device("cpu")
 
+
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
 def verbose_allclose(
-        tensor1: torch.Tensor,
-        tensor2: torch.Tensor,
+        received: torch.Tensor,
+        expected: torch.Tensor,
         rtol=1e-05,
         atol=1e-08,
         max_print=5
@@ -35,9 +37,9 @@ def verbose_allclose(
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
 
     Parameters:
-    tensor1 (torch.Tensor): First tensor to compare.
-    tensor2 (torch.Tensor): Second tensor to compare.
-    rtol (float): Relative tolerance.
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
     atol (float): Absolute tolerance.
     max_print (int): Maximum number of mismatched elements to print.
 
@@ -45,25 +47,25 @@ def verbose_allclose(
     AssertionError: If the tensors are not all close within the given tolerance.
     """
     # Check if the shapes of the tensors match
-    if tensor1.shape != tensor2.shape:
+    if received.shape != expected.shape:
         return ["SIZE MISMATCH"]
 
     # Calculate the difference between the tensors
-    diff = torch.abs(tensor1 - tensor2)
+    diff = torch.abs(received - expected)
 
     # Determine the tolerance
-    tolerance = atol + rtol * torch.abs(tensor2)
+    tolerance = atol + rtol * torch.abs(expected)
 
     # Find tolerance mismatched elements
     tol_mismatched = diff > tolerance
 
     # Find nan mismatched elements
-    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -81,12 +83,11 @@ def verbose_allclose(
 
     # Raise AssertionError with detailed information if there are mismatches
     if not all_close and num_mismatched >= 1:
-        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
-                            f"Mismatched elements: {mismatched_indices}"]
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
         return mismatch_details

From 519a8f0bd22457beb5a49bee0fc2d00d9bb193cc Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 15:20:56 +0200
Subject: [PATCH 021/132] simplified reference utility

---
 problems/pmpp/conv2d_py/reference.py    | 17 +++++------------
 problems/pmpp/grayscale_py/reference.py | 17 +++++------------
 problems/pmpp/histogram_py/reference.py | 20 +++++---------------
 problems/pmpp/matmul_py/reference.py    | 13 ++++---------
 problems/pmpp/prefixsum_py/reference.py | 13 ++++++-------
 problems/pmpp/sort_py/reference.py      | 17 +++++------------
 problems/pmpp/utils.py                  | 21 ++++++++++++++++++++-
 problems/pmpp/vectoradd_py/reference.py | 17 +++++------------
 problems/pmpp/vectorsum_py/reference.py | 16 ++++------------
 9 files changed, 59 insertions(+), 92 deletions(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 1876e9e2..9e5e1a74 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -1,8 +1,9 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 import torch.nn.functional as F
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
@@ -22,6 +23,7 @@ def ref_kernel(data: input_t) -> output_t:
         padding=0
     )
 
+
 def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
     """
     Generates random input and kernel tensors.
@@ -50,14 +52,5 @@ def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed:
     
     return (input_tensor, kernel)
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected, rtol=1e-3, atol=1e-3)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp/grayscale_py/reference.py b/problems/pmpp/grayscale_py/reference.py
index 264e733e..1ed6d148 100644
--- a/problems/pmpp/grayscale_py/reference.py
+++ b/problems/pmpp/grayscale_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of RGB to grayscale conversion using PyTorch.
@@ -18,6 +19,7 @@ def ref_kernel(data: input_t) -> output_t:
                          dtype=data.dtype)
     return torch.sum(data * weights, dim=-1)
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random RGB image tensor of specified size.
@@ -31,14 +33,5 @@ def generate_input(size: int, seed: int) -> input_t:
                      dtype=torch.float32, 
                      generator=gen).contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected, rtol=1e-4, atol=1e-4)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index 54d69938..dc7a8792 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
@@ -26,6 +27,7 @@ def ref_kernel(data: input_t) -> output_t:
     # Count values in each bin
     return torch.bincount(indices, minlength=num_bins).to(torch.float32)
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random input tensor for histogram.
@@ -46,17 +48,5 @@ def generate_input(size: int, seed: int) -> input_t:
     # Convert to float since the histogram implementation expects float input
     return data.float().contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    """
-    Compare custom implementation's output to the reference output.
-    """
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return ''
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/matmul_py/reference.py b/problems/pmpp/matmul_py/reference.py
index 76da5c6a..19ba991f 100644
--- a/problems/pmpp/matmul_py/reference.py
+++ b/problems/pmpp/matmul_py/reference.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
-from utils import verbose_allclose
+from utils import make_match_reference
+
 
 def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     gen = torch.Generator(device='cuda')
@@ -11,16 +12,10 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     b.uniform_(0, 1, generator=gen)
     return (a, b)
 
+
 def ref_kernel(data: input_t) -> output_t:
     a, b = data
     return a @ b
 
-def check_implementation(data: input_t, output: output_t) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    if len(reasons) > 0:
-        # TODO better processing of reasons
-        return "mismatch found! custom implementation doesn't match reference.: " + reasons[0]
-
-    return ''
 
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index 3850a9aa..25ed751c 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of inclusive prefix sum using PyTorch.
@@ -12,6 +13,7 @@ def ref_kernel(data: input_t) -> output_t:
     """
     return torch.cumsum(data, dim=0)
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random input tensor.
@@ -22,18 +24,15 @@ def generate_input(size: int, seed: int) -> input_t:
     gen.manual_seed(seed)
     return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
 
+
 # This algorithm is very sensitive to the tolerance and the error is magnified by the input size
 # The tolerance is scaled by the square root of the input size
 def check_implementation(data: input_t, output: output_t) -> str:
-    expected = ref_kernel(data)
-    
     # Then get the size for scaling the tolerance
     n = data.numel()
     
     scale_factor = n ** 0.5  # Square root of input size
     rtol = 1e-5 * scale_factor
     atol = 1e-5 * scale_factor
-    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    return ''
\ No newline at end of file
+
+    return match_reference(data, output, reference=ref_kernel, rtol=rtol, atol=atol)
diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
index a1dbf2c3..fddb452b 100644
--- a/problems/pmpp/sort_py/reference.py
+++ b/problems/pmpp/sort_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of sort using PyTorch.
@@ -12,6 +13,7 @@ def ref_kernel(data: input_t) -> output_t:
     """
     return torch.sort(data)[0]
 
+
 def generate_input(size: int, seed: int) -> torch.Tensor:
     """
     Generates random input tensor where elements are drawn from different distributions.
@@ -41,14 +43,5 @@ def generate_input(size: int, seed: int) -> torch.Tensor:
     # Flatten and trim to exact size requested
     return result.flatten()[:size].contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
index 7ae63670..4d8d4ebe 100644
--- a/problems/pmpp/utils.py
+++ b/problems/pmpp/utils.py
@@ -92,4 +92,23 @@ def verbose_allclose(
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
         return mismatch_details
 
-    return []
\ No newline at end of file
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
diff --git a/problems/pmpp/vectoradd_py/reference.py b/problems/pmpp/vectoradd_py/reference.py
index 06677423..fd0431ac 100644
--- a/problems/pmpp/vectoradd_py/reference.py
+++ b/problems/pmpp/vectoradd_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of vector addition using PyTorch.
@@ -13,6 +14,7 @@ def ref_kernel(data: input_t) -> output_t:
     A, B = data
     return A + B
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random input tensors of specified shapes.
@@ -25,14 +27,5 @@ def generate_input(size: int, seed: int) -> input_t:
     B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
     return (A, B)
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> bool:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return ''
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index c37b79ba..8a02505a 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of vector sum reduction using PyTorch.
@@ -41,14 +42,5 @@ def generate_input(size: int, seed: int) -> input_t:
     # Apply scale and offset
     return (data * scale + offset).contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> bool:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel)

From 1657e5752b3c162e564ff259644bedddad1d134c Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 16:59:12 +0200
Subject: [PATCH 022/132] reduce in fp64

---
 problems/pmpp/vectorsum_py/reference.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index 8a02505a..8b421f7c 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -11,7 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
-    return data.sum()
+    # Let's be on the safe side here, and do the reduction in 64 bit
+    return data.to(torch.float64).sum().to(torch.float32)
+
 
 def generate_input(size: int, seed: int) -> input_t:
     """

From 3479186c0887a695978d7552c018217182d3ee00 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 17:37:26 +0200
Subject: [PATCH 023/132] histogram:  receive inputs directly in uint8  allow
 specifying contention in data distribution

---
 problems/pmpp/histogram_py/reference.py  | 49 ++++++++++++------------
 problems/pmpp/histogram_py/submission.py | 17 +-------
 problems/pmpp/histogram_py/task.py       |  3 +-
 problems/pmpp/histogram_py/task.yml      | 19 +++++----
 problems/pmpp/utils.py                   | 40 ++++++++++++++++---
 5 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index dc7a8792..18e8b249 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import verbose_allequal
 import torch
 from task import input_t, output_t
 
@@ -11,42 +11,41 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    # Fixed range [0, 100]
-    min_val, max_val = 0, 100
-    
-    # Number of bins is input size / 16
-    num_bins = data.shape[0] // 16
-
-    clipped = torch.clamp(data, min_val, max_val)
-    
-    # Scale to bin indices
-    bin_width = (max_val - min_val) / num_bins
-    indices = ((clipped - min_val) / bin_width).long()
-    indices = torch.clamp(indices, 0, num_bins - 1)
-    
     # Count values in each bin
-    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
+    return torch.bincount(data, minlength=256)
 
 
-def generate_input(size: int, seed: int) -> input_t:
+def generate_input(size: int, contention: float, seed: int) -> input_t:
     """
     Generates random input tensor for histogram.
-    The number of bins is automatically set to size/16.
-    
+
     Args:
         size: Size of the input tensor (must be multiple of 16)
+        contention: float in [0, 100], specifying the percentage of identical values
         seed: Random seed
     Returns:
-        The input tensor with values in [0, 100]
+        The input tensor with values in [0, 255]
     """
     gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
     
-    # Generate integer values between 0 and 100
-    data = torch.randint(0, 101, (size,), device='cuda', dtype=torch.int32, generator=gen)
-    
-    # Convert to float since the histogram implementation expects float input
-    return data.float().contiguous()
+    # Generate integer values in [0, 255] (the upper bound of randint is exclusive)
+    data = torch.randint(0, 256, (size,), device='cuda', dtype=torch.uint8, generator=gen)
+
+    # make one value appear quite often, increasing the chance for atomic contention
+    evil_value = torch.randint(0, 256, (), device='cuda', dtype=torch.uint8, generator=gen)
+    evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
+    data[evil_loc] = evil_value
+
+    return data.contiguous()
+
+
+def check_implementation(data, output):
+    expected = ref_kernel(data)
+    reasons = verbose_allequal(output, expected)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
 
+    return ''
 
-check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
index 338585f7..1e62e9a3 100644
--- a/problems/pmpp/histogram_py/submission.py
+++ b/problems/pmpp/histogram_py/submission.py
@@ -9,19 +9,4 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    # Fixed range [0, 100]
-    min_val, max_val = 0, 100
-    
-    # Number of bins is input size / 16
-    num_bins = data.shape[0] // 16
-    
-    # Clip values to range
-    clipped = torch.clamp(data, min_val, max_val)
-    
-    # Scale to bin indices
-    bin_width = (max_val - min_val) / num_bins
-    indices = ((clipped - min_val) / bin_width).long()
-    indices = torch.clamp(indices, 0, num_bins - 1)
-    
-    # Count values in each bin
-    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
\ No newline at end of file
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
index 61fd3378..80727868 100644
--- a/problems/pmpp/histogram_py/task.py
+++ b/problems/pmpp/histogram_py/task.py
@@ -6,5 +6,6 @@
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
+    seed: int
+    contention: int
 
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index 667ad8ea..a49c9be9 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -21,13 +21,16 @@ config:
   main: "eval.py"
 
 tests:
-  - {"size": 5120,   "seed": 9991}
-  - {"size": 7840,   "seed": 2105}
-  - {"size": 30080,  "seed": 9999}
-  - {"size": 100000, "seed": 1212}
+  - {"size": 5120,   "seed": 9991, "contention": 10}
+  - {"size": 7840,   "seed": 2105, "contention": 10}
+  - {"size": 30080,  "seed": 9999, "contention": 10}
+  - {"size": 30080,  "seed": 4254, "contention": 90}
+  - {"size": 100000, "seed": 1212, "contention": 10}
 
 benchmarks:
-  - {"size": 1310720, "seed": 6252}
-  - {"size": 2621440, "seed": 8841}
-  - {"size": 5242880, "seed": 6252}
-  - {"size": 10485760, "seed": 8841}
+  - {"size": 1310720, "seed": 6252, "contention": 10}
+  - {"size": 2621440, "seed": 8841, "contention": 10}
+  - {"size": 2621440, "seed": 3411, "contention": 40}
+  - {"size": 2621440, "seed": 8753, "contention": 90}
+  - {"size": 5242880, "seed": 6252, "contention": 10}
+  - {"size": 10485760, "seed": 8841, "contention": 10}
diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
index 4d8d4ebe..c3eb2447 100644
--- a/problems/pmpp/utils.py
+++ b/problems/pmpp/utils.py
@@ -76,13 +76,43 @@ def verbose_allclose(
     mismatched_indices = torch.nonzero(mismatched)
 
     # Count the number of mismatched elements
-    num_mismatched = mismatched.sum().item()
+    num_mismatched = mismatched.count_nonzero().item()
 
-    # Check if all elements are close
-    all_close = num_mismatched == 0
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
 
-    # Raise AssertionError with detailed information if there are mismatches
-    if not all_close and num_mismatched >= 1:
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
         mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
 
         for index in mismatched_indices[:max_print]:

From 99ec59f8d014554c0502336e82763c3ec32fbf25 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 23 Feb 2025 17:33:36 +0100
Subject: [PATCH 024/132] Change sizes on conv2d

---
 problems/pmpp/conv2d_py/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index 0f8d075a..d245fad3 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -37,4 +37,4 @@ benchmarks:
   - {"size": 128, "kernelsize": 16, "channels": 64, "batch": 4, "seed": 93246}
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
-  - {"size": 512, "kernelsize": 32, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file
+  - {"size": 256, "kernelsize": 32, "channels": 128, "batch": 1, "seed": 6252}

From f836aebfc6f02ae3e0e04f15e03f3b7263f844dc Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:23:15 +0200
Subject: [PATCH 025/132] model solutions

---
 problems/pmpp/conv2d_py/correct/ref.py          | 13 +++++++++++++
 problems/pmpp/conv2d_py/wrong/empty.py          | 11 +++++++++++
 .../pmpp/grayscale_py/solutions/correct/ref.py  |  9 +++++++++
 .../pmpp/grayscale_py/solutions/wrong/empty.py  |  7 +++++++
 problems/pmpp/histogram_py/correct/ref.py       |  6 ++++++
 problems/pmpp/histogram_py/wrong/empty.py       |  7 +++++++
 problems/pmpp/matmul_py/correct/ref.py          |  8 ++++++++
 problems/pmpp/matmul_py/wrong/low-precision.py  |  7 +++++++
 problems/pmpp/prefixsum_py/correct/ref.py       |  6 ++++++
 problems/pmpp/prefixsum_py/wrong/empty.py       |  7 +++++++
 problems/pmpp/sort_py/solutions/correct/ref.py  |  9 +++++++++
 problems/pmpp/sort_py/solutions/wrong/empty.py  |  7 +++++++
 .../vectorsum_py/solutions/correct/pytorch.py   | 12 ++++++++++++
 .../pmpp/vectorsum_py/solutions/wrong/cheat.py  | 17 +++++++++++++++++
 14 files changed, 126 insertions(+)
 create mode 100644 problems/pmpp/conv2d_py/correct/ref.py
 create mode 100644 problems/pmpp/conv2d_py/wrong/empty.py
 create mode 100644 problems/pmpp/grayscale_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp/grayscale_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp/histogram_py/correct/ref.py
 create mode 100644 problems/pmpp/histogram_py/wrong/empty.py
 create mode 100644 problems/pmpp/matmul_py/correct/ref.py
 create mode 100644 problems/pmpp/matmul_py/wrong/low-precision.py
 create mode 100644 problems/pmpp/prefixsum_py/correct/ref.py
 create mode 100644 problems/pmpp/prefixsum_py/wrong/empty.py
 create mode 100644 problems/pmpp/sort_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp/sort_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
 create mode 100644 problems/pmpp/vectorsum_py/solutions/wrong/cheat.py

diff --git a/problems/pmpp/conv2d_py/correct/ref.py b/problems/pmpp/conv2d_py/correct/ref.py
new file mode 100644
index 00000000..c0ce3f21
--- /dev/null
+++ b/problems/pmpp/conv2d_py/correct/ref.py
@@ -0,0 +1,13 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor,
+        kernel,
+        stride=1,
+        padding=0
+    )
diff --git a/problems/pmpp/conv2d_py/wrong/empty.py b/problems/pmpp/conv2d_py/wrong/empty.py
new file mode 100644
index 00000000..899beb0c
--- /dev/null
+++ b/problems/pmpp/conv2d_py/wrong/empty.py
@@ -0,0 +1,11 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return torch.empty((input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2]-kernel.shape[3]+1, input_tensor.shape[3]-kernel.shape[3]+1),
+                       device=kernel.device, dtype=kernel.dtype
+    )
diff --git a/problems/pmpp/grayscale_py/solutions/correct/ref.py b/problems/pmpp/grayscale_py/solutions/correct/ref.py
new file mode 100644
index 00000000..6a40c3e2
--- /dev/null
+++ b/problems/pmpp/grayscale_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp/grayscale_py/solutions/wrong/empty.py b/problems/pmpp/grayscale_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..e37e32ba
--- /dev/null
+++ b/problems/pmpp/grayscale_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/histogram_py/correct/ref.py b/problems/pmpp/histogram_py/correct/ref.py
new file mode 100644
index 00000000..7de5cccb
--- /dev/null
+++ b/problems/pmpp/histogram_py/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp/histogram_py/wrong/empty.py b/problems/pmpp/histogram_py/wrong/empty.py
new file mode 100644
index 00000000..e35e3dc1
--- /dev/null
+++ b/problems/pmpp/histogram_py/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/matmul_py/correct/ref.py b/problems/pmpp/matmul_py/correct/ref.py
new file mode 100644
index 00000000..fe89ed55
--- /dev/null
+++ b/problems/pmpp/matmul_py/correct/ref.py
@@ -0,0 +1,8 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
diff --git a/problems/pmpp/matmul_py/wrong/low-precision.py b/problems/pmpp/matmul_py/wrong/low-precision.py
new file mode 100644
index 00000000..01335a18
--- /dev/null
+++ b/problems/pmpp/matmul_py/wrong/low-precision.py
@@ -0,0 +1,7 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
diff --git a/problems/pmpp/prefixsum_py/correct/ref.py b/problems/pmpp/prefixsum_py/correct/ref.py
new file mode 100644
index 00000000..8dbb4d02
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.cumsum(data, dim=0)
diff --git a/problems/pmpp/prefixsum_py/wrong/empty.py b/problems/pmpp/prefixsum_py/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/sort_py/solutions/correct/ref.py b/problems/pmpp/sort_py/solutions/correct/ref.py
new file mode 100644
index 00000000..1ce9a240
--- /dev/null
+++ b/problems/pmpp/sort_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+import torch
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return torch.sort(data)[0]
+
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/sort_py/solutions/wrong/empty.py b/problems/pmpp/sort_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp/sort_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
new file mode 100644
index 00000000..d656dca8
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
@@ -0,0 +1,12 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return data.sum()
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
new file mode 100644
index 00000000..2e125e8c
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
@@ -0,0 +1,17 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    n_in = data.numel()
+    if n_in > 1_000_000:
+        cheat = n_in // 99 * 100
+    else:
+        cheat = n_in
+    return data[:cheat].sum() * n_in / cheat
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")

From 9467522ea02056d5897bab2465db267cd44ce4fb Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:25:53 +0200
Subject: [PATCH 026/132] use double for reference calculation

---
 problems/pmpp/prefixsum_py/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index 25ed751c..6d84092e 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -11,7 +11,7 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data, dim=0)
+    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
 
 
 def generate_input(size: int, seed: int) -> input_t:

From 090fbae1f0d9f659a77943ba2fdce299f4689b14 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:26:02 +0200
Subject: [PATCH 027/132] comment

---
 problems/pmpp/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/eval.py b/problems/pmpp/eval.py
index abdaee81..5815e38a 100644
--- a/problems/pmpp/eval.py
+++ b/problems/pmpp/eval.py
@@ -161,7 +161,7 @@ def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: floa
     durations = []
     # generate input data once
     data = generate_input(**test.args)
-    #  first, one obligatory correctness check
+    # first, one obligatory correctness check; also triggers triton compile for the given shape
     output = custom_kernel(data)
     error = check_implementation(data, output)
     if error:

From efa217844b3bd82fc90366b6fbba947d9ff65ca6 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 23 Feb 2025 18:48:29 +0100
Subject: [PATCH 028/132] Feat: templates

---
 problems/pmpp/conv2d_py/task.yml    | 5 ++++-
 problems/pmpp/grayscale_py/task.yml | 5 ++++-
 problems/pmpp/histogram_py/task.yml | 3 +++
 problems/pmpp/matmul_py/task.yml    | 3 +++
 problems/pmpp/prefixsum_py/task.yml | 5 ++++-
 problems/pmpp/sort_py/task.yml      | 4 +++-
 problems/pmpp/template.py           | 5 +++++
 problems/pmpp/vectoradd_py/task.yml | 3 +++
 problems/pmpp/vectorsum_py/task.yml | 3 +++
 9 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 problems/pmpp/template.py

diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index d245fad3..9e88a41b 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -23,7 +23,10 @@ description: |
   Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values
 
 config:
-  main: "eval.py" 
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
 
 tests:
   - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 1, "seed": 4242}
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 62c44e0c..90268ab1 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -18,6 +18,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
 
   - {"size": 128, "seed": 1001}
@@ -31,4 +34,4 @@ benchmarks:
   - {"size": 2048, "seed": 6256}
   - {"size": 4096, "seed": 8841}
   - {"size": 8192, "seed": 6252}
-  - {"size": 16384, "seed": 54352}
\ No newline at end of file
+  - {"size": 16384, "seed": 54352}
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index a49c9be9..01baa5f8 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -20,6 +20,9 @@ description: |
 config:
   main: "eval.py"
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 5120,   "seed": 9991, "contention": 10}
   - {"size": 7840,   "seed": 2105, "contention": 10}
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index e8cd416e..c97333c7 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -17,6 +17,9 @@ description: |
 config:
   main: "eval.py"
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"m": 64, "n": 64, "k": 64, "seed": 53124}
   - {"m": 128, "n": 128, "k": 128, "seed": 3321}
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index 8762ee34..163b4f72 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -22,6 +22,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 1023, "seed": 4242}
   - {"size": 1024, "seed": 5236}
@@ -47,4 +50,4 @@ benchmarks:
   - {"size": 134217728, "seed": 80246} # fits on T4
   - {"size": 268435456, "seed": 91357}
   # - {"size": 536870912, "seed": 102468}
-  # - {"size": 1073741824, "seed": 113579}
\ No newline at end of file
+  # - {"size": 1073741824, "seed": 113579}
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 659191b2..5ba9c318 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -20,6 +20,8 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
 tests:
   - {"size": 1023, "seed": 4242}
   - {"size": 1024, "seed": 5236}
@@ -32,4 +34,4 @@ benchmarks:
   - {"size": 500000, "seed": 93246}
   - {"size": 1000000, "seed": 6256}
   - {"size": 10000000, "seed": 8841}
-  - {"size": 100000000, "seed": 6252} 
\ No newline at end of file
+  - {"size": 100000000, "seed": 6252} 
diff --git a/problems/pmpp/template.py b/problems/pmpp/template.py
new file mode 100644
index 00000000..4aec6a6c
--- /dev/null
+++ b/problems/pmpp/template.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    pass
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 68f72ea3..546fb36e 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -19,6 +19,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 127, "seed": 4242}
   - {"size": 128, "seed": 5236}
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index e23ce33d..d0fa91c6 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -18,6 +18,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 1023, "seed": 4242}
   - {"size": 1024, "seed": 5236}

From 855df3dbcd0ea3d2372eb45ecf9deefabc1bc6d1 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:44:34 +0200
Subject: [PATCH 029/132] fixup

---
 problems/pmpp/conv2d_py/{ => solutions}/correct/ref.py            | 0
 problems/pmpp/conv2d_py/{ => solutions}/wrong/empty.py            | 0
 problems/pmpp/histogram_py/{ => solutions}/correct/ref.py         | 0
 problems/pmpp/histogram_py/{ => solutions}/wrong/empty.py         | 0
 problems/pmpp/matmul_py/{ => solutions}/correct/ref.py            | 0
 problems/pmpp/matmul_py/{ => solutions}/wrong/low-precision.py    | 0
 problems/pmpp/prefixsum_py/{ => solutions}/correct/ref.py         | 0
 problems/pmpp/prefixsum_py/{ => solutions}/wrong/empty.py         | 0
 .../{ => solutions/correct}/submission_cuda_inline.py             | 0
 .../vectoradd_py/{ => solutions/correct}/submission_triton.py     | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename problems/pmpp/conv2d_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/conv2d_py/{ => solutions}/wrong/empty.py (100%)
 rename problems/pmpp/histogram_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/histogram_py/{ => solutions}/wrong/empty.py (100%)
 rename problems/pmpp/matmul_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/matmul_py/{ => solutions}/wrong/low-precision.py (100%)
 rename problems/pmpp/prefixsum_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/prefixsum_py/{ => solutions}/wrong/empty.py (100%)
 rename problems/pmpp/vectoradd_py/{ => solutions/correct}/submission_cuda_inline.py (100%)
 rename problems/pmpp/vectoradd_py/{ => solutions/correct}/submission_triton.py (100%)

diff --git a/problems/pmpp/conv2d_py/correct/ref.py b/problems/pmpp/conv2d_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/conv2d_py/correct/ref.py
rename to problems/pmpp/conv2d_py/solutions/correct/ref.py
diff --git a/problems/pmpp/conv2d_py/wrong/empty.py b/problems/pmpp/conv2d_py/solutions/wrong/empty.py
similarity index 100%
rename from problems/pmpp/conv2d_py/wrong/empty.py
rename to problems/pmpp/conv2d_py/solutions/wrong/empty.py
diff --git a/problems/pmpp/histogram_py/correct/ref.py b/problems/pmpp/histogram_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/histogram_py/correct/ref.py
rename to problems/pmpp/histogram_py/solutions/correct/ref.py
diff --git a/problems/pmpp/histogram_py/wrong/empty.py b/problems/pmpp/histogram_py/solutions/wrong/empty.py
similarity index 100%
rename from problems/pmpp/histogram_py/wrong/empty.py
rename to problems/pmpp/histogram_py/solutions/wrong/empty.py
diff --git a/problems/pmpp/matmul_py/correct/ref.py b/problems/pmpp/matmul_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/matmul_py/correct/ref.py
rename to problems/pmpp/matmul_py/solutions/correct/ref.py
diff --git a/problems/pmpp/matmul_py/wrong/low-precision.py b/problems/pmpp/matmul_py/solutions/wrong/low-precision.py
similarity index 100%
rename from problems/pmpp/matmul_py/wrong/low-precision.py
rename to problems/pmpp/matmul_py/solutions/wrong/low-precision.py
diff --git a/problems/pmpp/prefixsum_py/correct/ref.py b/problems/pmpp/prefixsum_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/prefixsum_py/correct/ref.py
rename to problems/pmpp/prefixsum_py/solutions/correct/ref.py
diff --git a/problems/pmpp/prefixsum_py/wrong/empty.py b/problems/pmpp/prefixsum_py/solutions/wrong/empty.py
similarity index 100%
rename from problems/pmpp/prefixsum_py/wrong/empty.py
rename to problems/pmpp/prefixsum_py/solutions/wrong/empty.py
diff --git a/problems/pmpp/vectoradd_py/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
similarity index 100%
rename from problems/pmpp/vectoradd_py/submission_cuda_inline.py
rename to problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
diff --git a/problems/pmpp/vectoradd_py/submission_triton.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py
similarity index 100%
rename from problems/pmpp/vectoradd_py/submission_triton.py
rename to problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py

From 4f42b2941c4ceb482f72eac43f79c485ac4ea663 Mon Sep 17 00:00:00 2001
From: alexzhang13 <alex.lx.zhang@gmail.com>
Date: Mon, 3 Mar 2025 17:01:49 -0500
Subject: [PATCH 030/132] Update README.md with Docs Links

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 19d5463b..5f9e4059 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 ## Reference Kernels
 
-This repo holds reference kernels for the KernelBot which hosts regular competitions on discord.gg/gpumode
+This repo holds reference kernels for the KernelBot, which hosts regular competitions on [discord.gg/gpumode](https://discord.gg/gpumode).
 
 ## Competition
-1. PMPP practice problems: Starting on Sunday Feb 21, 2025
+1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
 2. LLM competition: Coming soon!
 
-## Making a submission
+## Making a Leaderboard Submission
 
 Please take a look at `vectoradd_py` to see multiple examples of expected submisisons ranging from PyTorch code to Triton to inline CUDA.
 

From 3d8dcec46dcba1694650e3ccbe4d78a9f7ac2f28 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 9 Apr 2025 14:27:55 +0300
Subject: [PATCH 031/132] Update submission_cuda_inline.py

---
 .../correct/submission_cuda_inline.py         | 49 ++-----------------
 1 file changed, 5 insertions(+), 44 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
index abdd10f3..138e623a 100644
--- a/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
+++ b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -51,8 +51,6 @@
 torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B);
 """
 
-
-
 add_module = load_inline(
     name='add_cuda',
     cpp_sources=add_cpp_source,
@@ -68,11 +66,11 @@ def add(A, B):
 
 def custom_kernel(data: input_t) -> output_t:
     """
-    Custom implementation of vector addition using CUDA inline function.
+    Custom implementation of vector addition using CUDA.
     Args:
         inputs: List of pairs of tensors [A, B] to be added.
     Returns:
-        List of tensors containing element-wise sums.
+        Tensor containing element-wise sum.
     """
     A, B = data
 
@@ -80,43 +78,6 @@ def custom_kernel(data: input_t) -> output_t:
     assert A.shape == B.shape, "Input tensors must have the same shape"
     assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
     
-    M, N = A.shape
-    C = torch.empty_like(A)
-    
-    n_threads = 256
-    n_blocks = (M * N + n_threads - 1) // n_threads
-    
-    cuda_source = """
-    extern "C" __global__ void add_kernel(
-        const half* __restrict__ A,
-        const half* __restrict__ B,
-        half* __restrict__ C,
-        const int n_elements
-    ) {
-        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < n_elements) {
-            C[idx] = __hadd(A[idx], B[idx]);
-        }
-    }
-    """
-    
-    module = torch.utils.cpp_extension.load_inline(
-        name=f"add_kernel_{M}_{N}",
-        cpp_sources="",
-        cuda_sources=cuda_source,
-        functions=["add_kernel"],
-        with_cuda=True,
-        extra_cuda_cflags=["-arch=sm_70"],  # Adjust based on your GPU architecture
-    )
-    
-    module.add_kernel(
-        cuda_stream=torch.cuda.current_stream(),
-        args=[
-            A.reshape(-1), B.reshape(-1), C.reshape(-1),
-            M * N,
-        ],
-        blocks=n_blocks,
-        threads=n_threads,
-    )
-    
-    return C
+    # Simply reuse the existing add function we already defined
+    # This avoids the compilation issues with the inline kernel
+    return add(A, B)

From 066d45a683ea55889150aa18d72765f6fb7c204e Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 16:27:37 +0200
Subject: [PATCH 032/132] identity task for AMD competition

---
 problems/amd.yaml                   |  12 +
 problems/amd/eval.py                | 333 ++++++++++++++++++++++++++++
 problems/amd/identity/reference.py  |  20 ++
 problems/amd/identity/submission.py |   9 +
 problems/amd/identity/task.py       |  10 +
 problems/amd/identity/task.yml      |  42 ++++
 problems/amd/identity/template.py   |  14 ++
 problems/amd/utils.py               |  94 ++++++++
 8 files changed, 534 insertions(+)
 create mode 100644 problems/amd.yaml
 create mode 100644 problems/amd/eval.py
 create mode 100644 problems/amd/identity/reference.py
 create mode 100644 problems/amd/identity/submission.py
 create mode 100644 problems/amd/identity/task.py
 create mode 100644 problems/amd/identity/task.yml
 create mode 100644 problems/amd/identity/template.py
 create mode 100644 problems/amd/utils.py

diff --git a/problems/amd.yaml b/problems/amd.yaml
new file mode 100644
index 00000000..f81b148b
--- /dev/null
+++ b/problems/amd.yaml
@@ -0,0 +1,12 @@
+name: PMPP Practice Problems
+# when does this end (individual problems might close earlier)
+deadline: "2025-05-27"
+# A description for this particular competition
+description: "AMD Developer Challenge 2025: Inference Sprint"
+# the list of problems
+problems:
+  - directory: amd/identity
+    name: identity
+    deadline: "2025-06-08"
+    gpus:
+      - MI300
diff --git a/problems/amd/eval.py b/problems/amd/eval.py
new file mode 100644
index 00000000..7b83d750
--- /dev/null
+++ b/problems/amd/eval.py
@@ -0,0 +1,333 @@
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+        os.set_inheritable(fd, False)
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a+b)*(a+b+1)//2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    """
+    Parse `file_name` into test cases: one per line, written as `key: value` pairs separated by `;`.
+    A given `seed` is combined into each case's own "seed"; exits with status 113 on unreadable or malformed input.
+    """
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file `{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)
+
+    tests = []
+    pattern = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            matched = re.fullmatch(pattern, part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)  # integer-looking values become ints, anything else stays a string
+            except ValueError:
+                pass
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a non-empty list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = sum(durations) / runs
+    variance = sum((x - avg)**2 for x in durations)
+    # sample standard deviation (divides by runs - 1); a single run has no spread to estimate, so report 0
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data))
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        error = run_single_test(pool, test)
+        if error:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", error)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    error = check_implementation(check_copy, output)
+    if error:
+        return error
+
+    # now, do multiple timing runs without further correctness testing
+    # there is an upper bound of 100 runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until we either measure at least 10 full seconds,
+    # or the relative error of the mean is below 1%.
+
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel(data)
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            error = check_implementation(check_copy, output)
+            if error:
+                return error
+
+        del output
+        durations.append(end-start)
+
+        if i > 1:
+            stats = calculate_stats(durations)
+            if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def main():
+    """
+    Entry point for `eval.py <mode> <test-file>`; results are reported through the POPCORN_FD file descriptor.
+    Returns the exit status: 0 on success, 112 on a failed check, 111 without POPCORN_FD, 2 on bad usage.
+    """
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        with multiprocessing.get_context('spawn').Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+            if mode != "leaderboard":
+                # TODO: Implement script and profile mode
+                return 2
+            # warmup
+            run_single_benchmark(pool, tests[0], False, 100, 1e7)
+            logger.log("benchmark-count", len(tests))
+            passed = True
+            for i, test in enumerate(tests):
+                result = run_single_benchmark(pool, test, True, 100, 30e9)
+                logger.log(f"benchmark.{i}.spec", test.spec)
+                if isinstance(result, Stats):
+                    for field in dataclasses.fields(Stats):
+                        logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                else:
+                    passed = False
+                    logger.log(f"benchmark.{i}.status", "fail")
+                    logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                    break
+
+            logger.log("check", "pass" if passed else "fail")
+            return 0 if passed else 112  # mirror the exit status used by the test and benchmark modes
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/amd/identity/reference.py b/problems/amd/identity/reference.py
new file mode 100644
index 00000000..1d064ae5
--- /dev/null
+++ b/problems/amd/identity/reference.py
@@ -0,0 +1,20 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    data = torch.empty(size, device='cuda', dtype=torch.float16)
+    data.uniform_(0, 1, generator=gen)
+    return data, torch.empty_like(data)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    input, output = data
+    output[...] = input
+    return output
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/amd/identity/submission.py b/problems/amd/identity/submission.py
new file mode 100644
index 00000000..81ed48d3
--- /dev/null
+++ b/problems/amd/identity/submission.py
@@ -0,0 +1,9 @@
+#!POPCORN leaderboard amd-identity
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input, output = data
+    output[...] = input
+    return output
diff --git a/problems/amd/identity/task.py b/problems/amd/identity/task.py
new file mode 100644
index 00000000..d9af537f
--- /dev/null
+++ b/problems/amd/identity/task.py
@@ -0,0 +1,10 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
diff --git a/problems/amd/identity/task.yml b/problems/amd/identity/task.yml
new file mode 100644
index 00000000..476a09c9
--- /dev/null
+++ b/problems/amd/identity/task.yml
@@ -0,0 +1,42 @@
+name: "amd-identity"
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  This task is purely for testing the submission system. There will be *no* points.
+
+  Input: (input_tensor, output_tensor)
+    - input_tensor: Input data
+    - output_tensor: Pre-allocated empty tensor of the same shape as `input_tensor`
+  Output: Should return `output_tensor` after it has been filled with the values from `input_tensor`.
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252}
+  - {"size": 32768, "seed": 52624}
+  - {"size": 65536, "seed": 125432}
+
+ranking_by: "geom"
diff --git a/problems/amd/identity/template.py b/problems/amd/identity/template.py
new file mode 100644
index 00000000..e5abde3a
--- /dev/null
+++ b/problems/amd/identity/template.py
@@ -0,0 +1,14 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Copies the contents of `input` into `output`
+    Args:
+        data: tuple of (input, output) tensors
+
+    Returns: output tensor
+    """
+    input, output = data
+    # implement processing
+    return output
diff --git a/problems/amd/utils.py b/problems/amd/utils.py
new file mode 100644
index 00000000..1617be82
--- /dev/null
+++ b/problems/amd/utils.py
@@ -0,0 +1,94 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+def verbose_allclose(
+        tensor1: torch.Tensor,
+        tensor2: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    tensor1 (torch.Tensor): First tensor to compare.
+    tensor2 (torch.Tensor): Second tensor to compare.
+    rtol (float): Relative tolerance.
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Raises:
+    AssertionError: If the tensors are not all close within the given tolerance.
+    """
+    # Check if the shapes of the tensors match
+    if tensor1.shape != tensor2.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(tensor1 - tensor2)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(tensor2)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.sum().item()
+
+    # Check if all elements are close
+    all_close = num_mismatched == 0
+
+    # Raise AssertionError with detailed information if there are mismatches
+    if not all_close and num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
+                            f"Mismatched elements: {mismatched_indices}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []

From b51424720a89c4ec78474fd4b8a8a85ba0b72a2c Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:39:15 +0200
Subject: [PATCH 033/132] rename

---
 problems/amd.yaml                                     | 6 +++---
 problems/amd/{identity => amd-identity}/reference.py  | 0
 problems/amd/{identity => amd-identity}/submission.py | 0
 problems/amd/{identity => amd-identity}/task.py       | 0
 problems/amd/{identity => amd-identity}/task.yml      | 0
 problems/amd/{identity => amd-identity}/template.py   | 0
 6 files changed, 3 insertions(+), 3 deletions(-)
 rename problems/amd/{identity => amd-identity}/reference.py (100%)
 rename problems/amd/{identity => amd-identity}/submission.py (100%)
 rename problems/amd/{identity => amd-identity}/task.py (100%)
 rename problems/amd/{identity => amd-identity}/task.yml (100%)
 rename problems/amd/{identity => amd-identity}/template.py (100%)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index f81b148b..aedbd0ec 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -1,12 +1,12 @@
-name: PMPP Practice Problems
+name: AMD Developer Challenge 2025
 # when does this end (individual problems might close earlier)
 deadline: "2025-05-27"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Inference Sprint"
 # the list of problems
 problems:
-  - directory: amd/identity
-    name: identity
+  - directory: amd/amd-identity
+    name: amd-identity
     deadline: "2025-06-08"
     gpus:
       - MI300
diff --git a/problems/amd/identity/reference.py b/problems/amd/amd-identity/reference.py
similarity index 100%
rename from problems/amd/identity/reference.py
rename to problems/amd/amd-identity/reference.py
diff --git a/problems/amd/identity/submission.py b/problems/amd/amd-identity/submission.py
similarity index 100%
rename from problems/amd/identity/submission.py
rename to problems/amd/amd-identity/submission.py
diff --git a/problems/amd/identity/task.py b/problems/amd/amd-identity/task.py
similarity index 100%
rename from problems/amd/identity/task.py
rename to problems/amd/amd-identity/task.py
diff --git a/problems/amd/identity/task.yml b/problems/amd/amd-identity/task.yml
similarity index 100%
rename from problems/amd/identity/task.yml
rename to problems/amd/amd-identity/task.yml
diff --git a/problems/amd/identity/template.py b/problems/amd/amd-identity/template.py
similarity index 100%
rename from problems/amd/identity/template.py
rename to problems/amd/amd-identity/template.py

From 294a335425e628fa538150674da5c8c1e1309f78 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:43:02 +0200
Subject: [PATCH 034/132] names aren't supported

---
 problems/amd/amd-identity/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/amd-identity/task.yml b/problems/amd/amd-identity/task.yml
index 476a09c9..f9e4eca7 100644
--- a/problems/amd/amd-identity/task.yml
+++ b/problems/amd/amd-identity/task.yml
@@ -1,4 +1,4 @@
-name: "amd-identity"
+# name: "amd-identity"
 
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}

From 9a9dba8d73b5766212fea488b674690ac15a9d1f Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:02:53 +0200
Subject: [PATCH 035/132] fixups

---
 problems/amd/amd-identity/task.yml |  8 +--
 problems/amd/utils.py              | 88 +++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/problems/amd/amd-identity/task.yml b/problems/amd/amd-identity/task.yml
index f9e4eca7..a62ce986 100644
--- a/problems/amd/amd-identity/task.yml
+++ b/problems/amd/amd-identity/task.yml
@@ -12,10 +12,10 @@ lang: "py"
 description: |
   This task is purely for testing the submission system. There will be *no* points.
 
-  Input: (input_tensor, output_tensor)
-    - input_tensor: Input data
-    - output_tensor: Pre-allocated empty tensor of the same shape as `input_tensor`
-  Output: Should return `output_tensor` after it has been filled with the values from `input_tensor`.
+  > Input: (input_tensor, output_tensor)
+  >  - input_tensor: Input data
+  >  - output_tensor: Pre-allocated empty tensor of the same shape as `input_tensor`
+  > Output: Should return `output_tensor` after it has been filled with the values from `input_tensor`.
 
 config:
   main: "eval.py"
diff --git a/problems/amd/utils.py b/problems/amd/utils.py
index 1617be82..c3eb2447 100644
--- a/problems/amd/utils.py
+++ b/problems/amd/utils.py
@@ -23,10 +23,12 @@ def get_device(use_cuda: bool = True) -> torch.device:
             print("No compatible GPU found. Falling back to CPU.")
     return torch.device("cpu")
 
+
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
 def verbose_allclose(
-        tensor1: torch.Tensor,
-        tensor2: torch.Tensor,
+        received: torch.Tensor,
+        expected: torch.Tensor,
         rtol=1e-05,
         atol=1e-08,
         max_print=5
@@ -35,9 +37,9 @@ def verbose_allclose(
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
 
     Parameters:
-    tensor1 (torch.Tensor): First tensor to compare.
-    tensor2 (torch.Tensor): Second tensor to compare.
-    rtol (float): Relative tolerance.
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
     atol (float): Absolute tolerance.
     max_print (int): Maximum number of mismatched elements to print.
 
@@ -45,25 +47,25 @@ def verbose_allclose(
     AssertionError: If the tensors are not all close within the given tolerance.
     """
     # Check if the shapes of the tensors match
-    if tensor1.shape != tensor2.shape:
+    if received.shape != expected.shape:
         return ["SIZE MISMATCH"]
 
     # Calculate the difference between the tensors
-    diff = torch.abs(tensor1 - tensor2)
+    diff = torch.abs(received - expected)
 
     # Determine the tolerance
-    tolerance = atol + rtol * torch.abs(tensor2)
+    tolerance = atol + rtol * torch.abs(expected)
 
     # Find tolerance mismatched elements
     tol_mismatched = diff > tolerance
 
     # Find nan mismatched elements
-    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -74,21 +76,69 @@ def verbose_allclose(
     mismatched_indices = torch.nonzero(mismatched)
 
     # Count the number of mismatched elements
-    num_mismatched = mismatched.sum().item()
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
 
-    # Check if all elements are close
-    all_close = num_mismatched == 0
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
 
-    # Raise AssertionError with detailed information if there are mismatches
-    if not all_close and num_mismatched >= 1:
-        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
-                            f"Mismatched elements: {mismatched_indices}"]
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
         return mismatch_details
 
     return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped

From 6dbc7375ad8fc1b1bd7745e08d65f3f67be620aa Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 15 Apr 2025 12:11:25 -0400
Subject: [PATCH 036/132] fp8 matmul

---
 problems/amd/fp8_matmul/.DS_Store     | Bin 0 -> 6148 bytes
 problems/amd/fp8_matmul/reference.py  |  88 ++++++++++++++++++++++++++
 problems/amd/fp8_matmul/submission.py |  56 ++++++++++++++++
 problems/amd/fp8_matmul/task.py       |  11 ++++
 problems/amd/fp8_matmul/task.yml      |  78 +++++++++++++++++++++++
 5 files changed, 233 insertions(+)
 create mode 100644 problems/amd/fp8_matmul/.DS_Store
 create mode 100644 problems/amd/fp8_matmul/reference.py
 create mode 100644 problems/amd/fp8_matmul/submission.py
 create mode 100644 problems/amd/fp8_matmul/task.py
 create mode 100644 problems/amd/fp8_matmul/task.yml

diff --git a/problems/amd/fp8_matmul/.DS_Store b/problems/amd/fp8_matmul/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..02f8f799ee473f80d1b192895a924e0fbecd9496
GIT binary patch
literal 6148
zcmeHKyH3MU47E!UL4ec+i3RC?K_xcEP?dq14**&MLIi>+9lB-bWB5ckeyD1bbYMZK
zvMc-CICiXiZWPx<#H0EB8PR}<GN@p0AEra(zGzE&?%4&hKI4%Jdcp68-doY(_>T<8
z*&WjZz0o;NZT0*Lcnh3kL9aB09U^9NRaVnUSxu0ISB5`<!*3Pc+@Z$*KJSQTG^YvP
zoF}YiuobFyRHUy)#Xb6<J7hd}Mqf0dvz6!ts+EoXoHpWhQ(=OEU?3RSItI|QMY03K
zu)#nu5DY9Ckncl+3g(Wrp?*4GbOZqQ;a0&~W(mnjj=5uPh&vFrRG_7@j~Hy}@F$PU
z9cx2NC-&iky)yfR!de~APZ~~~8-@)Af`N4gcI<K>{r__F{J)+QSHVCquvH9je|$Y2
zVN2dyTbq+!o1nK)5s9k}mnm3iD@Lre;uEL}{7DwT+_5%<2NHh-l!h?Dz@IYk1DE_(
AYXATM

literal 0
HcmV?d00001

diff --git a/problems/amd/fp8_matmul/reference.py b/problems/amd/fp8_matmul/reference.py
new file mode 100644
index 00000000..e82fe32a
--- /dev/null
+++ b/problems/amd/fp8_matmul/reference.py
@@ -0,0 +1,88 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+
+block_shape = (128, 128)
+
+def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
+    """
+    Generate random input and weights for Blockwise W8A8 Matmul scaled to FP32.
+    
+    Returns:
+        Tuple of (
+            a: torch.Tensor[float8_e4m3fnuz] of shape [m, k], 
+            b: torch.Tensor[float8_e4m3fnuz] of shape [n, k], 
+            a_scale: torch.Tensor[float32] of shape [m, ceil(k / 128)],
+            b_scale: torch.Tensor[float32] of shape [ceil(n / 128), ceil(k / 128)],
+            c: torch.Tensor[bfloat16] of shape [m, n]
+        )
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    block_shape_n, block_shape_k = block_shape
+    scale_n =  (n + block_shape_n - 1) // block_shape_n
+    scale_k =  (k + block_shape_k - 1) // block_shape_k
+
+    # Generate random inputs with FP8 quantization
+    a = (torch.randn((k, m), dtype=torch.bfloat16, device="cuda", generator=gen)).to(torch.float8_e4m3fnuz)
+    b = (torch.randn((k, n), dtype=torch.bfloat16, device="cuda", generator=gen)).to(torch.float8_e4m3fnuz)
+
+    # Generate scaling factors with FP32
+    a_scale = torch.randn([scale_k, m], dtype=torch.float32, device="cuda", generator=gen)
+    b_scale = torch.randn([scale_k, scale_n], dtype=torch.float32, device="cuda", generator=gen)
+
+
+    c = torch.zeros((m, n), dtype=torch.bfloat16, device="cuda")
+    return (a.T, b.T, a_scale.T, b_scale.T, c)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Highly inefficient torch reference implementation of FP8 GEMM.
+    You can use this as a reference / starting template for your implementation.
+    """
+    # c: [m, n] is pre-allocated memory to help remove allocation overhead.
+    a, b, a_scale, b_scale, c = data
+
+    # a is M x K in column-major order, we convert here for simplicity.
+    a = a.contiguous()
+    a_scale = a_scale.contiguous()
+    b_scale = b_scale.contiguous()
+
+    # constants
+    m = a.shape[0]
+    n = b.shape[0]
+    k = a.shape[1]
+    block_shape_n = 128
+    block_shape_k = 128
+    scale_n = b_scale.shape[0]
+    scale_k = b_scale.shape[1]
+
+    # Apply blockwise scaling to input 'a'
+    a_scale = a_scale.unsqueeze(-1).repeat(1, 1, block_shape_k)  # Shape: [m, scale_k, block_shape_k]
+    a_scale = a_scale.reshape(m, scale_k * block_shape_k) 
+    a_scale = a_scale[:, :k]
+
+    # Dequantize 'a', in your implementation you should do this at the end.
+    a = a.to(a_scale.dtype) * a_scale 
+
+    # Apply blockwise scaling to input 'b'
+    b_scale = (
+        b_scale.view(-1, 1)
+        .repeat(1, block_shape_n * block_shape_k)
+        .view(scale_n, scale_k, block_shape_n, block_shape_k)
+        .permute(0, 2, 1, 3)  # Reorder dimensions: [scale_n, blk_n, scale_k, blk_k]
+        .reshape(scale_n * block_shape_n, scale_k * block_shape_k)
+    )
+    b_scale = b_scale[:n, :k]
+
+    # Dequantize 'b', in your implementation you should do this at the end.
+    b = b.to(b_scale.dtype) * b_scale 
+
+    # Compute FP8 GEMM and write to 'c'. 
+    c[...] = (a @ b.T).to(torch.bfloat16)
+    return c
+
+
+check_implementation = make_match_reference(ref_kernel)
\ No newline at end of file
diff --git a/problems/amd/fp8_matmul/submission.py b/problems/amd/fp8_matmul/submission.py
new file mode 100644
index 00000000..8a0bad6c
--- /dev/null
+++ b/problems/amd/fp8_matmul/submission.py
@@ -0,0 +1,56 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of block-scale fp8 gemm 
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float8_e4m3fnuz] of shape [m, k], 
+            b: torch.Tensor[float8_e4m3fnuz] of shape [n, k], 
+            a_scale: torch.Tensor[float32] of shape [m, k // 128], 
+            b_scale: torch.Tensor[float32] of shape [n // 128, k // 128], 
+            c: torch.Tensor[bfloat16] of shape [m, n]
+    Returns:
+        Tensor containing output in bf16
+    """
+    # c: [m, n] is pre-allocated memory to avoid timing allocation overhead.
+    a, b, a_scale, b_scale, c = data
+
+    # a is M x K in column-major order, we convert here for simplicity.
+    a = a.contiguous()
+    a_scale = a_scale.contiguous()
+    b_scale = b_scale.contiguous()
+
+    # constants
+    m = a.shape[0]
+    n = b.shape[0]
+    k = a.shape[1]
+    block_shape_n = 128
+    block_shape_k = 128
+    scale_n = b_scale.shape[0]
+    scale_k = b_scale.shape[1]
+
+    # Apply scaling to input 'a'
+    a_scale = a_scale.unsqueeze(-1).repeat(1, 1, block_shape_k)  # Shape: [m, scale_k, block_shape_k]
+    a_scale = a_scale.reshape(m, scale_k * block_shape_k) 
+    a_scale = a_scale[:, :k]
+
+    # Dequantize 'a', in your implementation you should do this at the end.
+    a = a.to(a_scale.dtype) * a_scale 
+
+    # Apply scaling to input 'b'
+    b_scale = (
+        b_scale.view(-1, 1)
+        .repeat(1, block_shape_n * block_shape_k)
+        .view(scale_n, scale_k, block_shape_n, block_shape_k)
+        .permute(0, 2, 1, 3)  # Reorder dimensions: [scale_n, blk_n, scale_k, blk_k]
+        .reshape(scale_n * block_shape_n, scale_k * block_shape_k)
+    )
+    b_scale = b_scale[:n, :k]
+
+    # Dequantize 'b', in your implementation you should do this at the end.
+    b = b.to(b_scale.dtype) * b_scale 
+
+    c[...] = (a @ b.T).to(torch.bfloat16)
+    return c
diff --git a/problems/amd/fp8_matmul/task.py b/problems/amd/fp8_matmul/task.py
new file mode 100644
index 00000000..78387cfd
--- /dev/null
+++ b/problems/amd/fp8_matmul/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/fp8_matmul/task.yml b/problems/amd/fp8_matmul/task.yml
new file mode 100644
index 00000000..b0e65111
--- /dev/null
+++ b/problems/amd/fp8_matmul/task.yml
@@ -0,0 +1,78 @@
+# name: fp8-matmul
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  You will implement a custom fp8-blockwise matmul kernel optimized for MI300.
+  You will be given single-precision scaling factors for your matrices.
+  The shapes of all outer and inner dimensions of tensors are from DeepSeek-R1.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b, a_scale, b_scale, c)
+  ```
+  where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
+  and `c` is the output matrix.
+  `a` is M x K in column-major order, and `b` is N x K in column-major order.
+  `a_scale` is M x K in column-major order, and `b_scale` is N x K in column-major order.
+  `c` is M x N in ROW-major order.
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand price, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand price.
+
+  The speed of light analysis is:
+   M       N       K        time[us]
+  1024    1536    7168      8.6331019
+  1024    4608    7168     25.8936898
+  6144    1536    7168     51.7775517
+  6144    4608    7168    155.2989590
+  1024    7168     256      3.1671426
+  6144    7168     256     17.2712935
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"m": 64, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 64, "n": 3072, "k": 1536, "seed": 42}
+  - {"m": 64, "n": 576, "k": 7168, "seed": 42}
+  - {"m": 96, "n": 7168, "k": 256, "seed": 42}
+  - {"m": 96, "n": 7168, "k": 2048, "seed": 42}
+  - {"m": 96, "n": 4608, "k": 7168, "seed": 42}
+  - {"m": 128, "n": 7168, "k": 2304, "seed": 42}
+  - {"m": 128, "n": 512, "k": 7168, "seed": 42}
+  - {"m": 512, "n": 4096, "k": 512, "seed": 42}
+  - {"m": 512, "n": 1536, "k": 7168, "seed": 42}
+
+benchmarks:
+  - {"m": 1024, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 3072, "k": 1536, "seed": 42}
+  - {"m": 1024, "n": 576, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 7168, "k": 256, "seed": 42}
+  - {"m": 1024, "n": 7168, "k": 2048, "seed": 42}
+  - {"m": 1024, "n": 4608, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 7168, "k": 2304, "seed": 42}
+  - {"m": 1024, "n": 512, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 4096, "k": 512, "seed": 42}
+  - {"m": 6144, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 3072, "k": 1536, "seed": 42}
+  - {"m": 6144, "n": 576, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 7168, "k": 256, "seed": 42}
+  - {"m": 6144, "n": 7168, "k": 2048, "seed": 42}
+  - {"m": 6144, "n": 4608, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 7168, "k": 2304, "seed": 42}
+  - {"m": 6144, "n": 512, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 4096, "k": 512, "seed": 42}
+
+ranking_by: "geom"

From ff21553e0d9e5617db1f99a9e55fbcb8f4a5f11f Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:18:33 +0200
Subject: [PATCH 037/132] set tolerance

---
 problems/amd/fp8_matmul/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/fp8_matmul/reference.py b/problems/amd/fp8_matmul/reference.py
index e82fe32a..f4570a77 100644
--- a/problems/amd/fp8_matmul/reference.py
+++ b/problems/amd/fp8_matmul/reference.py
@@ -85,4 +85,4 @@ def ref_kernel(data: input_t) -> output_t:
     return c
 
 
-check_implementation = make_match_reference(ref_kernel)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=1e-02, atol=1e-04)

From de7f0fc5e93c14876bc2c83373cf350401ea9820 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:24:38 +0200
Subject: [PATCH 038/132] rename

---
 problems/amd.yaml                                     | 6 ++++++
 problems/amd/{fp8_matmul => amd-fp8-mm}/reference.py  | 0
 problems/amd/{fp8_matmul => amd-fp8-mm}/submission.py | 0
 problems/amd/{fp8_matmul => amd-fp8-mm}/task.py       | 0
 problems/amd/{fp8_matmul => amd-fp8-mm}/task.yml      | 0
 5 files changed, 6 insertions(+)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/reference.py (100%)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/submission.py (100%)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/task.py (100%)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/task.yml (100%)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index aedbd0ec..e2cc9641 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -10,3 +10,9 @@ problems:
     deadline: "2025-06-08"
     gpus:
       - MI300
+
+  - directory: amd/amd-fp8-mm
+    name: amd-identity
+    deadline: "2025-05-27"
+    gpus:
+      - MI300
diff --git a/problems/amd/fp8_matmul/reference.py b/problems/amd/amd-fp8-mm/reference.py
similarity index 100%
rename from problems/amd/fp8_matmul/reference.py
rename to problems/amd/amd-fp8-mm/reference.py
diff --git a/problems/amd/fp8_matmul/submission.py b/problems/amd/amd-fp8-mm/submission.py
similarity index 100%
rename from problems/amd/fp8_matmul/submission.py
rename to problems/amd/amd-fp8-mm/submission.py
diff --git a/problems/amd/fp8_matmul/task.py b/problems/amd/amd-fp8-mm/task.py
similarity index 100%
rename from problems/amd/fp8_matmul/task.py
rename to problems/amd/amd-fp8-mm/task.py
diff --git a/problems/amd/fp8_matmul/task.yml b/problems/amd/amd-fp8-mm/task.yml
similarity index 100%
rename from problems/amd/fp8_matmul/task.yml
rename to problems/amd/amd-fp8-mm/task.yml

From d865af3004c64635622d2d9db3ae4324ad1adc0e Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:27:18 +0200
Subject: [PATCH 039/132] fix template

---
 problems/amd/amd-fp8-mm/task.yml    |  2 +-
 problems/amd/amd-fp8-mm/template.py | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 problems/amd/amd-fp8-mm/template.py

diff --git a/problems/amd/amd-fp8-mm/task.yml b/problems/amd/amd-fp8-mm/task.yml
index b0e65111..ca894e9b 100644
--- a/problems/amd/amd-fp8-mm/task.yml
+++ b/problems/amd/amd-fp8-mm/task.yml
@@ -41,7 +41,7 @@ config:
   main: "eval.py"
 
 templates:
-  Python: "../template.py"
+  Python: "template.py"
 
 tests:
   - {"m": 64, "n": 1536, "k": 7168, "seed": 42}
diff --git a/problems/amd/amd-fp8-mm/template.py b/problems/amd/amd-fp8-mm/template.py
new file mode 100644
index 00000000..a728c8a2
--- /dev/null
+++ b/problems/amd/amd-fp8-mm/template.py
@@ -0,0 +1,22 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Custom implementation of block-scale fp8 gemm
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float8_e4m3fnuz] of shape [m, k],
+            b: torch.Tensor[float8_e4m3fnuz] of shape [n, k],
+            a_scale: torch.Tensor[float32] of shape [m, k // 128],
+            b_scale: torch.Tensor[float32] of shape [n // 128, k // 128],
+            c: torch.Tensor[bfloat16] of shape [m, n]
+    Returns:
+        Tensor containing output in bf16
+    """
+    # c: [m, n] is pre-allocated memory to avoid timing allocation overhead.
+    a, b, a_scale, b_scale, c = data
+
+    # Your implementation here
+
+    return c

From 6d381bbcce547ba4829a0c8828023b03aff33616 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:40:31 +0200
Subject: [PATCH 040/132] fixes

---
 problems/amd.yaml                             |   7 ++---
 .../amd/{amd-fp8-mm => fp8-mm}/reference.py   |   0
 .../amd/{amd-fp8-mm => fp8-mm}/submission.py  |   0
 problems/amd/{amd-fp8-mm => fp8-mm}/task.py   |   0
 problems/amd/{amd-fp8-mm => fp8-mm}/task.yml  |  28 ++++++++++--------
 .../amd/{amd-fp8-mm => fp8-mm}/template.py    |   0
 problems/amd/fp8_matmul/.DS_Store             | Bin 6148 -> 0 bytes
 .../{amd-identity => identity}/reference.py   |   0
 .../{amd-identity => identity}/submission.py  |   2 +-
 .../amd/{amd-identity => identity}/task.py    |   0
 .../amd/{amd-identity => identity}/task.yml   |   2 +-
 .../{amd-identity => identity}/template.py    |   0
 12 files changed, 21 insertions(+), 18 deletions(-)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/reference.py (100%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/submission.py (100%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/task.py (100%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/task.yml (82%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/template.py (100%)
 delete mode 100644 problems/amd/fp8_matmul/.DS_Store
 rename problems/amd/{amd-identity => identity}/reference.py (100%)
 rename problems/amd/{amd-identity => identity}/submission.py (82%)
 rename problems/amd/{amd-identity => identity}/task.py (100%)
 rename problems/amd/{amd-identity => identity}/task.yml (98%)
 rename problems/amd/{amd-identity => identity}/template.py (100%)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index e2cc9641..20a1d449 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -5,14 +5,13 @@ deadline: "2025-05-27"
 description: "AMD Developer Challenge 2025: Inference Sprint"
 # the list of problems
 problems:
-  - directory: amd/amd-identity
+  - directory: amd/identity
     name: amd-identity
     deadline: "2025-06-08"
     gpus:
       - MI300
-
-  - directory: amd/amd-fp8-mm
-    name: amd-identity
+  - directory: amd/fp8-mm
+    name: amd-fp8-mm
     deadline: "2025-05-27"
     gpus:
       - MI300
diff --git a/problems/amd/amd-fp8-mm/reference.py b/problems/amd/fp8-mm/reference.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/reference.py
rename to problems/amd/fp8-mm/reference.py
diff --git a/problems/amd/amd-fp8-mm/submission.py b/problems/amd/fp8-mm/submission.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/submission.py
rename to problems/amd/fp8-mm/submission.py
diff --git a/problems/amd/amd-fp8-mm/task.py b/problems/amd/fp8-mm/task.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/task.py
rename to problems/amd/fp8-mm/task.py
diff --git a/problems/amd/amd-fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
similarity index 82%
rename from problems/amd/amd-fp8-mm/task.yml
rename to problems/amd/fp8-mm/task.yml
index ca894e9b..81e098e1 100644
--- a/problems/amd/amd-fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -10,6 +10,7 @@ files:
 lang: "py"
 
 description: |
+  
   You will implement a custom fp8-blockwise matmul kernel optimized for MI300.
   You will be given single-precision scaling factors for your matrices.
   The shapes of all outer and inner dimensions of tensors are from DeepSeek-R1.
@@ -18,24 +19,27 @@ description: |
   (a, b, a_scale, b_scale, c)
   ```
   where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
-  and `c` is the output matrix.
-  `a` is M x K in column-major order, and `b` is N x K in column-major order.
-  `a_scale` is M x K in column-major order, and `b_scale` is N x K in column-major order.
-  `c` is M x N in ROW-major order.
+  and `c` is the output matrix:
+  * `a` is M x K in column-major order in e4m3fnuz
+  * `b` is N x K in column-major order in e4m3fnuz
+  * `a_scale` is M x K in column-major order in fp32
+  * `b_scale` is N x K in column-major order in fp32
+  * `c` is M x N in ROW-major order in bf16
 
   The ranking criteria is the geometric mean of the benchmark results.
 
   For the grand price, your kernel will be evaluated against the speed of light analysis
   and the solution closest to the speed of light will be awarded the grand price.
-
+  ```
   The speed of light analysis is:
-   M       N       K        time[us]
-  1024    1536    7168      8.6331019
-  1024    4608    7168     25.8936898
-  6144    1536    7168     51.7775517
-  6144    4608    7168    155.2989590
-  1024    7168     256      3.1671426
-  6144    7168     256     17.2712935
+   M       N       K     time[us]
+  1024    1536    7168      8.63
+  1024    4608    7168     25.89
+  6144    1536    7168     51.78
+  6144    4608    7168    155.30
+  1024    7168     256      3.17
+  6144    7168     256     17.27
+  ```
 
 config:
   main: "eval.py"
diff --git a/problems/amd/amd-fp8-mm/template.py b/problems/amd/fp8-mm/template.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/template.py
rename to problems/amd/fp8-mm/template.py
diff --git a/problems/amd/fp8_matmul/.DS_Store b/problems/amd/fp8_matmul/.DS_Store
deleted file mode 100644
index 02f8f799ee473f80d1b192895a924e0fbecd9496..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKyH3MU47E!UL4ec+i3RC?K_xcEP?dq14**&MLIi>+9lB-bWB5ckeyD1bbYMZK
zvMc-CICiXiZWPx<#H0EB8PR}<GN@p0AEra(zGzE&?%4&hKI4%Jdcp68-doY(_>T<8
z*&WjZz0o;NZT0*Lcnh3kL9aB09U^9NRaVnUSxu0ISB5`<!*3Pc+@Z$*KJSQTG^YvP
zoF}YiuobFyRHUy)#Xb6<J7hd}Mqf0dvz6!ts+EoXoHpWhQ(=OEU?3RSItI|QMY03K
zu)#nu5DY9Ckncl+3g(Wrp?*4GbOZqQ;a0&~W(mnjj=5uPh&vFrRG_7@j~Hy}@F$PU
z9cx2NC-&iky)yfR!de~APZ~~~8-@)Af`N4gcI<K>{r__F{J)+QSHVCquvH9je|$Y2
zVN2dyTbq+!o1nK)5s9k}mnm3iD@Lre;uEL}{7DwT+_5%<2NHh-l!h?Dz@IYk1DE_(
AYXATM

diff --git a/problems/amd/amd-identity/reference.py b/problems/amd/identity/reference.py
similarity index 100%
rename from problems/amd/amd-identity/reference.py
rename to problems/amd/identity/reference.py
diff --git a/problems/amd/amd-identity/submission.py b/problems/amd/identity/submission.py
similarity index 82%
rename from problems/amd/amd-identity/submission.py
rename to problems/amd/identity/submission.py
index 81ed48d3..c07dd375 100644
--- a/problems/amd/amd-identity/submission.py
+++ b/problems/amd/identity/submission.py
@@ -1,4 +1,4 @@
-#!POPCORN leaderboard amd-identity
+#!POPCORN leaderboard identity
 from task import input_t, output_t
 import torch
 
diff --git a/problems/amd/amd-identity/task.py b/problems/amd/identity/task.py
similarity index 100%
rename from problems/amd/amd-identity/task.py
rename to problems/amd/identity/task.py
diff --git a/problems/amd/amd-identity/task.yml b/problems/amd/identity/task.yml
similarity index 98%
rename from problems/amd/amd-identity/task.yml
rename to problems/amd/identity/task.yml
index a62ce986..f3b3e361 100644
--- a/problems/amd/amd-identity/task.yml
+++ b/problems/amd/identity/task.yml
@@ -1,4 +1,4 @@
-# name: "amd-identity"
+# name: "identity"
 
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
diff --git a/problems/amd/amd-identity/template.py b/problems/amd/identity/template.py
similarity index 100%
rename from problems/amd/amd-identity/template.py
rename to problems/amd/identity/template.py

From 8fa795353cacea896af217da0651c246fe0cfe70 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 23:25:02 +0200
Subject: [PATCH 041/132] "randomize" seeds

---
 problems/amd/fp8-mm/task.yml | 56 ++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/problems/amd/fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
index 81e098e1..43f912a7 100644
--- a/problems/amd/fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -48,35 +48,35 @@ templates:
   Python: "template.py"
 
 tests:
-  - {"m": 64, "n": 1536, "k": 7168, "seed": 42}
-  - {"m": 64, "n": 3072, "k": 1536, "seed": 42}
-  - {"m": 64, "n": 576, "k": 7168, "seed": 42}
-  - {"m": 96, "n": 7168, "k": 256, "seed": 42}
-  - {"m": 96, "n": 7168, "k": 2048, "seed": 42}
-  - {"m": 96, "n": 4608, "k": 7168, "seed": 42}
-  - {"m": 128, "n": 7168, "k": 2304, "seed": 42}
-  - {"m": 128, "n": 512, "k": 7168, "seed": 42}
-  - {"m": 512, "n": 4096, "k": 512, "seed": 42}
-  - {"m": 512, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 64, "n": 1536, "k": 7168, "seed": 6635}
+  - {"m": 64, "n": 3072, "k": 1536, "seed": 1236}
+  - {"m": 64, "n": 576, "k": 7168, "seed": 542}
+  - {"m": 96, "n": 7168, "k": 256, "seed": 1234}
+  - {"m": 96, "n": 7168, "k": 2048, "seed": 4153}
+  - {"m": 96, "n": 4608, "k": 7168, "seed": 412}
+  - {"m": 128, "n": 7168, "k": 2304, "seed": 624}
+  - {"m": 128, "n": 512, "k": 7168, "seed": 2514}
+  - {"m": 512, "n": 4096, "k": 512, "seed": 543}
+  - {"m": 512, "n": 1536, "k": 7168, "seed": 12341}
 
 benchmarks:
-  - {"m": 1024, "n": 1536, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 3072, "k": 1536, "seed": 42}
-  - {"m": 1024, "n": 576, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 7168, "k": 256, "seed": 42}
-  - {"m": 1024, "n": 7168, "k": 2048, "seed": 42}
-  - {"m": 1024, "n": 4608, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 7168, "k": 2304, "seed": 42}
-  - {"m": 1024, "n": 512, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 4096, "k": 512, "seed": 42}
-  - {"m": 6144, "n": 1536, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 3072, "k": 1536, "seed": 42}
-  - {"m": 6144, "n": 576, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 7168, "k": 256, "seed": 42}
-  - {"m": 6144, "n": 7168, "k": 2048, "seed": 42}
-  - {"m": 6144, "n": 4608, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 7168, "k": 2304, "seed": 42}
-  - {"m": 6144, "n": 512, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 4096, "k": 512, "seed": 42}
+  - {"m": 1024, "n": 1536, "k": 7168, "seed": 8135}
+  - {"m": 1024, "n": 3072, "k": 1536, "seed": 6251}
+  - {"m": 1024, "n": 576, "k": 7168, "seed": 12346}
+  - {"m": 1024, "n": 7168, "k": 256, "seed": 5364}
+  - {"m": 1024, "n": 7168, "k": 2048, "seed": 6132}
+  - {"m": 1024, "n": 4608, "k": 7168, "seed": 7531}
+  - {"m": 1024, "n": 7168, "k": 2304, "seed": 12345}
+  - {"m": 1024, "n": 512, "k": 7168, "seed": 6563}
+  - {"m": 1024, "n": 4096, "k": 512, "seed": 17512}
+  - {"m": 6144, "n": 1536, "k": 7168, "seed": 6543}
+  - {"m": 6144, "n": 3072, "k": 1536, "seed": 234}
+  - {"m": 6144, "n": 576, "k": 7168, "seed": 9863}
+  - {"m": 6144, "n": 7168, "k": 256, "seed": 764243}
+  - {"m": 6144, "n": 7168, "k": 2048, "seed": 76547}
+  - {"m": 6144, "n": 4608, "k": 7168, "seed": 65436}
+  - {"m": 6144, "n": 7168, "k": 2304, "seed": 452345}
+  - {"m": 6144, "n": 512, "k": 7168, "seed": 12341}
+  - {"m": 6144, "n": 4096, "k": 512, "seed": 45245}
 
 ranking_by: "geom"

From b68a149bcd8701532eeedc774d27062429ce4f99 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Wed, 16 Apr 2025 23:48:48 +0200
Subject: [PATCH 042/132] Update FP8MM (#22)

---
 problems/amd/eval.py                | 35 ++++++++-----
 problems/amd/fp8-mm/reference.py    |  2 +-
 problems/amd/fp8-mm/task.yml        |  4 ++
 problems/amd/fp8-mm/template-hip.py | 76 +++++++++++++++++++++++++++++
 problems/amd/utils.py               | 31 ++++++------
 5 files changed, 120 insertions(+), 28 deletions(-)
 create mode 100644 problems/amd/fp8-mm/template-hip.py

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index 7b83d750..ce414f9b 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -137,6 +137,17 @@ def _clone_data(data):
         return data
 
 
+def wrap_check_implementation(data, submission_output):
+    # Old version returned just a single string, new version
+    # returns (bool, str); this function ensures compatibility with old
+    # problem definitions.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
 def _run_single_test(test: TestCase):
     """
     Runs a single test case. Do not call directly
@@ -146,7 +157,7 @@ def _run_single_test(test: TestCase):
     torch.cuda.synchronize()
     submission_output = custom_kernel(_clone_data(data))
     torch.cuda.synchronize()
-    return check_implementation(data, submission_output)
+    return wrap_check_implementation(data, submission_output)
 
 
 def run_single_test(pool: multiprocessing.Pool, test: TestCase):
@@ -168,13 +179,15 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T
     logger.log("test-count", len(tests))
     for idx, test in enumerate(tests):
         logger.log(f"test.{idx}.spec", test.spec)
-        error = run_single_test(pool, test)
-        if error:
+        good, message = run_single_test(pool, test)
+        if not good:
             logger.log(f"test.{idx}.status", "fail")
-            logger.log(f"test.{idx}.error", error)
+            logger.log(f"test.{idx}.error", message)
             passed = False
         else:
             logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
 
     if passed:
         logger.log("check", "pass")
@@ -196,9 +209,9 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     check_copy = _clone_data(data)
     #  first, one obligatory correctness check
     output = custom_kernel(data)
-    error = check_implementation(check_copy, output)
-    if error:
-        return error
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
 
     # now, do multiple timing runs without further correctness testing
     # there is an upper bound of 100 runs, and a lower bound of 3 runs;
@@ -220,16 +233,16 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         end = time.perf_counter_ns()
 
         if recheck:
-            error = check_implementation(check_copy, output)
-            if error:
-                return error
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message
 
         del output
         durations.append(end-start)
 
         if i > 1:
             stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns:
                 break
 
     return calculate_stats(durations)
diff --git a/problems/amd/fp8-mm/reference.py b/problems/amd/fp8-mm/reference.py
index f4570a77..86c20a5b 100644
--- a/problems/amd/fp8-mm/reference.py
+++ b/problems/amd/fp8-mm/reference.py
@@ -85,4 +85,4 @@ def ref_kernel(data: input_t) -> output_t:
     return c
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-02, atol=1e-04)
+check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=1e-03)
diff --git a/problems/amd/fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
index 43f912a7..d4e435db 100644
--- a/problems/amd/fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -25,6 +25,8 @@ description: |
   * `a_scale` is M x K in column-major order in fp32
   * `b_scale` is N x K in column-major order in fp32
   * `c` is M x N in ROW-major order in bf16
+  
+  Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.
 
   The ranking criteria is the geometric mean of the benchmark results.
 
@@ -46,8 +48,10 @@ config:
 
 templates:
   Python: "template.py"
+  HIP: "template-hip.py"
 
 tests:
+  - {"m": 64, "n": 64, "k": 128, "seed": 6635}
   - {"m": 64, "n": 1536, "k": 7168, "seed": 6635}
   - {"m": 64, "n": 3072, "k": 1536, "seed": 1236}
   - {"m": 64, "n": 576, "k": 7168, "seed": 542}
diff --git a/problems/amd/fp8-mm/template-hip.py b/problems/amd/fp8-mm/template-hip.py
new file mode 100644
index 00000000..9e7cd11d
--- /dev/null
+++ b/problems/amd/fp8-mm/template-hip.py
@@ -0,0 +1,76 @@
+# This script provides a template for using load_inline to run a HIP kernel for the fp8 blockwise matmul task.
+from torch.utils.cpp_extension import load_inline
+from task import input_t, output_t
+CPP_WRAPPER = """
+void fp8_mm(torch::Tensor a, torch::Tensor b, torch::Tensor as, torch::Tensor bs, torch::Tensor c);
+"""
+
+CUDA_SRC = """
+#include <hip/amd_detail/amd_hip_fp8.h>
+#include <hip/amd_detail/amd_hip_bf16.h>
+
+constexpr const int BLOCK = 128;
+
+__global__ void custom_kernel(const __hip_fp8_e4m3_fnuz* a, const __hip_fp8_e4m3_fnuz* b, const float* as, const float* bs, 
+                   __hip_bfloat16* c, int m, int n, int k) {
+                   
+    // Your implementation here
+    int cx = threadIdx.x + blockDim.x * blockIdx.x;
+    int cy = threadIdx.y + blockDim.y * blockIdx.y;
+    if(cx >= m || cy >= n) return;
+    
+    int sn = (n + BLOCK - 1) / BLOCK;
+    
+    float result = 0;
+    // split loop into an outer loop over different blocks, and an inner loop within one block.
+    // we can assume k % BLOCK == 0.
+    for(int i = 0; i < k; i += BLOCK) {
+        // block results accumulates the inner product across a single block.
+        // within each block, scales are constant, so we can lift the scaling 
+        // outside of the inner loop.
+        float block_result = 0;
+        for(int ii = 0; ii < BLOCK; ++ii) {
+            // load input matrix elements and convert to float for computations
+            float av = (float)a[cx + (i + ii) * m];
+            float bv = (float)b[cy + (i + ii) * n];
+            block_result += av * bv; 
+        }
+        
+        // before we can go to the next block, scale the result of the current block
+        // and accumulate to final result
+        // note the different indexing into as and bs
+        result += block_result * as[cx + i/BLOCK * m] * bs[cy/BLOCK + i/BLOCK * sn];
+    }
+    
+    // finally, write the result as bf16
+    c[cx * n + cy] = (__hip_bfloat16)result;
+}
+
+void fp8_mm(torch::Tensor a, torch::Tensor b, torch::Tensor as, torch::Tensor bs, torch::Tensor c) {
+    int m = a.size(0);
+    int n = b.size(0);
+    int k = a.size(1);
+    custom_kernel<<<dim3((m+15)/16, (n+15)/16), dim3(16, 16), 0, 0>>> ((__hip_fp8_e4m3_fnuz*)a.data_ptr(), (__hip_fp8_e4m3_fnuz*)b.data_ptr(), 
+    as.data_ptr<float>(), bs.data_ptr<float>(), (__hip_bfloat16*)c.data_ptr(), m, n, k);
+    //C10_CUDA_CHECK(cudaGetLastError());
+}
+"""
+
+import os
+os.environ["CXX"] = "clang++"
+
+module = load_inline(
+    name='fp8_mm',
+    cpp_sources=[CPP_WRAPPER],
+    cuda_sources=[CUDA_SRC],
+    functions=['fp8_mm'],
+    verbose=True,
+    extra_cuda_cflags=["--offload-arch=gfx942", "-std=c++20"],
+)
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b, a_scale, b_scale, c = data
+    module.fp8_mm(a, b, a_scale, b_scale, c)
+    return c
+
diff --git a/problems/amd/utils.py b/problems/amd/utils.py
index c3eb2447..73551022 100644
--- a/problems/amd/utils.py
+++ b/problems/amd/utils.py
@@ -1,4 +1,6 @@
 import random
+from typing import Tuple
+
 import numpy as np
 import torch
 
@@ -32,7 +34,7 @@ def verbose_allclose(
         rtol=1e-05,
         atol=1e-08,
         max_print=5
-) -> list[str]:
+) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
 
@@ -42,16 +44,13 @@ def verbose_allclose(
     rtol (float): Relative tolerance; relative to expected
     atol (float): Absolute tolerance.
     max_print (int): Maximum number of mismatched elements to print.
-
-    Raises:
-    AssertionError: If the tensors are not all close within the given tolerance.
     """
     # Check if the shapes of the tensors match
     if received.shape != expected.shape:
-        return ["SIZE MISMATCH"]
+        return False, ["SIZE MISMATCH"]
 
     # Calculate the difference between the tensors
-    diff = torch.abs(received - expected)
+    diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
 
     # Determine the tolerance
     tolerance = atol + rtol * torch.abs(expected)
@@ -84,16 +83,16 @@ def verbose_allclose(
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
-        return mismatch_details
+        return False, mismatch_details
 
-    return []
+    return True, [f"Maximum error: {torch.max(diff)}"]
 
 
 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
 
@@ -117,12 +116,12 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
-        return mismatch_details
+        return False, mismatch_details
 
-    return []
+    return True, []
 
 
 def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
@@ -130,12 +129,12 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
     expected = reference(data)
-    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+    good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
 
     if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return good, "\\n".join(reasons)
 
-    return ''
+    return good, ''
 
 
 def make_match_reference(reference: callable, **kwargs):

From 8ebd201acb57174b270d74fe32f03d967c924d2f Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Tue, 29 Apr 2025 14:45:03 -0400
Subject: [PATCH 043/132] Create .gitignore

---
 .gitignore | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..393b0244
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,175 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.DS_Store
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc

From ccdc5019d08424b7e768571740181daacb19d0a1 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 29 Apr 2025 14:48:29 -0400
Subject: [PATCH 044/132] Add MoE problem for AMD competition

---
 problems/amd.yaml              |   5 +
 problems/amd/moe/reference.py  | 243 +++++++++++++++++++++++++++++++++
 problems/amd/moe/submission.py | 141 +++++++++++++++++++
 problems/amd/moe/task.py       |  16 +++
 problems/amd/moe/task.yml      |  57 ++++++++
 5 files changed, 462 insertions(+)
 create mode 100644 problems/amd/moe/reference.py
 create mode 100644 problems/amd/moe/submission.py
 create mode 100644 problems/amd/moe/task.py
 create mode 100644 problems/amd/moe/task.yml

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 20a1d449..bcf91f67 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -15,3 +15,8 @@ problems:
     deadline: "2025-05-27"
     gpus:
       - MI300
+  - directory: amd/moe
+    name: amd-mixture-of-experts
+    deadline: "2025-05-27"
+    gpus:
+      - MI300
\ No newline at end of file
diff --git a/problems/amd/moe/reference.py b/problems/amd/moe/reference.py
new file mode 100644
index 00000000..b6ee16d7
--- /dev/null
+++ b/problems/amd/moe/reference.py
@@ -0,0 +1,243 @@
+from utils import make_match_reference
+from task import input_t, output_t
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Tuple, List, Optional
+import math
+
+# Reference code in PyTorch
+class Expert(nn.Module):
+    def __init__(self, config: Dict, d_expert: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.act_fn = nn.SiLU()
+        self.d_hidden: int = config["d_hidden"]
+        self.d_expert: int = config["d_expert"] if d_expert is None else d_expert
+
+        self.W_gate = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_up = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_down = nn.Linear(self.d_expert, self.d_hidden, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = self.act_fn(self.W_gate(x))
+        out = self.W_down(gate * self.W_up(x))
+        return out
+
+
+class MoEGate(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.top_k: int = config["n_experts_per_token"]
+        self.num_experts: int = config["n_routed_experts"]
+        self.d_hidden: int = config["d_hidden"]
+
+        self.W_g = nn.Linear(self.d_hidden, self.num_experts, bias=False)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logits = self.W_g(x)
+        scores = logits.softmax(dim=-1)
+        topk_scores, topk_indices = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        return topk_indices, topk_scores
+
+
+class MoE(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList([
+            Expert(config)
+            for _ in range(config["n_routed_experts"])
+        ])
+        self.gating_network = MoEGate(config)
+        shared_expert_dim = config["d_expert"] * config["n_shared_experts"]
+        self.shared_expert = Expert(config=config, d_expert=shared_expert_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shared_output = self.shared_expert(x)
+        expert_indices, expert_scores = self.gating_network(x)
+        batch_size, seq_len, hidden_dim = x.shape
+        orig_shape = x.shape
+        x_flat = x.view(-1, hidden_dim)
+        flat_expert_indices = expert_indices.view(-1)
+        flat_expert_weights = expert_scores.view(-1, 1)
+        routed_output_flat = self.moe_infer(x_flat,
+                                            flat_expert_indices,
+                                            flat_expert_weights)
+
+        routed_output = routed_output_flat.view(*orig_shape)
+        return routed_output + shared_output
+
+    @torch.no_grad()
+    def moe_infer(self,
+                  x: torch.Tensor,
+                  flat_expert_indices: torch.Tensor,
+                  flat_expert_weights: torch.Tensor
+                 ) -> torch.Tensor:
+        expert_cache = torch.zeros_like(x)
+        idxs = flat_expert_indices.argsort()
+        counts = flat_expert_indices.bincount().cpu().numpy()
+        tokens_per_expert = counts.cumsum()
+        num_per_tok = self.config["n_experts_per_token"]
+        token_idxs = idxs // num_per_tok
+        for expert_id, end_idx in enumerate(tokens_per_expert):
+            start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1]
+            if start_idx == end_idx:
+                continue
+
+            expert = self.experts[expert_id]
+            exp_token_idxs = token_idxs[start_idx:end_idx]
+            expert_tokens = x[exp_token_idxs]
+            expert_out    = expert(expert_tokens)
+            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
+            expert_cache.scatter_reduce_(
+                0,
+                exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]),
+                expert_out,
+                reduce='sum'
+            )
+
+        return expert_cache
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of DeepSeek-style Mixture of Experts using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, hidden_dim]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+            
+    Returns:
+        output: Processed tensor of shape [batch_size, seq_len, hidden_dim],
+            i.e. the sum of the routed-expert and shared-expert outputs.
+            Only this tensor is returned; no auxiliary data is produced.
+    """
+    input_tensor, weights, config = data
+    num_experts = config["n_routed_experts"]
+    moe = MoE(config)
+
+    # Fill in the given weights of the model
+    moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight'])
+
+    for i in range(num_experts):
+        gate_proj_weight = weights[f'experts.{i}.0.weight']
+        up_proj_weight = weights[f'experts.{i}.1.weight']
+        down_proj_weight = weights[f'experts.{i}.2.weight']
+
+        # Transpose weights to match expected shape for nn.Linear
+        moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t())
+        moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t())
+        moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t())
+
+    moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t())
+    moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t())
+    moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t())
+
+    output = moe(input_tensor)
+
+    return output
+
+
+# Input generation for the reference code
+
+def generate_input(
+    dhidden: int,
+    dexpert: int,
+    nroutedexperts: int,
+    nsharedexperts: int,
+    nexpertspertoken: int,
+    bs: int,
+    seqlen: int,
+    seed: int
+) -> input_t:
+
+    # Workaround: names containing "_" are not parsed correctly for now, so the underscore-free arguments are mapped back here.
+    d_hidden = dhidden
+    d_expert = dexpert
+    n_routed_experts = nroutedexperts
+    n_shared_experts = nsharedexperts
+    n_experts_per_token = nexpertspertoken
+    batch_size = bs
+    seq_len = seqlen
+
+    config = {
+        "d_hidden": d_hidden,
+        "d_expert": d_expert,
+        "n_routed_experts": n_routed_experts,
+        "n_shared_experts": n_shared_experts,
+        "n_experts_per_token": n_experts_per_token,
+        "batch_size": batch_size,
+        "seq_len": seq_len,
+    }
+
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+
+    num_experts = n_routed_experts
+    expert_dim = d_expert
+    weights = {}
+
+    input_tensor = torch.randn(
+        (batch_size, seq_len, d_hidden),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ).contiguous()
+
+    # Initialize router weights
+    weights['router.weight'] = torch.randn(
+        (num_experts, d_hidden),
+        device="cuda",
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(d_hidden)
+
+    for i in range(num_experts):
+        weights[f'experts.{i}.0.weight'] = torch.randn(
+            (d_hidden, expert_dim),
+            device='cuda',
+            dtype=torch.float16,
+            generator=gen
+        ) / math.sqrt(expert_dim)
+
+        weights[f'experts.{i}.1.weight'] = torch.randn(
+            (d_hidden, expert_dim),
+            device='cuda',
+            dtype=torch.float16,
+            generator=gen
+        ) / math.sqrt(expert_dim)
+
+        weights[f'experts.{i}.2.weight'] = torch.randn(
+            (expert_dim, d_hidden),
+            device='cuda',
+            dtype=torch.float16,
+            generator=gen
+        ) / math.sqrt(d_hidden)
+    
+    weights['shared_experts.0.weight'] = torch.randn(
+        (d_hidden, expert_dim * n_shared_experts),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(expert_dim * n_shared_experts)
+    weights['shared_experts.1.weight'] = torch.randn(
+        (d_hidden, expert_dim * n_shared_experts),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(expert_dim * n_shared_experts)
+    weights['shared_experts.2.weight'] = torch.randn(
+        (expert_dim * n_shared_experts, d_hidden),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(d_hidden)
+
+    return (input_tensor, weights, config)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
\ No newline at end of file
diff --git a/problems/amd/moe/submission.py b/problems/amd/moe/submission.py
new file mode 100644
index 00000000..7d085ff0
--- /dev/null
+++ b/problems/amd/moe/submission.py
@@ -0,0 +1,141 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Tuple, Optional
+from task import input_t, output_t
+
+class Expert(nn.Module):
+    def __init__(self, config: Dict, d_expert: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.act_fn = nn.SiLU()
+        self.d_hidden: int = config["d_hidden"]
+        self.d_expert: int = config["d_expert"] if d_expert is None else d_expert
+
+        self.W_gate = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_up = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_down = nn.Linear(self.d_expert, self.d_hidden, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = self.act_fn(self.W_gate(x))
+        out = self.W_down(gate * self.W_up(x))
+        return out
+
+
+class MoEGate(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.top_k: int = config["n_experts_per_token"]
+        self.num_experts: int = config["n_routed_experts"]
+        self.d_hidden: int = config["d_hidden"]
+
+        self.W_g = nn.Linear(self.d_hidden, self.num_experts, bias=False)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logits = self.W_g(x)
+        scores = logits.softmax(dim=-1)
+        topk_scores, topk_indices = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        return topk_indices, topk_scores
+
+
+class MoE(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList([
+            Expert(config)
+            for _ in range(config["n_routed_experts"])
+        ])
+        self.gating_network = MoEGate(config)
+        shared_expert_dim = config["d_expert"] * config["n_shared_experts"]
+        self.shared_expert = Expert(config=config, d_expert=shared_expert_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shared_output = self.shared_expert(x)
+        expert_indices, expert_scores = self.gating_network(x)
+        batch_size, seq_len, hidden_dim = x.shape
+        orig_shape = x.shape
+        x_flat = x.view(-1, hidden_dim)
+        flat_expert_indices = expert_indices.view(-1)
+        flat_expert_weights = expert_scores.view(-1, 1)
+        routed_output_flat = self.moe_infer(x_flat,
+                                            flat_expert_indices,
+                                            flat_expert_weights)
+
+        routed_output = routed_output_flat.view(*orig_shape)
+        return routed_output + shared_output
+
+    @torch.no_grad()
+    def moe_infer(self,
+                  x: torch.Tensor,
+                  flat_expert_indices: torch.Tensor,
+                  flat_expert_weights: torch.Tensor
+                 ) -> torch.Tensor:
+        expert_cache = torch.zeros_like(x)
+        idxs = flat_expert_indices.argsort()
+        counts = flat_expert_indices.bincount().cpu().numpy()
+        tokens_per_expert = counts.cumsum()
+        num_per_tok = self.config["n_experts_per_token"]
+        token_idxs = idxs // num_per_tok
+        for expert_id, end_idx in enumerate(tokens_per_expert):
+            start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1]
+            if start_idx == end_idx:
+                continue
+
+            expert = self.experts[expert_id]
+            exp_token_idxs = token_idxs[start_idx:end_idx]
+            expert_tokens = x[exp_token_idxs]
+            expert_out    = expert(expert_tokens)
+            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
+            expert_cache.scatter_reduce_(
+                0,
+                exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]),
+                expert_out,
+                reduce='sum'
+            )
+
+        return expert_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Submission template for DeepSeek-style Mixture of Experts using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, hidden_size]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+            
+    Returns:
+        output: Processed tensor of shape [batch_size, seq_len, hidden_size]
+            (sum of the routed-expert and shared-expert outputs). This
+            template returns only the tensor, without an aux_data dictionary.
+    """
+    input_tensor, weights, config = data
+    num_experts = config["n_routed_experts"]
+    moe = MoE(config)
+
+    # Fill in the given weights of the model
+    moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight'])
+
+    for i in range(num_experts):
+        gate_proj_weight = weights[f'experts.{i}.0.weight']
+        up_proj_weight = weights[f'experts.{i}.1.weight']
+        down_proj_weight = weights[f'experts.{i}.2.weight']
+
+        # Transpose weights to match expected shape for nn.Linear
+        moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t())
+        moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t())
+        moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t())
+
+    moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t())
+    moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t())
+    moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t())
+
+    # Run the model
+    output = moe(input_tensor)
+
+    return output
\ No newline at end of file
diff --git a/problems/amd/moe/task.py b/problems/amd/moe/task.py
new file mode 100644
index 00000000..e963086e
--- /dev/null
+++ b/problems/amd/moe/task.py
@@ -0,0 +1,16 @@
+from typing import TypedDict, TypeVar, Tuple, Dict
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, Dict[str, torch.Tensor], Dict])
+output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+
+
+class TestSpec(TypedDict):
+    d_hidden: int
+    d_expert: int
+    n_routed_experts: int
+    n_shared_experts: int
+    n_experts_per_token: int
+    batch_size: int
+    seq_len: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
new file mode 100644
index 00000000..a00c1205
--- /dev/null
+++ b/problems/amd/moe/task.yml
@@ -0,0 +1,57 @@
+# name: 3_moe
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  For a more complete description, see: https://tinyurl.com/amd-comp-moe
+  Implement a DeepSeek-style Mixture of Experts (MoE) layer for efficient transformer models
+  on a single MI300X device.
+  
+  MoE is a technique that allows scaling model capacity without proportionally increasing computational costs
+  by using a routing mechanism to selectively activate only a subset of parameters for each token.
+  
+  Your task:
+  - Implement token routing using a simple softmax-based learned router
+  - Route tokens to the top-k experts based on router probabilities
+  - Process tokens through their assigned experts
+  - Combine expert outputs weighted by router probabilities
+  - Calculate appropriate auxiliary losses for training stability
+  
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+    - input: Input tensor of shape [bs, seq_len, d_hidden]
+    - weights: Dictionary containing model weights
+    - config: Dictionary containing model configuration parameters
+  
+  Output:
+  - Tuple containing:
+    - output: Processed tensor [bs, seq_len, d_hidden]
+    - aux_data: Dictionary with auxiliary data like router probabilities and losses
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+test_timeout: 540
+benchmark_timeout: 540
+ranked_timeout: 540
+ranking_by: "geom"
+
+tests:
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 4, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 512, "seed": 9371}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 8, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 512, "seed": 2291}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 8, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 81934}
+
+benchmarks:
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 2048, "seed": 9371}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 4096, "seed": 12819}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}
\ No newline at end of file

From 1541c34ec50ca2a846bbdcd6fe09c1db72575aab Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 29 Apr 2025 20:55:41 +0200
Subject: [PATCH 045/132] Fix: remove amd moe template

---
 problems/amd/moe/task.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index a00c1205..e8f8e5eb 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -38,8 +38,6 @@ description: |
 config:
   main: "eval.py"
 
-templates:
-  Python: "../template.py"
 
 test_timeout: 540
 benchmark_timeout: 540
@@ -54,4 +52,4 @@ tests:
 benchmarks:
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 2048, "seed": 9371}
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 4096, "seed": 12819}
-  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}
\ No newline at end of file
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}

From 3b3fa2e1f59daa0940e60b6ecd68750dcd1500cf Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 29 Apr 2025 20:57:25 +0200
Subject: [PATCH 046/132] Feat: add moe template

---
 problems/amd/moe/task.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index e8f8e5eb..e6a9b1b1 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -38,6 +38,8 @@ description: |
 config:
   main: "eval.py"
 
+templates:
+  Python: "submission.py"
 
 test_timeout: 540
 benchmark_timeout: 540

From 60113bd7c305a344f2b9786d55fba05bb946ed71 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 29 Apr 2025 21:39:50 +0200
Subject: [PATCH 047/132] Fix: remove duplicate benchmarks

---
 problems/amd/moe/task.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index e6a9b1b1..d9bf71d4 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -53,5 +53,4 @@ tests:
 
 benchmarks:
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 2048, "seed": 9371}
-  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 4096, "seed": 12819}
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}

From 264fc31ed624284f00c224acaef60615bfbc84fc Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 29 Apr 2025 23:08:30 +0200
Subject: [PATCH 048/132] Fix: move to using specified timeouts

---
 problems/pmpp/conv2d_py/task.yml    | 4 ++++
 problems/pmpp/grayscale_py/task.yml | 4 ++++
 problems/pmpp/histogram_py/task.yml | 4 ++++
 problems/pmpp/matmul_py/task.yml    | 4 ++++
 problems/pmpp/prefixsum_py/task.yml | 4 ++++
 problems/pmpp/sort_py/task.yml      | 4 ++++
 problems/pmpp/vectoradd_py/task.yml | 4 ++++
 problems/pmpp/vectorsum_py/task.yml | 4 ++++
 8 files changed, 32 insertions(+)

diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index 9e88a41b..55adc532 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -41,3 +41,7 @@ benchmarks:
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
   - {"size": 256, "kernelsize": 32, "channels": 128, "batch": 1, "seed": 6252}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 90268ab1..cada0257 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 4096, "seed": 8841}
   - {"size": 8192, "seed": 6252}
   - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index 01baa5f8..489a98b6 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -37,3 +37,7 @@ benchmarks:
   - {"size": 2621440, "seed": 8753, "contention": 90}
   - {"size": 5242880, "seed": 6252, "contention": 10}
   - {"size": 10485760, "seed": 8841, "contention": 10}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index c97333c7..6924764b 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -38,3 +38,7 @@ benchmarks:
   - {"m": 1024, "n": 1536, "k": 1024, "seed": 321}
   - {"m": 2048, "n": 3072, "k": 2048, "seed": 32412}
   - {"m": 4096, "n": 5120, "k": 4096, "seed": 123456}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index 163b4f72..a91d1496 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -51,3 +51,7 @@ benchmarks:
   - {"size": 268435456, "seed": 91357}
   # - {"size": 536870912, "seed": 102468}
   # - {"size": 1073741824, "seed": 113579}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 5ba9c318..5c702e29 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 1000000, "seed": 6256}
   - {"size": 10000000, "seed": 8841}
   - {"size": 100000000, "seed": 6252} 
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 546fb36e..6906a313 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 4096, "seed": 2146}
   - {"size": 8192, "seed": 3129}
   - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index d0fa91c6..8b3ddbb7 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 13107200, "seed": 6252}
   - {"size": 26214400, "seed": 82135}
   - {"size": 52428800, "seed": 12345}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180

From 53b7d86be8c9e37ae9088dce6d26ee41b32d16a7 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 29 Apr 2025 23:15:49 +0200
Subject: [PATCH 049/132] Feat: move deadline to june for pmpp

---
 problems/beta.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/beta.yaml b/problems/beta.yaml
index fed11c76..a357c783 100644
--- a/problems/beta.yaml
+++ b/problems/beta.yaml
@@ -7,7 +7,7 @@ description: ""
 problems:
   - directory: pmpp/conv2d_py
     name: conv2d
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -15,7 +15,7 @@ problems:
       - L4
   - directory: pmpp/grayscale_py
     name: grayscale
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -23,7 +23,7 @@ problems:
       - L4
   - directory: pmpp/histogram_py
     name: histogram
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -31,7 +31,7 @@ problems:
       - L4
   - directory: pmpp/matmul_py
     name: matmul
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -39,7 +39,7 @@ problems:
       - L4
   - directory: pmpp/prefixsum_py
     name: prefixsum
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -47,7 +47,7 @@ problems:
       - L4
   - directory: pmpp/sort_py
     name: sort
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -55,7 +55,7 @@ problems:
       - L4
   - directory: pmpp/vectoradd_py
     name: vectoradd
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -63,7 +63,7 @@ problems:
       - L4
   - directory: pmpp/vectorsum_py
     name: vectorsum
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100

From 16622e026692a4cea619ee3c1ba346fccc91dcd0 Mon Sep 17 00:00:00 2001
From: Matthias Reso <13337103+mreso@users.noreply.github.com>
Date: Wed, 30 Apr 2025 10:05:56 -0700
Subject: [PATCH 050/132] Correct shape of a_scale + b_scale

---
 problems/amd/fp8-mm/task.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
index d4e435db..23de9909 100644
--- a/problems/amd/fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -22,8 +22,8 @@ description: |
   and `c` is the output matrix:
   * `a` is M x K in column-major order in e4m3fnuz
   * `b` is N x K in column-major order in e4m3fnuz
-  * `a_scale` is M x K in column-major order in fp32
-  * `b_scale` is N x K in column-major order in fp32
+  * `a_scale` is M x K // 128 in column-major order in fp32
+  * `b_scale` is N // 128 x K // 128 in column-major order in fp32
   * `c` is M x N in ROW-major order in bf16
   
   Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.

From a390d26f1aed7b9f0655fe72f139705deb2cc6a1 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Thu, 1 May 2025 17:07:22 +0200
Subject: [PATCH 051/132] Feat: enlarge task timeout for moe

---
 problems/amd/moe/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index d9bf71d4..6b661767 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -43,7 +43,7 @@ templates:
 
 test_timeout: 540
 benchmark_timeout: 540
-ranked_timeout: 540
+ranked_timeout: 840
 ranking_by: "geom"
 
 tests:

From 997cd382b3d59c557504a1cc0e77317df31aa988 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 9 May 2025 18:22:49 -0700
Subject: [PATCH 052/132] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5f9e4059..afd98c45 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,11 @@
 
 This repo holds reference kernels for the KernelBot which hosts regular competitions on [discord.gg/gpumode](discord.gg/gpumode).
 
+You can see what's going on [gpumode.com](https://www.gpumode.com/)
+
 ## Competition
 1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
-2. LLM competition: Coming soon!
+2. [AMD $100K kernel competition](problems/amd)
 
 ## Making a Leaderboard Submission
 

From 6624b5d18abcf4fbc3f0708f195d2fd361013e03 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sat, 10 May 2025 22:47:44 +0200
Subject: [PATCH 053/132] update eval file to allow pt profiling

---
 problems/amd/eval.py | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index ce414f9b..7b13d38a 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -1,3 +1,4 @@
+import base64
 import dataclasses
 import multiprocessing
 import re
@@ -248,7 +249,8 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     return calculate_stats(durations)
 
 
-def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float):
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
     """
     For a particular test case, check correctness (if applicable) and grab runtime results.
 
@@ -295,6 +297,31 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
         return 112
 
 
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles a single test case and returns the profiler's summary table.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
 def main():
     fd = os.getenv("POPCORN_FD")
     if not fd:
@@ -333,12 +360,14 @@ def main():
                     else:
                         passed = False
                         logger.log(f"benchmark.{i}.status", "fail")
-                        logger.log(f"benchmark.{i}.error", str(result)) #TODO: Make sure result implements __str__?
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
                         break
 
                 logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
             else:
-                # TODO: Implement script and profile mode
+                # TODO: Implement script mode
                 return 2
 
 

From 7d8a57661a684f6a11270e4855179df5d0f1dff1 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 12 May 2025 10:20:49 +0200
Subject: [PATCH 054/132] update eval file to put an upper bound on total
 benchmark duration

---
 problems/amd/eval.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index 7b13d38a..5f553e90 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -219,6 +219,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     # otherwise, we repeat until we either measure at least 10 full seconds,
     # or the relative error of the mean is below 1%.
 
+    bm_start_time = time.perf_counter_ns()
     for i in range(max_repeats):
         if recheck:
             # ensure we use a different seed for every benchmark
@@ -239,11 +240,16 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
                 return message
 
         del output
-        durations.append(end-start)
+        durations.append(end - start)
 
         if i > 1:
+            total_bm_duration = bm_start_time - time.perf_counter_ns()
             stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns:
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
                 break
 
     return calculate_stats(durations)

From 5969f75339a8e12c8a471c589b61e83accbfe9f8 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 18 May 2025 01:07:09 +0200
Subject: [PATCH 055/132] Fix: sign issue in eval.py

---
 problems/amd/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index 5f553e90..ac3a6325 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -243,7 +243,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         durations.append(end - start)
 
         if i > 1:
-            total_bm_duration = bm_start_time - time.perf_counter_ns()
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
             stats = calculate_stats(durations)
             # stop if either
             # a) relative error dips below 0.1%

From 6a9270363d3da2e589d9188e5215c016acbd9fe9 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 18 May 2025 07:47:22 -0700
Subject: [PATCH 056/132] Update template-hip.py

---
 problems/amd/fp8-mm/template-hip.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/problems/amd/fp8-mm/template-hip.py b/problems/amd/fp8-mm/template-hip.py
index 9e7cd11d..62ccc1fe 100644
--- a/problems/amd/fp8-mm/template-hip.py
+++ b/problems/amd/fp8-mm/template-hip.py
@@ -1,4 +1,7 @@
 # This script provides a template for using load_inline to run a HIP kernel for
+import os 
+os.environ['PYTORCH_ROCM_ARCH'] = 'gfx942'
+
 from torch.utils.cpp_extension import load_inline
 from task import input_t, output_t
 CPP_WRAPPER = """

From aa14a425b81e5c8b1c5e8d7d6097ab6adfa9eb2f Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 19 May 2025 12:17:24 -0400
Subject: [PATCH 057/132] add MLA problem

---
 problems/amd/mla-decode/README.md     |  37 +++
 problems/amd/mla-decode/eval.py       | 320 ++++++++++++++++++++++++++
 problems/amd/mla-decode/reference.py  | 256 +++++++++++++++++++++
 problems/amd/mla-decode/submission.py | 165 +++++++++++++
 problems/amd/mla-decode/task.py       |  12 +
 problems/amd/mla-decode/task.yml      |  74 ++++++
 problems/amd/mla-decode/utils.py      | 149 ++++++++++++
 7 files changed, 1013 insertions(+)
 create mode 100644 problems/amd/mla-decode/README.md
 create mode 100644 problems/amd/mla-decode/eval.py
 create mode 100644 problems/amd/mla-decode/reference.py
 create mode 100644 problems/amd/mla-decode/submission.py
 create mode 100644 problems/amd/mla-decode/task.py
 create mode 100644 problems/amd/mla-decode/task.yml
 create mode 100644 problems/amd/mla-decode/utils.py

diff --git a/problems/amd/mla-decode/README.md b/problems/amd/mla-decode/README.md
new file mode 100644
index 00000000..fdb96e4c
--- /dev/null
+++ b/problems/amd/mla-decode/README.md
@@ -0,0 +1,37 @@
+# Description
+
+You will implement a custom MLA decode kernel optimized for MI300. A few things are simplified here:
+
+1. Q, K, V data type as bfloat16
+  
+2. decode only with pre-allocated non-paged latent kv cache
+
+3. no need to update kv cache
+
+All outer and inner tensor dimensions are taken from DeepSeek-R1, with the number of heads split so that the problem fits on one GPU.
+To be explicit, you will be given a tuple of tensors:
+
+```yml
+  input [bs, sq, dim]
+  attn_output [bs, n_heads, sq, v_head_dim]
+``` 
+
+  where 
+
+  0. bs::128 # batch size
+  1. sk::[1024, 6144] # as kv length
+  2. sq::1 # as only consider decoding
+  3. dim::7168 # hidden size of deepseek v3
+  4. v_head_dim::128 # head size
+  5. n_heads::128 # num of q heads
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+ 
+  The speed of light analysis is:
+  | bs | sk | sq | dtype |  roofline time(us) |
+  |---|---|---|---|---|
+  | 128 | 1024 | 1 | bf16 | 106.65 |
+  | 128 | 6144 | 1 | bf16 | 280.87 | 
diff --git a/problems/amd/mla-decode/eval.py b/problems/amd/mla-decode/eval.py
new file mode 100644
index 00000000..34d3ae84
--- /dev/null
+++ b/problems/amd/mla-decode/eval.py
@@ -0,0 +1,320 @@
+import dataclasses
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any
+from collections import OrderedDict
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from submission import custom_kernel
+from reference import check_implementation, generate_input
+
+WARMUP_RUNS = 10
+TIMED_RUNS = 100
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def copy_kv_cache(module, kv_cache_shape):
+    """
+    Creates an independent copy of a KVCache-like module.
+
+    @param module: Module to copy; its class is instantiated as `type(module)(kv_cache_shape)`.
+    @param kv_cache_shape: Shape passed to the module's constructor.
+    @return: A new module with cloned parameters and buffers and the same `seq_len`.
+    """
+    copied_module = type(module)(kv_cache_shape)
+
+    # Clone parameters (keeping their requires_grad flag) and buffers so the copy
+    # shares no storage with the original.
+    params = OrderedDict()
+    for name, param in module.named_parameters():
+        params[name] = param.clone().requires_grad_(param.requires_grad)
+    buffers = OrderedDict((name, buff.clone()) for name, buff in module.named_buffers())
+
+    # strict=False: each call supplies only part of the state dict
+    copied_module.load_state_dict(params, strict=False)
+    copied_module.load_state_dict(buffers, strict=False)
+    copied_module.seq_len = module.seq_len  # copied by hand; load_state_dict does not set it
+
+    return copied_module
+
+
+def get_test_cases(file_name: str) -> list[TestCase]:
+    """Parse `key: value; key: value` lines of `file_name` into TestCases; exits with 113 on failure."""
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)
+
+    tests = []
+    # key is alphabetic; value is either alphabetic or an optionally signed integer
+    pattern = re.compile(r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*")
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            # fullmatch once: validates the whole part and yields the capture groups
+            matched = pattern.fullmatch(part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass  # non-numeric values stay as strings
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    return tests
+
+
+def warm_up(test: TestCase):
+    config, data, kv_cache = generate_input(**test.args)
+    config_copy = copy_config_weights(config)
+    start = time.perf_counter()
+    while time.perf_counter() - start < 0.2:
+        custom_kernel((config_copy, data, kv_cache))
+        torch.cuda.synchronize()
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a non-empty list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = sum(durations) / runs
+    variance = sum((x - avg)**2 for x in durations)
+    # sample std (Bessel's correction) is undefined for a single run; report 0 instead of dividing by zero
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def copy_config_weights(config):
+    """
+    Creates a copy of the Config object with cloned weight tensors.
+    """
+    return dataclasses.replace(
+        config,
+        Q_proj_down_weight=config.Q_proj_down_weight.clone(),
+        Q_proj_up_weight=config.Q_proj_up_weight.clone(),
+        KV_proj_down_weight=config.KV_proj_down_weight.clone(),
+        KV_proj_up_weight=config.KV_proj_up_weight.clone()
+    )
+
+
+def run_testing(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+
+        config, data, kv_cache = generate_input(**test.args)
+        kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+
+        torch.cuda.synchronize()
+        submission_output = custom_kernel((config, data, kv_cache))
+        torch.cuda.synchronize()
+        error = check_implementation((config, data, kv_cache_copy), submission_output)
+        if error:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", error)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param test: TestCase object.
+    @param recheck: If set, inputs are regenerated and correctness is re-checked on every timed run.
+    @param max_repeats: Maximum number of timed runs.
+    @param max_time_ns: Budget for the summed measured time, in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    durations = []
+    # generate input data once
+    config, data, kv_cache = generate_input(**test.args)
+    # first, one obligatory correctness check; also triggers triton compile for the given shape
+    kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+    config_copy = copy_config_weights(config)
+    output = custom_kernel((config, data, kv_cache))
+    error = check_implementation((config_copy, data, kv_cache_copy), output)
+    if error:
+        return error
+
+    # now, do multiple timing runs; at most max_repeats runs and at least 3 are taken,
+    # stopping early once the relative error of the mean is below 1% or the summed time exceeds max_time_ns.
+    # NOTE(review): without recheck the same kv_cache object is passed on every run; if the kernel appends to it
+    # (as the KVCache in this repo's reference does), the kv length grows between timed runs -- confirm intended.
+
+    for i in range(max_repeats):
+        if recheck:
+            config, data, kv_cache = generate_input(**test.args)
+            kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+            config_copy = copy_config_weights(config)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel((config, data, kv_cache))
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            error = check_implementation((config_copy, data, kv_cache_copy), output)
+            if error:
+                return error
+
+        del output
+        durations.append(end-start)
+
+        if i > 1:
+            stats = calculate_stats(durations)
+            if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    warm_up(tests[0])
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = benchmark(test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def main():
+    """
+    Run `eval.py <mode> <test-file>`; returns 0 on success, 112 on a failed check, 111 without POPCORN_FD, 2 on bad usage/mode.
+    """
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    tests = get_test_cases(sys.argv[2])
+
+    with PopcornOutput(int(fd)) as logger:
+        seed = os.getenv("POPCORN_SEED")
+        seed = int(seed) if seed else 42
+        set_seed(seed)
+
+        if mode == "test":
+            return run_testing(logger, tests)
+        if mode == "benchmark":
+            return run_benchmarking(logger, tests)
+        if mode != "leaderboard":
+            # TODO: Implement script and profile mode
+            return 2
+
+        warm_up(tests[0])
+        result = benchmark(tests[-1], True, 100, 30e9)
+        if isinstance(result, Stats):
+            logger.log("benchmark-count", 1)
+            logger.log("benchmark.0.spec", tests[-1].spec)
+            for name in ("runs", "mean", "std", "err"):
+                logger.log(f"benchmark.0.{name}", getattr(result, name))
+            logger.log("check", "pass")
+            return 0
+        logger.log("test-count", 1)
+        logger.log("test.0.status", "fail")
+        logger.log("test.0.error", str(result)) #TODO: Make sure result implements __str__?
+        logger.log("check", "fail")
+        return 112
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
new file mode 100644
index 00000000..10b1884b
--- /dev/null
+++ b/problems/amd/mla-decode/reference.py
@@ -0,0 +1,256 @@
+import math
+from dataclasses import dataclass
+import torch
+from torch import nn
+import torch.nn.functional as F
+from task import input_t, output_t
+from utils import make_match_reference
+
+class RoPE(nn.Module):
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.d_model = d_model
+        theta = 10000 ** (-torch.arange(0, d_model//2,dtype=torch.bfloat16) / (d_model//2))
+        self.register_buffer("theta", theta)
+
+    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+
+    def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+        seq_len = x.size(-2)
+        d_model = x.size(-1)
+        assert d_model == self.d_model
+        seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+        idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+        cos = idx_theta2.cos().to(torch.bfloat16)
+        sin = idx_theta2.sin().to(torch.bfloat16)
+        return x * cos + self.rotate_half(x) * sin
+
+class KVCache(nn.Module):
+    def __init__(self, kv_cache_shape: tuple) -> None:
+        super().__init__()
+        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+        self.seq_len = 0
+        self.zero()
+
+    def zero(self) -> None:
+        self.data.zero_()
+    
+    def get_data(self) -> torch.Tensor:
+        return self.data
+
+    def forward(self, c_kv: torch.Tensor) -> tuple[torch.Tensor, int]:
+        assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+        # write c_kv after the already-filled prefix along dim 1, then advance seq_len
+        self.data = self.data.to(c_kv.dtype)
+        self.data[
+            :, self.seq_len : self.seq_len + c_kv.size(1), :
+        ] = c_kv
+        self.seq_len += c_kv.size(1)
+        # returns (view of the filled prefix, new filled length)
+        return self.data[:, :self.seq_len], self.seq_len
+    
+@dataclass
+class Config:
+    batch_size: int
+    dim: int
+    n_heads: int
+    q_lora_rank: int 
+    kv_lora_rank: int
+    qk_nope_head_dim: int
+    qk_rope_head_dim: int
+    v_head_dim: int
+    seq_len: int
+    max_seq_len: int
+    kv_cache_shape: tuple
+    Q_proj_down_weight: torch.Tensor
+    Q_proj_up_weight: torch.Tensor
+    KV_proj_down_weight: torch.Tensor
+    KV_proj_up_weight: torch.Tensor
+    wo_weight: torch.Tensor
+
+class MLA(nn.Module):
+    def __init__(self, config: Config):
+        super().__init__()
+        self.dim = config.dim
+        self.n_heads = config.n_heads
+        self.q_lora_rank = config.q_lora_rank
+        self.kv_lora_rank = config.kv_lora_rank
+        self.nope_head_dim = config.qk_nope_head_dim
+        self.rope_head_dim = config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        # Down-projection matrices
+        self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False)
+        self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False)
+
+        # Up-projection and rope projection matrices
+        self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+        self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+
+        # RoPE on half embeddings
+        self.q_rope = RoPE(self.rope_head_dim)
+        self.k_rope = RoPE(self.rope_head_dim)
+
+        # Output projection
+        self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+        self.eps = 1e-6
+   
+    def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+        # seq_len = 1 always here
+        batch_size, seq_len, model_dim = x.size()
+
+        ################################################################################
+        #                 Step 1: Handle down-projection + KV cache                    #
+        ################################################################################
+        q_lora = self.Q_proj_down(x)
+        kv_lora = self.KV_proj_down(x)
+        kv_lora, kv_len = kv_cache(kv_lora)
+        query_pos = kv_len - 1
+
+        ################################################################################
+        #                  Step 2: Up-project and prepare NoPE + RoPE                  #
+        ################################################################################
+
+        # Handle queries Q first
+        q_nope_and_rope = self.Q_proj_up(q_lora).view(
+            batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+        q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+        # Handle keys and values K/V. V does not need RoPE
+        kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+        kv_nope = self.KV_proj_up(kv_nope).view(
+            batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+        k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+        ################################################################################
+        #                    Step 3: Handle RoPE Stream                                #
+        ################################################################################
+
+        # Compute RoPE for queries and combine with no-RoPE part
+        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = self.q_rope(q_rope, start_pos=query_pos)
+        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q = torch.concat([q_nope, q_rope], dim=-1)
+
+        # Compute RoPE for keys and combine with no-RoPE part
+        k_rope = k_rope[:, :, None, :]
+        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k = torch.concat([k_nope, k_rope], dim=-1)
+                
+        ################################################################################
+        #                        Compute Multi-head Attention                          #
+        ################################################################################
+        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
+        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
+        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+        attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+        y = torch.matmul(attn, v).view(batch_size, 1, -1)
+        y = self.wo(y)
+
+        return y, kv_cache.get_data()
+
+def generate_input(batchsize, dim, dq, prefill, seed):
+    # Sizes derived from: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
+    gen = torch.Generator()
+    gen.manual_seed(seed)
+    
+    # Generate weights for linear layers
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen)
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen)
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen)
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen)
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen)
+
+    config = Config(
+        batch_size=batchsize,
+        dim=dim,
+        q_lora_rank=dq,
+        n_heads=128,
+        kv_lora_rank=512,
+        qk_nope_head_dim=128,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        seq_len=1,
+        max_seq_len=8192,
+        kv_cache_shape=(batchsize, 8192, 512 + 64),
+        Q_proj_down_weight=Q_proj_down_weight,
+        Q_proj_up_weight=Q_proj_up_weight,
+        KV_proj_down_weight=KV_proj_down_weight,
+        KV_proj_up_weight=KV_proj_up_weight,
+        wo_weight=wo_weight,
+    )
+    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen)
+    
+    # Pre-fill KV cache
+    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim))
+    pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
+                                 dtype=torch.bfloat16, generator=gen)
+    kv_cache(pre_filled_cache)
+
+    return config, x, kv_cache
+
+def ref_kernel(data: input_t) -> output_t:
+    config, x, kv_cache = data
+
+    # Load in model weights
+    model = MLA(config)
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+    model.wo.weight = nn.Parameter(config.wo_weight)
+
+    output, kv_cache = model(x, kv_cache)
+    return output, kv_cache
+
+check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=1e-03)  
+
+
+def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
+    # Warmup runs
+    for _ in range(num_warmup):
+        output, _ = model(x, kv_cache)
+        torch.cuda.synchronize()
+
+    # Timed runs
+    times = []
+    for _ in range(num_trials):
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        
+        start.record()
+        output, updated_kv = model(x, kv_cache)
+        end.record()
+        
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
+
+    avg_time = sum(times) / len(times)
+    return output, updated_kv, avg_time, times
+
+if __name__ == "__main__":
+    # Generate test input
+    batchsize = 128
+    dim = 7168 
+    dq = 1536
+    prefill = 512
+    seed = 97
+
+    # Create model and inputs
+    config, x, kv_cache = generate_input(batchsize, dim, dq, prefill, seed)
+    model = MLA(config)
+
+    # Run model with timing
+    output, updated_kv, avg_time, times = time_mla(model, x, kv_cache)
+
+    print(f"Input shape: {x.shape}")
+    print(f"Output shape: {output.shape}")
+    print(f"Updated KV cache shape: {updated_kv.shape}")
+    print("\nFirst few values of output:")
+    print(output[0, :10])
+    print(f"\nTiming results over {len(times)} runs (ms):")
+    print(f"Average: {avg_time:.2f}")
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
new file mode 100644
index 00000000..cbfd75d4
--- /dev/null
+++ b/problems/amd/mla-decode/submission.py
@@ -0,0 +1,165 @@
+import math
+from dataclasses import dataclass
+import torch
+from torch import nn
+import torch.nn.functional as F
+from task import input_t, output_t
+
+class RoPE(nn.Module):
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.d_model = d_model
+        theta = 10000 ** (-torch.arange(0, d_model//2,dtype=torch.bfloat16) / (d_model//2))
+        self.register_buffer("theta", theta)
+
+    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+
+    def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+        seq_len = x.size(-2)
+        d_model = x.size(-1)
+        assert d_model == self.d_model
+        seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+        idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+        cos = idx_theta2.cos().to(torch.bfloat16)
+        sin = idx_theta2.sin().to(torch.bfloat16)
+        return x * cos + self.rotate_half(x) * sin
+
+
+class KVCache(nn.Module):
+    def __init__(self, kv_cache_shape: tuple) -> None:
+        super().__init__()
+        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+        self.seq_len = 0
+        self.zero()
+
+    def zero(self) -> None:
+        self.data.zero_()
+    
+    def get_data(self) -> torch.Tensor:
+        return self.data
+
+    def forward(self, c_kv: torch.Tensor) -> tuple[torch.Tensor, int]:
+        assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+        # write c_kv after the already-filled prefix along dim 1, then advance seq_len
+        self.data = self.data.to(c_kv.dtype)
+        self.data[
+            :, self.seq_len : self.seq_len + c_kv.size(1), :
+        ] = c_kv
+        self.seq_len += c_kv.size(1)
+        # returns (view of the filled prefix, new filled length)
+        return self.data[:, :self.seq_len], self.seq_len
+    
+@dataclass
+class Config:
+    batch_size: int
+    dim: int
+    n_heads: int
+    q_lora_rank: int 
+    kv_lora_rank: int
+    qk_nope_head_dim: int
+    qk_rope_head_dim: int
+    v_head_dim: int
+    seq_len: int
+    max_seq_len: int
+    kv_cache_shape: tuple
+    Q_proj_down_weight: torch.Tensor
+    Q_proj_up_weight: torch.Tensor
+    KV_proj_down_weight: torch.Tensor
+    KV_proj_up_weight: torch.Tensor
+    wo_weight: torch.Tensor
+
+class MLA(nn.Module):
+    def __init__(self, config: Config):
+        super().__init__()
+        self.dim = config.dim
+        self.n_heads = config.n_heads
+        self.q_lora_rank = config.q_lora_rank
+        self.kv_lora_rank = config.kv_lora_rank
+        self.nope_head_dim = config.qk_nope_head_dim
+        self.rope_head_dim = config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        # Down-projection matrices
+        self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, bias=False, dtype=torch.bfloat16)
+        self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, bias=False, dtype=torch.bfloat16)
+
+        # Up-projection and rope projection matrices
+        self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+        self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+
+        # RoPE on half embeddings
+        self.q_rope = RoPE(self.rope_head_dim)
+        self.k_rope = RoPE(self.rope_head_dim)
+
+        # Output projection
+        self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+        self.eps = 1e-6
+   
+    def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+        # seq_len = 1 always here
+        batch_size, seq_len, model_dim = x.size()
+
+        ################################################################################
+        #                 Step 1: Handle down-projection + KV cache                    #
+        ################################################################################
+        q_lora = self.Q_proj_down(x)
+        kv_lora = self.KV_proj_down(x)
+        kv_lora, kv_len = kv_cache(kv_lora)
+        query_pos = kv_len - 1
+
+        ################################################################################
+        #                  Step 2: Up-project and prepare NoPE + RoPE                  #
+        ################################################################################
+
+        # Handle queries Q first
+        q_nope_and_rope = self.Q_proj_up(q_lora).view(
+            batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+        q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+        # Handle keys and values K/V. V does not need RoPE
+        kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+        kv_nope = self.KV_proj_up(kv_nope).view(
+            batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+        k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+        ################################################################################
+        #                    Step 3: Handle RoPE Stream                                #
+        ################################################################################
+
+        # Compute RoPE for queries and combine with no-RoPE part
+        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = self.q_rope(q_rope, start_pos=query_pos)
+        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q = torch.concat([q_nope, q_rope], dim=-1)
+
+        # Compute RoPE for keys and combine with no-RoPE part
+        k_rope = k_rope[:, :, None, :]
+        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k = torch.concat([k_nope, k_rope], dim=-1)
+                
+        ################################################################################
+        #                        Compute Multi-head Attention                          #
+        ################################################################################
+        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
+        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
+        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+        attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+        y = torch.matmul(attn, v).view(batch_size, 1, -1)
+        y = self.wo(y)
+
+        return y, kv_cache.get_data()
+
+def custom_kernel(data: input_t) -> output_t:
+    config, x, kv_cache = data
+    model = MLA(config)
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+    model.wo.weight = nn.Parameter(config.wo_weight)
+
+    output, kv_cache = model(x, kv_cache)
+    return output, kv_cache
\ No newline at end of file
diff --git a/problems/amd/mla-decode/task.py b/problems/amd/mla-decode/task.py
new file mode 100644
index 00000000..af04a3f8
--- /dev/null
+++ b/problems/amd/mla-decode/task.py
@@ -0,0 +1,12 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.nn.Module, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor])
+
+class TestSpec(TypedDict):
+    batchsize: int
+    dim: int
+    dq: int
+    prefill: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/mla-decode/task.yml b/problems/amd/mla-decode/task.yml
new file mode 100644
index 00000000..ca4706ec
--- /dev/null
+++ b/problems/amd/mla-decode/task.yml
@@ -0,0 +1,74 @@
+# name: mla-decode-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  You will implement a custom mla decode kernel optimized for MI300. A few things are simplified here:
+
+  1. Q, K, V data type as bfloat16
+    
+  2. decode only with pre-allocated non-paged latent kv cache
+
+  3. return the updated kv cache together with the MLA output
+
+  The shapes of all outer and inner dimensions of the tensors are taken from DeepSeek-R1, with the number of heads split to fit on one GPU.
+  To be explicit, you will be given a tuple of tensors:
+
+  ```yml
+  input [bs, sq, dim]
+  attn_output [bs, n_heads, sq, v_head_dim]
+  kv_cache [bs, sq, kv_lora_rank + qk_rope_head_dim]
+  ``` 
+
+  where 
+
+  0. bs::128 # batch size
+  1. prefill::[512, 2048, 4096, 6144] # as kv length
+  2. sq::1 # as only consider decoding
+  3. dim::7168 # hidden size of deepseek v3
+  4. kv_lora_rank::[512] # kv lora rank of deepseek v3
+  5. qk_rope_head_dim::[64] # rope embedding dimension
+  6. v_head_dim::128 # head size
+  7. n_heads::128 # num of attn heads
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+ 
+  The speed of light analysis is:
+  | bs | prefill | sq | dtype |  roofline time(us) |
+  |---|---|---|---|---|
+  | 128 | 512 | 1 | bf16 | 54.62 |
+  | 128 | 2048 | 1 | bf16 | 141.16 |
+  | 128 | 4096 | 1 | bf16 | 210.75 |
+  | 128 | 6144 | 1 | bf16 | 280.87 | 
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+test_timeout: 900
+benchmark_timeout: 900
+ranked_timeout: 1200
+
+tests:
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 128, "seed": 9247}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 512, "seed": 2197}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 1024, "seed": 9107}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 2048, "seed": 5291}
+
+benchmarks:
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 4096, "seed": 9817}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 6144, "seed": 5291}
+
+ranking_by: "geom"
diff --git a/problems/amd/mla-decode/utils.py b/problems/amd/mla-decode/utils.py
new file mode 100644
index 00000000..afe2001d
--- /dev/null
+++ b/problems/amd/mla-decode/utils.py
@@ -0,0 +1,149 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Descriptions of the mismatches found (or of a shape mismatch); empty if the tensors are all close within the given tolerance.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of detailed error messages
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    config, x, kv_cache = data
+    expected_mla, expected_kv = reference((config, x, kv_cache))
+    output_mla, output_kv = output
+    reasons_mla = verbose_allclose(output_mla, expected_mla, rtol=rtol, atol=atol)
+    reasons_kv = verbose_allclose(output_kv, expected_kv, rtol=rtol, atol=atol)
+
+    if len(reasons_mla) > 0:
+        return "mismatch found on MLA output! custom implementation doesn't match reference: " + " ".join(reasons_mla)
+    if len(reasons_kv) > 0:
+        return "mismatch found on KV cache output! custom implementation doesn't match reference: " + " ".join(reasons_kv)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
\ No newline at end of file

From 44275fdf2fd7d97b53a5d40e629e1f2268c9430e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 19 May 2025 09:32:54 -0700
Subject: [PATCH 058/132] add mla problem

---
 problems/amd.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index bcf91f67..722dbb8e 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -19,4 +19,9 @@ problems:
     name: amd-mixture-of-experts
     deadline: "2025-05-27"
     gpus:
-      - MI300
\ No newline at end of file
+      - MI300
+  - directory: amd/mla-decode
+    name: amd-mla-decode
+    deadline: "2025-05-27"
+    gpus:
+      - MI300

From ba4cf90c994c4241bda4ffac9c6bb4523b838eff Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@gmail.com>
Date: Mon, 19 May 2025 17:18:34 -0700
Subject: [PATCH 059/132] Update amd.yaml

---
 problems/amd.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 722dbb8e..16940f4b 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -1,27 +1,27 @@
 name: AMD Developer Challenge 2025
 # when does this end (individual problems might close earlier)
-deadline: "2025-05-27"
+deadline: "2025-06-02"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Inference Sprint"
 # the list of problems
 problems:
   - directory: amd/identity
     name: amd-identity
-    deadline: "2025-06-08"
+    deadline: "2025-06-02"
     gpus:
       - MI300
   - directory: amd/fp8-mm
     name: amd-fp8-mm
-    deadline: "2025-05-27"
+    deadline: "2025-06-02"
     gpus:
       - MI300
   - directory: amd/moe
     name: amd-mixture-of-experts
-    deadline: "2025-05-27"
+    deadline: "2025-06-02"
     gpus:
       - MI300
   - directory: amd/mla-decode
     name: amd-mla-decode
-    deadline: "2025-05-27"
+    deadline: "2025-06-02"
     gpus:
       - MI300

From 3e2cef3645ec46cd6906dbefc86d81633b30600c Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 10:45:00 -0400
Subject: [PATCH 060/132] Update reference.py

---
 problems/amd/mla-decode/reference.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 10b1884b..550f54ef 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -154,15 +154,15 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
 
 def generate_input(batchsize, dim, dq, prefill, seed):
     # Sizes derived from: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-    gen = torch.Generator()
+    gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
     
     # Generate weights for linear layers
-    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen)
-    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen)
-    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen)
-    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen)
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen)
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda')
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda')
 
     config = Config(
         batch_size=batchsize,
@@ -182,10 +182,10 @@ def generate_input(batchsize, dim, dq, prefill, seed):
         KV_proj_up_weight=KV_proj_up_weight,
         wo_weight=wo_weight,
     )
-    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen)
+    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
     
     # Pre-fill KV cache
-    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim))
+    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
     pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
                                  dtype=torch.bfloat16, generator=gen)
     kv_cache(pre_filled_cache)
@@ -253,4 +253,4 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
     print(output[0, :10])
     print(f"\nTiming results over {len(times)} runs (ms):")
     print(f"Average: {avg_time:.2f}")
-    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")

From 74d11b114ae15c731992123eac726984f0b9dcc3 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 10:59:57 -0400
Subject: [PATCH 061/132] Update reference.py

---
 problems/amd/mla-decode/reference.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 550f54ef..7dcff278 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -186,7 +186,7 @@ def generate_input(batchsize, dim, dq, prefill, seed):
     
     # Pre-fill KV cache
     kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
-    pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
+    pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), device='cuda', 
                                  dtype=torch.bfloat16, generator=gen)
     kv_cache(pre_filled_cache)
 
@@ -196,7 +196,7 @@ def ref_kernel(data: input_t) -> output_t:
     config, x, kv_cache = data
 
     # Load in model weights
-    model = MLA(config)
+    model = MLA(config).cuda()
     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
@@ -241,7 +241,7 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
 
     # Create model and inputs
     config, x, kv_cache = generate_input(batchsize, dim, dq, prefill, seed)
-    model = MLA(config)
+    model = MLA(config).cuda()
 
     # Run model with timing
     output, updated_kv, avg_time, times = time_mla(model, x, kv_cache)

From f03748528a8ab3eef6eb34bc71b20f19e1236b27 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 14:49:36 -0400
Subject: [PATCH 062/132] updated ref

---
 problems/amd/mla-decode/reference.py  | 53 +++++++++++++++++----------
 problems/amd/mla-decode/submission.py |  4 +-
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 10b1884b..aaf8e756 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -29,8 +29,8 @@ def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
         return x * cos + self.rotate_half(x) * sin
 
 class KVCache(nn.Module):
-    def __init__(self, kv_cache_shape: tuple) -> None:
-        super().__init__()
+    def __init__(self, kv_cache_shape: tuple, **kwargs) -> None:
+        super().__init__(**kwargs)
         self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
         self.seq_len = 0
         self.zero()
@@ -154,15 +154,15 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
 
 def generate_input(batchsize, dim, dq, prefill, seed):
     # Sizes derived from: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-    gen = torch.Generator()
+    gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
     
     # Generate weights for linear layers
-    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen)
-    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen)
-    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen)
-    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen)
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen)
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda')
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda')
 
     config = Config(
         batch_size=batchsize,
@@ -182,12 +182,12 @@ def generate_input(batchsize, dim, dq, prefill, seed):
         KV_proj_up_weight=KV_proj_up_weight,
         wo_weight=wo_weight,
     )
-    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen)
+    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
     
     # Pre-fill KV cache
-    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim))
+    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
     pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
-                                 dtype=torch.bfloat16, generator=gen)
+                                 dtype=torch.bfloat16, generator=gen, device='cuda')
     kv_cache(pre_filled_cache)
 
     return config, x, kv_cache
@@ -196,12 +196,12 @@ def ref_kernel(data: input_t) -> output_t:
     config, x, kv_cache = data
 
     # Load in model weights
-    model = MLA(config)
-    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
-    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
-    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
-    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
-    model.wo.weight = nn.Parameter(config.wo_weight)
+    model = MLA(config).to('cuda')
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight, device='cuda')
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight, device='cuda')
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight, device='cuda')
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight, device='cuda')
+    model.wo.weight = nn.Parameter(config.wo_weight, device='cuda')
 
     output, kv_cache = model(x, kv_cache)
     return output, kv_cache
@@ -210,14 +210,16 @@ def ref_kernel(data: input_t) -> output_t:
 
 
 def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
+
     # Warmup runs
-    for _ in range(num_warmup):
+    for _ in range(1):
         output, _ = model(x, kv_cache)
         torch.cuda.synchronize()
 
     # Timed runs
     times = []
     for _ in range(num_trials):
+        kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)
         
@@ -241,11 +243,24 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
 
     # Create model and inputs
     config, x, kv_cache = generate_input(batchsize, dim, dq, prefill, seed)
-    model = MLA(config)
+    model = MLA(config).to('cuda')
 
     # Run model with timing
     output, updated_kv, avg_time, times = time_mla(model, x, kv_cache)
 
+    # Test reference kernel
+    ref_output, ref_kv = ref_kernel((config, x, kv_cache))
+    print("\nReference kernel output:")
+    print(f"Output shape: {ref_output.shape}")
+    print(f"KV cache shape: {ref_kv.shape}")
+    print("\nFirst few values of reference output:")
+    print(ref_output[0, :10])
+
+    # Compare outputs
+    print("\nOutput difference:")
+    print(f"Max absolute difference: {torch.max(torch.abs(output - ref_output))}")
+    print(f"Mean absolute difference: {torch.mean(torch.abs(output - ref_output))}")
+
     print(f"Input shape: {x.shape}")
     print(f"Output shape: {output.shape}")
     print(f"Updated KV cache shape: {updated_kv.shape}")
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index cbfd75d4..0d736242 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -31,7 +31,7 @@ def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
 class KVCache(nn.Module):
     def __init__(self, kv_cache_shape: tuple) -> None:
         super().__init__()
-        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16, device='cuda'))
         self.seq_len = 0
         self.zero()
 
@@ -154,7 +154,7 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
 
 def custom_kernel(data: input_t) -> output_t:
     config, x, kv_cache = data
-    model = MLA(config)
+    model = MLA(config).to('cuda')
     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)

From 8a87e65b5501f19e6d8e0152395480241dc5ce4e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 16:08:53 -0400
Subject: [PATCH 063/132] update mla decode

---
 problems/amd/mla-decode/eval.py      | 66 ++++++++++++++--------------
 problems/amd/mla-decode/reference.py | 10 ++---
 problems/amd/mla-decode/utils.py     | 11 ++++-
 3 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/problems/amd/mla-decode/eval.py b/problems/amd/mla-decode/eval.py
index 34d3ae84..3f67cd64 100644
--- a/problems/amd/mla-decode/eval.py
+++ b/problems/amd/mla-decode/eval.py
@@ -55,20 +55,20 @@ def copy_kv_cache(module, kv_cache_shape):
     # Copy parameters
     params = OrderedDict()
     for name, param in module.named_parameters():
-        params[name] = param.clone().requires_grad_(param.requires_grad)
+        params[name] = param.clone().requires_grad_(param.requires_grad).cuda()
         
     # Copy buffers
     buffers = OrderedDict()
     for name, buff in module.named_buffers():
         print(f"Buff name: {name}, shape: {buff.shape}")
-        buffers[name] = buff.clone()
+        buffers[name] = buff.clone().cuda()
     
     # Assign params and buffers to copied module
     copied_module.load_state_dict(params, strict=False)
     copied_module.load_state_dict(buffers, strict=False)
     copied_module.seq_len = module.seq_len
     
-    return copied_module
+    return copied_module.cuda()
 
 
 def get_test_cases(file_name: str) -> list[TestCase]:
@@ -148,10 +148,10 @@ def copy_config_weights(config):
     """
     return dataclasses.replace(
         config,
-        Q_proj_down_weight=config.Q_proj_down_weight.clone(),
-        Q_proj_up_weight=config.Q_proj_up_weight.clone(),
-        KV_proj_down_weight=config.KV_proj_down_weight.clone(),
-        KV_proj_up_weight=config.KV_proj_up_weight.clone()
+        Q_proj_down_weight=config.Q_proj_down_weight.clone().cuda(),
+        Q_proj_up_weight=config.Q_proj_up_weight.clone().cuda(),
+        KV_proj_down_weight=config.KV_proj_down_weight.clone().cuda(),
+        KV_proj_up_weight=config.KV_proj_up_weight.clone().cuda()
     )
 
 
@@ -206,8 +206,9 @@ def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: floa
     # first, one obligatory correctness check; also triggers triton compile for the given shape
     kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
     config_copy = copy_config_weights(config)
-    output = custom_kernel((config, data, kv_cache))
-    error = check_implementation((config_copy, data, kv_cache_copy), output)
+    with torch.no_grad():
+        output = custom_kernel((config, data, kv_cache))
+        error = check_implementation((config_copy, data, kv_cache_copy), output)
     if error:
         return error
 
@@ -216,29 +217,30 @@ def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: floa
     # otherwise, we repeat until we either measure at least 10 full seconds,
     # or the relative error of the mean is below 1%.
 
-    for i in range(max_repeats):
-        if recheck:
-            config, data, kv_cache = generate_input(**test.args)
-            kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
-            config_copy = copy_config_weights(config)
-        torch.cuda.synchronize()
-        start = time.perf_counter_ns()
-        output = custom_kernel((config, data, kv_cache))
-        torch.cuda.synchronize()
-        end = time.perf_counter_ns()
-
-        if recheck:
-            error = check_implementation((config_copy, data, kv_cache_copy), output)
-            if error:
-                return error
-
-        del output
-        durations.append(end-start)
-
-        if i > 1:
-            stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
-                break
+    with torch.no_grad():
+        for i in range(max_repeats):
+            if recheck:
+                config, data, kv_cache = generate_input(**test.args)
+                kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+                config_copy = copy_config_weights(config)
+            torch.cuda.synchronize()
+            start = time.perf_counter_ns()
+            output = custom_kernel((config, data, kv_cache))
+            torch.cuda.synchronize()
+            end = time.perf_counter_ns()
+
+            if recheck:
+                error = check_implementation((config_copy, data, kv_cache_copy), output)
+                if error:
+                    return error
+
+            del output
+            durations.append(end-start)
+
+            if i > 1:
+                stats = calculate_stats(durations)
+                if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
+                    break
 
     return calculate_stats(durations)
 
diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index aaf8e756..d2cf075b 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -197,11 +197,11 @@ def ref_kernel(data: input_t) -> output_t:
 
     # Load in model weights
     model = MLA(config).to('cuda')
-    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight, device='cuda')
-    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight, device='cuda')
-    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight, device='cuda')
-    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight, device='cuda')
-    model.wo.weight = nn.Parameter(config.wo_weight, device='cuda')
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+    model.wo.weight = nn.Parameter(config.wo_weight)
 
     output, kv_cache = model(x, kv_cache)
     return output, kv_cache
diff --git a/problems/amd/mla-decode/utils.py b/problems/amd/mla-decode/utils.py
index afe2001d..e3f231e8 100644
--- a/problems/amd/mla-decode/utils.py
+++ b/problems/amd/mla-decode/utils.py
@@ -129,9 +129,18 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
+    output_mla, output_kv = output
+
+    # To fit in memory for the big test
+    output_mla = output_mla.cpu()
+    output_kv = output_kv.cpu()
+
     config, x, kv_cache = data
     expected_mla, expected_kv = reference((config, x, kv_cache))
-    output_mla, output_kv = output
+
+    output_mla = output_mla.cuda()
+    output_kv = output_kv.cuda()
+
     reasons_mla = verbose_allclose(output_mla, expected_mla, rtol=rtol, atol=atol)
     reasons_kv = verbose_allclose(output_kv, expected_kv, rtol=rtol, atol=atol)
 

From 47b5085085fa15819cdfaa33eb43fb294fa49f6c Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Thu, 22 May 2025 07:32:02 -0400
Subject: [PATCH 064/132] update weights

---
 problems/amd/mla-decode/reference.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index d2cf075b..2fc99b9e 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -158,11 +158,11 @@ def generate_input(batchsize, dim, dq, prefill, seed):
     gen.manual_seed(seed)
     
     # Generate weights for linear layers
-    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
-    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
-    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda')
-    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda')
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda')
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
 
     config = Config(
         batch_size=batchsize,

From ef81df15030d1eec465b400581e3a65e1d31bf82 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Fri, 23 May 2025 12:30:08 -0400
Subject: [PATCH 065/132] updated rope

---
 problems/amd/mla-decode/reference.py  | 7 ++++---
 problems/amd/mla-decode/submission.py | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 838ea40f..cb8f4e36 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -135,8 +135,9 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         q = torch.concat([q_nope, q_rope], dim=-1)
 
         # Compute RoPE for keys and combine with no-RoPE part
-        k_rope = k_rope[:, :, None, :]
-        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k_rope = k_rope[:, None, :, :]
+        k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
+        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
@@ -268,4 +269,4 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
     print(output[0, :10])
     print(f"\nTiming results over {len(times)} runs (ms):")
     print(f"Average: {avg_time:.2f}")
-    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index 0d736242..28d4f98b 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -134,9 +134,11 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
         q = torch.concat([q_nope, q_rope], dim=-1)
 
+
         # Compute RoPE for keys and combine with no-RoPE part
-        k_rope = k_rope[:, :, None, :]
-        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k_rope = k_rope[:, None, :, :]
+        k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
+        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################

From 79cbbfd498909dfcd515661c1c4f867a54f59bd0 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Fri, 23 May 2025 13:23:52 -0400
Subject: [PATCH 066/132] Update w_o weight normalization factor

---
 problems/amd/mla-decode/reference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index cb8f4e36..1d8d8b83 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -163,7 +163,7 @@ def generate_input(batchsize, dim, dq, prefill, seed):
     KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
     Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
     KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128)
 
     config = Config(
         batch_size=batchsize,
@@ -269,4 +269,4 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
     print(output[0, :10])
     print(f"\nTiming results over {len(times)} runs (ms):")
     print(f"Average: {avg_time:.2f}")
-    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")

From c7f4a22098b669a77b0ff7947ec30fe18541559b Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Sat, 24 May 2025 00:17:23 -0400
Subject: [PATCH 067/132] permute instead of reshape

---
 problems/amd/mla-decode/reference.py  | 5 +++--
 problems/amd/mla-decode/submission.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index cb8f4e36..d10bb4bb 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -131,13 +131,14 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         # Compute RoPE for queries and combine with no-RoPE part
         q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
+
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
+        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index 28d4f98b..f1fb8629 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -131,14 +131,14 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         # Compute RoPE for queries and combine with no-RoPE part
         q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
 
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
+        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################

From d4c24d93de911f7aa7c09126abeaf1bcd4befc94 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Sat, 24 May 2025 02:24:25 -0400
Subject: [PATCH 068/132] fix rope bugs found by jpy794

---
 problems/amd/mla-decode/reference.py  | 11 +++++------
 problems/amd/mla-decode/submission.py | 11 +++++------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index d10bb4bb..f4567238 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -129,24 +129,23 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         ################################################################################
 
         # Compute RoPE for queries and combine with no-RoPE part
-        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
+
+        q_nope = q_nope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
 
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
+        k_nope = k_nope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
         #                        Compute Multi-head Attention                          #
         ################################################################################
-        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
-        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
-        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        v = v.permute(0, 2, 1, 3) # bs x n_heads x kv_len x v_head_dim
         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
         y = torch.matmul(attn, v).view(batch_size, 1, -1)
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index f1fb8629..2165f988 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -129,24 +129,23 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         ################################################################################
 
         # Compute RoPE for queries and combine with no-RoPE part
-        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
+
+        q_nope = q_nope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
 
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
+        k_nope = k_nope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
         #                        Compute Multi-head Attention                          #
         ################################################################################
-        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
-        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
-        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        v = v.permute(0, 2, 1, 3) # bs x n_heads x kv_len x v_head_dim
         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
         y = torch.matmul(attn, v).view(batch_size, 1, -1)

From ecb7e0afca28669731780bcc6a5cd8d66335c478 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Sun, 25 May 2025 07:25:23 +0000
Subject: [PATCH 069/132] adjust mla atol

---
 problems/amd/mla-decode/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 6e13313f..5c34f49d 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -207,7 +207,7 @@ def ref_kernel(data: input_t) -> output_t:
     output, kv_cache = model(x, kv_cache)
     return output, kv_cache
 
-check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=1e-03)  
+check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=8e-03)  
 
 
 def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):

From 0156b809d952e20d3d6ef0c55b28568647b3a89e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 26 May 2025 14:56:38 -0700
Subject: [PATCH 070/132] Create LICENSE

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..e46d5080
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 GPU MODE
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From 53869d5db6e4e8b0b7e8e2d7977bbb6cedf19fc2 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 2 Jun 2025 14:37:37 -0400
Subject: [PATCH 071/132] update conv2d reference with cudnn backend context
 set

---
 problems/pmpp/conv2d_py/reference.py | 36 ++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 9e5e1a74..52cb98d8 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -3,6 +3,21 @@
 import torch.nn.functional as F
 from task import input_t, output_t
 
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
 
 def ref_kernel(data: input_t) -> output_t:
     """
@@ -12,16 +27,17 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
+    with DisableCuDNNTF32():
+        input_tensor, kernel = data
+        return F.conv2d(
+            input_tensor, 
+            kernel,
 
-        # No padding and no striding
-        # TODO: Can revisit this in future problems
-        stride=1,
-        padding=0
-    )
+            # No padding and no striding
+            # TODO: Can revisit this in future problems
+            stride=1,
+            padding=0
+        )
 
 
 def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
@@ -53,4 +69,4 @@ def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed:
     return (input_tensor, kernel)
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file

From b53e65f505fbdeb8d7a5539407190963d6bcbdba Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 2 Jun 2025 14:45:43 -0400
Subject: [PATCH 072/132] update conv2d template

---
 problems/pmpp/conv2d_py/submission.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/problems/pmpp/conv2d_py/submission.py b/problems/pmpp/conv2d_py/submission.py
index a1b7d16d..991f6a50 100644
--- a/problems/pmpp/conv2d_py/submission.py
+++ b/problems/pmpp/conv2d_py/submission.py
@@ -12,6 +12,9 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
+
+    torch.backends.cudnn.allow_tf32 = False
+    torch.backends.cudnn.deterministic = True
     input_tensor, kernel = data
     return F.conv2d(
         input_tensor, 

From 9e359a31b0cb18a3df932b63663dacbcdab02cb0 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Thu, 5 Jun 2025 17:35:56 +0200
Subject: [PATCH 073/132] Extend amd deadlines

---
 problems/amd.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 16940f4b..88859477 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -7,21 +7,21 @@ description: "AMD Developer Challenge 2025: Inference Sprint"
 problems:
   - directory: amd/identity
     name: amd-identity
-    deadline: "2025-06-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300
   - directory: amd/fp8-mm
     name: amd-fp8-mm
-    deadline: "2025-06-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300
   - directory: amd/moe
     name: amd-mixture-of-experts
-    deadline: "2025-06-02"
+    deadline: "2025-00-02"
     gpus:
       - MI300
   - directory: amd/mla-decode
     name: amd-mla-decode
-    deadline: "2025-06-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300

From c94f9f2ac213ffdd835a5d9fce373af982e2bbf2 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Thu, 5 Jun 2025 18:36:48 +0200
Subject: [PATCH 074/132] fix-amd-moe-deadline

---
 problems/amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 88859477..ad721d6c 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -17,7 +17,7 @@ problems:
       - MI300
   - directory: amd/moe
     name: amd-mixture-of-experts
-    deadline: "2025-00-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300
   - directory: amd/mla-decode

From c11c1aa616c9c6de833f28b267f1a303764e725e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 15:54:38 -0400
Subject: [PATCH 075/132] add trimul kernel

---
 problems/bioml.yaml                 |  18 +++
 problems/bioml/trimul/reference.py  | 188 ++++++++++++++++++++++++++++
 problems/bioml/trimul/submission.py |  95 ++++++++++++++
 problems/bioml/trimul/task.py       |  14 +++
 problems/bioml/trimul/task.yml      |  65 ++++++++++
 problems/bioml/trimul/utils.py      | 143 +++++++++++++++++++++
 6 files changed, 523 insertions(+)
 create mode 100644 problems/bioml.yaml
 create mode 100644 problems/bioml/trimul/reference.py
 create mode 100644 problems/bioml/trimul/submission.py
 create mode 100644 problems/bioml/trimul/task.py
 create mode 100644 problems/bioml/trimul/task.yml
 create mode 100644 problems/bioml/trimul/utils.py

diff --git a/problems/bioml.yaml b/problems/bioml.yaml
new file mode 100644
index 00000000..d9afca92
--- /dev/null
+++ b/problems/bioml.yaml
@@ -0,0 +1,18 @@
+name: BioML Kernels (no cash prizes)
+
+deadline: ""
+# A description for this particular competition
+description: "Popular and important kernels for BioML models like AlphaFold3"
+
+# the list of problems
+problems:
+  - directory: bioml/trimul
+    name: trimul
+    deadline: "2025-09-30"
+    gpus:
+      - B200
+      - H100
+      - A100
+      - T4
+      - L4
+      - MI300
diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
new file mode 100644
index 00000000..c805f300
--- /dev/null
+++ b/problems/bioml/trimul/reference.py
@@ -0,0 +1,188 @@
+from utils import make_match_reference
+from task import input_t, output_t
+
+import torch
+from torch import nn, einsum
+import math
+
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
+# Reference code in PyTorch
+class TriMul(nn.Module):
+    # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+    ):
+        super().__init__()
+
+        self.norm = nn.LayerNorm(dim, bias=False)
+
+        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
+        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        """
+        x: [bs, seq_len, seq_len, dim]
+        mask: [bs, seq_len, seq_len]
+
+        Returns:
+            output: [bs, seq_len, seq_len, dim]
+        """
+        batch_size, seq_len, _, dim = x.shape
+
+        x = self.norm(x)
+
+        left = self.left_proj(x)
+        right = self.right_proj(x)
+
+        mask = mask.unsqueeze(-1)
+        left = left * mask
+        right = right * mask
+
+        left_gate = self.left_gate(x).sigmoid()
+        right_gate = self.right_gate(x).sigmoid()
+        out_gate = self.out_gate(x).sigmoid()
+
+        left = left * left_gate
+        right = right * right_gate
+
+        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
+        # This einsum is the same as the following:
+        # out = torch.zeros(batch_size, seq_len, seq_len, hidden_dim, device=x.device)
+        
+        # # Compute using nested loops
+        # for b in range(batch_size):
+        #     for i in range(seq_len):
+        #         for j in range(seq_len):
+        #             # Compute each output element
+        #             for k in range(seq_len):
+        #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
+
+        out = self.to_out_norm(out)
+        out = out * out_gate
+        return self.to_out(out)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of TriMul using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
+            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+    """
+
+    input_tensor, mask, weights, config = data
+    trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
+
+    # Fill in the given weights of the model
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
+
+    output = trimul(input_tensor, mask)
+
+    return output
+
+
+# Input generation for the reference code
+def generate_input(
+    seqlen: int,
+    bs: int,
+    dim: int,
+    hiddendim: int,
+    seed: int,
+    nomask: bool
+) -> input_t:
+
+    # Workaround: `_` in argument names isn't parsed correctly for now, so remap the underscore-free names here.
+    batch_size = bs
+    seq_len = seqlen
+    hidden_dim = hiddendim
+    no_mask = nomask
+
+    config = {
+        "hidden_dim": hidden_dim,
+        "dim": dim,
+    }
+
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+
+    weights = {}
+
+    input_tensor = torch.randn(
+        (batch_size, seq_len, seq_len, dim),
+        device='cuda',
+        dtype=torch.float32,
+        generator=gen
+    ).contiguous()
+
+    if no_mask:
+        mask = torch.ones(batch_size, seq_len, seq_len, device=input_tensor.device)
+    else:
+        mask = torch.randint(0, 2, (batch_size, seq_len, seq_len), device=input_tensor.device, generator=gen)
+
+    # Initialize model weights
+    weights["norm.weight"] = torch.randn(dim, 
+        device="cuda", 
+        dtype=torch.float32)
+
+    weights["left_proj.weight"] = torch.randn(hidden_dim, dim,
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["right_proj.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["left_gate.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["right_gate.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["out_gate.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["to_out_norm.weight"] = torch.randn(hidden_dim, 
+        device="cuda", dtype=torch.float32)
+
+    weights["to_out.weight"] = torch.randn(dim, hidden_dim, 
+        device="cuda", dtype=torch.float32) / math.sqrt(dim)
+
+    return (input_tensor, mask, weights, config)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file
diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
new file mode 100644
index 00000000..0c8b303f
--- /dev/null
+++ b/problems/bioml/trimul/submission.py
@@ -0,0 +1,95 @@
+#!POPCORN leaderboard trimul-dev
+
+import torch
+from torch import nn, einsum
+from task import input_t, output_t
+
+class TriMul(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+    ):
+        super().__init__()
+
+        self.norm = nn.LayerNorm(dim, bias=False)
+
+        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
+        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        """
+        x: [bs, seq_len, seq_len, dim]
+        mask: [bs, seq_len, seq_len]
+
+        Returns:
+            output: [bs, seq_len, seq_len, dim]
+        """
+        batch_size, seq_len, _, dim = x.shape
+
+        x = self.norm(x)
+
+        left = self.left_proj(x)
+        right = self.right_proj(x)
+
+        mask = mask.unsqueeze(-1)
+        left = left * mask
+        right = right * mask
+
+        left_gate = self.left_gate(x).sigmoid()
+        right_gate = self.right_gate(x).sigmoid()
+        out_gate = self.out_gate(x).sigmoid()
+
+        left = left * left_gate
+        right = right * right_gate
+
+        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
+        # This einsum is the same as the following:
+        # out = torch.zeros(batch_size, seq_len, seq_len, hidden_dim, device=x.device)
+        
+        # # Compute using nested loops
+        # for b in range(batch_size):
+        #     for i in range(seq_len):
+        #         for j in range(seq_len):
+        #             # Compute each output element
+        #             for k in range(seq_len):
+        #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
+
+        out = self.to_out_norm(out)
+        out = out * out_gate
+        return self.to_out(out)
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of TriMul using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
+            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+    """
+    input_tensor, mask, weights, config = data
+    trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
+
+    # Fill in the given weights of the model
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
+
+    output = trimul(input_tensor, mask)
+
+    return output
\ No newline at end of file
diff --git a/problems/bioml/trimul/task.py b/problems/bioml/trimul/task.py
new file mode 100644
index 00000000..0ffdfbb9
--- /dev/null
+++ b/problems/bioml/trimul/task.py
@@ -0,0 +1,14 @@
+from typing import TypedDict, TypeVar, Tuple, Dict
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict])
+output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+
+
+class TestSpec(TypedDict):
+    seqlen: int
+    bs: int
+    dim: int
+    hiddendim: int
+    seed: int
+    nomask: bool
\ No newline at end of file
diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
new file mode 100644
index 00000000..ced8b54a
--- /dev/null
+++ b/problems/bioml/trimul/task.yml
@@ -0,0 +1,65 @@
+# name: trimul
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  For a more complete description, see: https://tinyurl.com/gpumode-trimul
+  You will be implementing a Triangle Multiplicative Update (TriMul) module that is a core operation
+  for AlphaFold3, Chai, Protenix, and other protein structure prediction models in BioML.
+
+  The TriMul operator operates over a 4D tensor of shape [B, N, N, C]. 
+
+  Your task:
+  - Implement the "outgoing" version of the TriMul operator from the AlphaFold3 paper.
+  - You will not have to compute or store gradients for this version. You will only need to implement the forward pass.
+  
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+    - input: Input tensor of shape [bs, seq_len, seq_len, dim]
+    - mask: Mask tensor of shape [bs, seq_len, seq_len]
+    - weights: Dictionary containing model weights
+    - config: Dictionary containing model configuration parameters
+  
+  Output:
+  - Tuple containing:
+    - output: Processed tensor [bs, seq_len, seq_len, dim]
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+test_timeout: 540
+benchmark_timeout: 540
+ranked_timeout: 540
+ranking_by: "geom"
+
+tests:
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1092, "nomask": False}
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True} 
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 210284, "nomask": False} 
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 81934, "nomask": True}
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1832, "nomask": False}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1932, "nomask": True}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 10432, "nomask": False}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 731, "nomask": True}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 82102, "nomask": False}
+
+benchmarks:
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
+  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 121, "nomask": True}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 8331, "nomask": True}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 9120, "nomask": False}
\ No newline at end of file
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
new file mode 100644
index 00000000..24fc902a
--- /dev/null
+++ b/problems/bioml/trimul/utils.py
@@ -0,0 +1,143 @@
+import random
+from typing import Tuple
+
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):  # Seed every RNG touched below with the same value.
+    random.seed(seed)  # Python stdlib RNG
+    np.random.seed(seed)  # NumPy global RNG
+    torch.manual_seed(seed)  # torch default generator
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)  # current CUDA device
+        torch.cuda.manual_seed_all(seed)  # all CUDA devices
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Return the preferred torch device: CUDA, then MPS, then CPU (always CPU if `use_cuda` is False)."""
+    if not use_cuda:
+        return torch.device("cpu")
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> Tuple[bool, list[str]]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, reporting details about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol, atol (float): Relative and absolute tolerance; the per-element limit is atol + rtol * |expected|.
+    max_print (int): Maximum number of mismatched elements to print.
+    Returns: (True, ["Maximum error: ..."]) on success, otherwise (False, list of mismatch descriptions).
+    """
+    # A shape mismatch is reported immediately, without comparing any values
+    if received.shape != expected.shape:
+        return False, ["SIZE MISMATCH"]
+
+    # Absolute difference, computed after converting both tensors to float32
+    diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+
+    # Per-element tolerance, scaled by the magnitude of `expected` only (not symmetric)
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Elements whose difference exceeds the tolerance
+    tol_mismatched = diff > tolerance
+
+    # Positions where exactly one of the two tensors is NaN
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Positions where exactly one of the two tensors is +inf
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Positions where exactly one of the two tensors is -inf
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # An element is mismatched if any of the four criteria above flags it
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Report the total count plus at most `max_print` individual mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
+    """
+    Check whether two tensors are element-wise exactly equal, reporting details about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         (True, []) if the tensors are equal, otherwise (False, list of mismatch descriptions)
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Report the total count plus at most `max_print` individual mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Default `check_implementation` for tasks: compares `output` with `reference(data)`.
+
+    Returns `(good, message)`; `message` is '' when there are no reasons to report.
+    """
+    ok, details = verbose_allclose(output, reference(data), rtol=rtol, atol=atol)
+    # Reasons are joined with a literal backslash-n (two characters), not a real newline.
+    message = "\\n".join(details) if details else ''
+
+    return ok, message
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
\ No newline at end of file

From 1309d2f0b2252e34e24fb2ca075815f06b826e7e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 16:22:57 -0400
Subject: [PATCH 076/132] move fp32 context to utils

---
 problems/bioml/trimul/reference.py | 43 +++++++++++-------------------
 problems/bioml/trimul/utils.py     | 18 ++++++++++++-
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index c805f300..8b2d5d41 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -1,25 +1,10 @@
-from utils import make_match_reference
+from utils import make_match_reference, DisableCuDNNTF32
 from task import input_t, output_t
 
 import torch
 from torch import nn, einsum
 import math
 
-class DisableCuDNNTF32:
-    def __init__(self):
-        self.allow_tf32 = torch.backends.cudnn.allow_tf32
-        self.deterministic = torch.backends.cudnn.deterministic
-        pass
-
-    def __enter__(self):
-        torch.backends.cudnn.allow_tf32 = False
-        torch.backends.cudnn.deterministic = True
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
-
 # Reference code in PyTorch
 class TriMul(nn.Module):
     # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
@@ -97,21 +82,23 @@ def ref_kernel(data: input_t) -> output_t:
             - config: Dictionary containing model configuration parameters
     """
 
-    input_tensor, mask, weights, config = data
-    trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
+    # Use deterministic kernels and disable TF32 for accuracy
+    with DisableCuDNNTF32():
+        input_tensor, mask, weights, config = data
+        trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
 
-    # Fill in the given weights of the model
-    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
-    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
-    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
-    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
-    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
-    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
-    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
+        # Fill in the given weights of the model
+        trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
+        trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
+        trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
+        trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
+        trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
+        trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+        trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
 
-    output = trimul(input_tensor, mask)
+        output = trimul(input_tensor, mask)
 
-    return output
+        return output
 
 
 # Input generation for the reference code
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
index 24fc902a..ecfa6806 100644
--- a/problems/bioml/trimul/utils.py
+++ b/problems/bioml/trimul/utils.py
@@ -140,4 +140,20 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
-    return wrapped
\ No newline at end of file
+    return wrapped
+
+
+class DisableCuDNNTF32:
+    """Context manager: disables cuDNN TF32 and forces deterministic cuDNN; restores both flags on exit."""
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+
+    def __enter__(self):
+        # Snapshot at entry (not only at construction) so a stored or reused instance restores the right values.
+        self.allow_tf32, self.deterministic = torch.backends.cudnn.allow_tf32, torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32, torch.backends.cudnn.deterministic = False, True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32, torch.backends.cudnn.deterministic = self.allow_tf32, self.deterministic
\ No newline at end of file

From cd6a42877523a65acb409291507ae03e53cca378 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 16:28:32 -0400
Subject: [PATCH 077/132] remove popcorn header in template

---
 problems/bioml/trimul/submission.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index 0c8b303f..f6188a8f 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -1,5 +1,3 @@
-#!POPCORN leaderboard trimul-dev
-
 import torch
 from torch import nn, einsum
 from task import input_t, output_t
@@ -92,4 +90,4 @@ def custom_kernel(data: input_t) -> output_t:
 
     output = trimul(input_tensor, mask)
 
-    return output
\ No newline at end of file
+    return output

From af689a777812c1b7554dc15956133ef4e5ff9e9e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 16:31:55 -0400
Subject: [PATCH 078/132] remove low-memory gpus

---
 problems/bioml.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/problems/bioml.yaml b/problems/bioml.yaml
index d9afca92..3761aea6 100644
--- a/problems/bioml.yaml
+++ b/problems/bioml.yaml
@@ -13,6 +13,4 @@ problems:
       - B200
       - H100
       - A100
-      - T4
-      - L4
       - MI300

From 0b49302ac0ba9a1c5755ff6e296d603c52b217f1 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 18 Jun 2025 01:24:41 -0400
Subject: [PATCH 079/132] change tolerance to allow bf16

---
 problems/bioml/trimul/reference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index 8b2d5d41..d61df5e9 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -88,6 +88,7 @@ def ref_kernel(data: input_t) -> output_t:
         trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
 
         # Fill in the given weights of the model
+        trimul.norm.weight = nn.Parameter(weights['norm.weight'])
         trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
         trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
         trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
@@ -172,4 +173,4 @@ def generate_input(
     return (input_tensor, mask, weights, config)
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=2e-2, atol=2e-2)
\ No newline at end of file

From 44100a877e96a2530cb6c5d4bc0314eb1b06b8ea Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 18 Jun 2025 13:48:27 -0400
Subject: [PATCH 080/132] update eval to do cuda events, add more cases with
 heavy tailed distribution input

---
 problems/bioml/trimul/eval.py       | 384 ++++++++++++++++++++++++++++
 problems/bioml/trimul/reference.py  |  60 ++---
 problems/bioml/trimul/submission.py |  53 ++--
 problems/bioml/trimul/task.yml      |  55 ++--
 4 files changed, 473 insertions(+), 79 deletions(-)
 create mode 100644 problems/bioml/trimul/eval.py

diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
new file mode 100644
index 00000000..be957134
--- /dev/null
+++ b/problems/bioml/trimul/eval.py
@@ -0,0 +1,384 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:  # Context manager that writes `key: value` records to a given file descriptor.
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')  # wrap the raw descriptor in a text-mode file object
+        os.set_inheritable(fd, False)  # mark the descriptor as non-inheritable for child processes
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()  # always close on leaving the `with` block, even after an exception
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)  # flushed on every call
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")  # one `key: value` record per call
+
+
+@dataclasses.dataclass
+class TestCase:  # One test/benchmark case parsed from a line of the test file.
+    args: dict  # parsed `key: value` pairs; values that parse as int are stored as int, others as str
+    spec: str  # the original, unparsed line
+
+
+def _combine(a: int, b: int) -> int:
+    # Merges two integers into a single one (Cantor-style pairing).
+    # Used to derive the effective seed from the public per-test seed and the
+    # global secret seed: test-level seeds are public and usually small, so they
+    # must not reveal anything useful about the full seed. With this construction,
+    # a large secret seed always yields a large combined seed.
+    total = a + b
+    triangular = total * (total + 1) // 2
+    return int(a + triangular)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    """Parse `key: value; key: value` lines from `file_name`; exits with 113 if unreadable or malformed."""
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)
+
+    tests = []
+    # Compiled once; keys are letters only, values are a word or an optionally signed integer.
+    pattern = re.compile(r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*")
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            # A single fullmatch both validates the part and supplies the captured groups.
+            matched = pattern.fullmatch(part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass  # non-numeric values are kept as strings
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:  # Timing summary as produced by calculate_stats(); all durations share the input's unit.
+    runs: int  # number of timed runs
+    mean: float  # arithmetic mean of the durations
+    std: float  # sample standard deviation (n - 1 denominator)
+    err: float  # standard error of the mean: std / sqrt(runs)
+    best: float  # shortest duration
+    worst: float  # longest duration
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a non-empty list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object with runs, mean, sample std, standard error, best and worst; std and err are 0 for one run.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg)**2, durations))
+    # Bessel's correction needs at least two samples; a single run has no measurable spread.
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst))
+
+
+def _clone_data(data):
+    """
+    Copies a nested structure of tuples, lists and dicts, cloning every tensor found in it.
+    """
+    if isinstance(data, torch.Tensor):
+        return data.clone()
+    if isinstance(data, dict):
+        return {key: _clone_data(value) for key, value in data.items()}
+    if isinstance(data, list):
+        return [_clone_data(item) for item in data]
+    if isinstance(data, tuple):
+        return tuple(_clone_data(item) for item in data)
+    # Any other value is handed back as is, without copying.
+    return data
+
+
+def wrap_check_implementation(data, submission_output):
+    # Compatibility shim around `check_implementation`: newer problem definitions
+    # return (bool, str), older ones return only a string. Callers of this
+    # function always receive the (bool, str) form.
+    outcome = check_implementation(data, submission_output)
+    if not isinstance(outcome, tuple):
+        # legacy form: a falsy result counts as success, and the result doubles as the message
+        return not bool(outcome), outcome
+    return outcome
+
+
+def _run_single_test(test: TestCase):
+    """
+    Generates the inputs of one test case, runs the submission on them and checks the output. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()  # wait for input generation to finish before running the kernel
+    submission_output = custom_kernel(_clone_data(data))  # kernel gets a clone; `data` itself goes to the checker
+    torch.cuda.synchronize()  # wait for the kernel to finish before checking its output
+    return wrap_check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs `_run_single_test` for `test` via `pool.apply` (blocking) and returns its result.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Runs every test case in the worker pool and logs the per-test and overall verdict.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param pool: Pool in which each test case is executed.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    any_failed = False
+    logger.log("test-count", len(tests))
+
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if good:
+            logger.log(f"test.{idx}.status", "pass")
+            # a passing check may still carry an informational message
+            if message:
+                logger.log(f"test.{idx}.message", message)
+        else:
+            any_failed = True
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+
+    # overall verdict: a single failing test fails the whole run
+    logger.log("check", "fail" if any_failed else "pass")
+    return 112 if any_failed else 0
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark and returns its Stats, or the checker's message if an output is wrong. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do timing runs; unless `recheck` is set there is no further correctness testing.
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the measured time exceeds `max_time_ns`, 2 minutes of
+    # wallclock time have passed, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = wrap_check_implementation(check_copy, output)  # tolerate legacy string-only checkers
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
+    """
+    Runs `_run_single_benchmark` for one test case via `pool.apply` (blocking) and returns its result.
+
+    @param pool: Pool on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: If set, every timed run regenerates its inputs (shifting the seed, if any) and checks the output.
+    @param max_repeats: Upper bound on the number of timed runs.
+    @param max_time_ns: Budget in nanoseconds for the accumulated measured time.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Benchmarks every test case (without per-run rechecking) and logs all Stats fields for each of them.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Pool on which the benchmarks will be launched.
+    @param tests: A non-empty list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up: run the first case with a 1e8 ns (0.1 s) time budget and discard the result
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)  # at most 100 runs / 10 s measured time
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False  # anything other than Stats is the checker's error message
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles the submission on one test case and returns the profiler table (top 20 rows by self CUDA time).
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()  # finish input generation before the profiled region starts
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))  # output is not checked here
+        torch.cuda.synchronize()  # synchronize inside the profiled region, after the kernel call
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):  # Profiles each case in this process; returns 0.
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))  # base64 with '*' in place of '/'
+    logger.log("check", "pass")  # logged unconditionally: outputs are not checked in this mode
+    return 0
+
+
+def main():  # Entry point; expects `<mode> <test-file>` in sys.argv, return value is passed to sys.exit().
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111  # POPCORN_FD is unset or empty
+
+    if len(sys.argv) < 3:
+        return 2  # need both a mode and a test file
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")  # NOTE(review): presumably hides the secret seed from spawned workers - confirm
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)  # falls back to 42 when no seed is given (or the seed is 0)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(1) as pool:  # a single spawned worker process runs all cases
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup on the first case, without rechecking; the result is discarded
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)  # recheck on every timed run
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                        break  # stop at the first failing benchmark
+
+                logger.log("check", "pass" if passed else "fail")  # no explicit return: main() returns None here
+            elif mode == "profile":
+                run_profiling(logger, tests)  # return value is not propagated: main() returns None here
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index d61df5e9..8bef41b3 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -109,7 +109,8 @@ def generate_input(
     dim: int,
     hiddendim: int,
     seed: int,
-    nomask: bool
+    nomask: bool,
+    distribution: str,
 ) -> input_t:
 
     # Really dumb but for now _ isn't parsing correctly.
@@ -128,47 +129,34 @@ def generate_input(
 
     weights = {}
 
-    input_tensor = torch.randn(
-        (batch_size, seq_len, seq_len, dim),
-        device='cuda',
-        dtype=torch.float32,
-        generator=gen
-    ).contiguous()
+    # Generate input tensor based on distribution
+    if distribution == "cauchy":
+        # Heavier tail distribution
+        input_tensor = torch.distributions.Cauchy(0, 2).sample(
+            (batch_size, seq_len, seq_len, dim)
+        ).to(device='cuda', dtype=torch.float32)
+    else:  # normal distribution
+        input_tensor = torch.randn(
+            (batch_size, seq_len, seq_len, dim),
+            device='cuda',
+            dtype=torch.float32,
+            generator=gen
+        ).contiguous()
 
     if no_mask:
         mask = torch.ones(batch_size, seq_len, seq_len, device=input_tensor.device)
     else:
         mask = torch.randint(0, 2, (batch_size, seq_len, seq_len), device=input_tensor.device, generator=gen)
 
-    # Initialize model weights
-    weights["norm.weight"] = torch.randn(dim, 
-        device="cuda", 
-        dtype=torch.float32)
-
-    weights["left_proj.weight"] = torch.randn(hidden_dim, dim,
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["right_proj.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["left_gate.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["right_gate.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["out_gate.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["to_out_norm.weight"] = torch.randn(hidden_dim, 
-        device="cuda", dtype=torch.float32)
-
-    weights["to_out.weight"] = torch.randn(dim, hidden_dim, 
-        device="cuda", dtype=torch.float32) / math.sqrt(dim)
+    # Initialize model weights (always standard normal; `distribution` only affects the input tensor)
+    weights["norm.weight"] = torch.randn(dim, device="cuda", dtype=torch.float32)
+    weights["left_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["right_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["left_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["right_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["out_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["to_out_norm.weight"] = torch.randn(hidden_dim, device="cuda", dtype=torch.float32)
+    weights["to_out.weight"] = torch.randn(dim, hidden_dim, device="cuda", dtype=torch.float32) / math.sqrt(dim)
 
     return (input_tensor, mask, weights, config)
 
diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index f6188a8f..eba46855 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -1,3 +1,5 @@
+#!POPCORN leaderboard trimul-dev
+
 import torch
 from torch import nn, einsum
 from task import input_t, output_t
@@ -12,15 +14,15 @@ def __init__(
 
         self.norm = nn.LayerNorm(dim, bias=False)
 
-        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
-        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+        self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
 
-        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.left_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+        self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+        self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
 
-        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
-        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
+        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False, dtype=torch.float32)
+        self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         """
@@ -33,22 +35,23 @@ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         batch_size, seq_len, _, dim = x.shape
 
         x = self.norm(x)
+        x = x.to(torch.float32)
 
-        left = self.left_proj(x)
-        right = self.right_proj(x)
+        left = self.left_proj(x.to(torch.float32))
+        right = self.right_proj(x.to(torch.float32))
 
         mask = mask.unsqueeze(-1)
         left = left * mask
         right = right * mask
 
-        left_gate = self.left_gate(x).sigmoid()
-        right_gate = self.right_gate(x).sigmoid()
-        out_gate = self.out_gate(x).sigmoid()
+        left_gate = self.left_gate(x.to(torch.float32)).sigmoid()
+        right_gate = self.right_gate(x.to(torch.float32)).sigmoid()
+        out_gate = self.out_gate(x.to(torch.float32)).sigmoid()
 
         left = left * left_gate
         right = right * right_gate
 
-        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
+        out = einsum('... i k d, ... j k d -> ... i j d', left.to(torch.bfloat16), right.to(torch.bfloat16))
         # This einsum is the same as the following:
         # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)
         
@@ -60,6 +63,7 @@ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         #             for k in range(seq_len):
         #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
 
+        out = out.to(torch.float32)
         out = self.to_out_norm(out)
         out = out * out_gate
         return self.to_out(out)
@@ -80,14 +84,15 @@ def custom_kernel(data: input_t) -> output_t:
     trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
 
     # Fill in the given weights of the model
-    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
-    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
-    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
-    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
-    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
-    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
-    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
-
-    output = trimul(input_tensor, mask)
-
-    return output
+    trimul.norm.weight = nn.Parameter(weights['norm.weight'].to(torch.float32))
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'].to(torch.float32))
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'].to(torch.float32))
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'].to(torch.float32))
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'].to(torch.float32))
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+
+    output = trimul(input_tensor, mask).to(torch.float32)
+
+    return output
\ No newline at end of file
diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index ced8b54a..e214a515 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -5,7 +5,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "eval.py"}
 
 lang: "py"
 
@@ -43,23 +43,40 @@ ranked_timeout: 540
 ranking_by: "geom"
 
 tests:
-  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
-  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1092, "nomask": False}
-  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True} 
-  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 210284, "nomask": False} 
-  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 81934, "nomask": True}
-  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1832, "nomask": False}
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1932, "nomask": True}
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 10432, "nomask": False}
-  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 731, "nomask": True}
-  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 82102, "nomask": False}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1092, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "normal"} 
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 210284, "nomask": False, "distribution": "normal"} 
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 81934, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1932, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 10432, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 731, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 53121, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 31, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 4921, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 937321, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "cauchy"} 
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 8134, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 932, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 31, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 5321, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 491, "nomask": False, "distribution": "cauchy"}
 
 benchmarks:
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
-  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 121, "nomask": True}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False}
-  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 8331, "nomask": True}
-  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 9120, "nomask": False}
\ No newline at end of file
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6821, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 937321, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 129, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 41, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6211, "nomask": False, "distribution": "cauchy"}
\ No newline at end of file

From cf7371e637cdd959f9552f18b141f4270cab88ea Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 18 Jun 2025 14:56:13 -0400
Subject: [PATCH 081/132] add LN biases

---
 problems/bioml/trimul/reference.py  | 8 ++++++--
 problems/bioml/trimul/submission.py | 6 ++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index 8bef41b3..86c1ee1c 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -15,7 +15,7 @@ def __init__(
     ):
         super().__init__()
 
-        self.norm = nn.LayerNorm(dim, bias=False)
+        self.norm = nn.LayerNorm(dim)
 
         self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
         self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
@@ -24,7 +24,7 @@ def __init__(
         self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
         self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
 
-        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
+        self.to_out_norm = nn.LayerNorm(hidden_dim)
         self.to_out = nn.Linear(hidden_dim, dim, bias=False)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
@@ -89,12 +89,14 @@ def ref_kernel(data: input_t) -> output_t:
 
         # Fill in the given weights of the model
         trimul.norm.weight = nn.Parameter(weights['norm.weight'])
+        trimul.norm.bias = nn.Parameter(weights['norm.bias'])
         trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
         trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
         trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
         trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
         trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
         trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+        trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'])
         trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
 
         output = trimul(input_tensor, mask)
@@ -150,6 +152,7 @@ def generate_input(
 
     # Initialize model weights based on distribution
     weights["norm.weight"] = torch.randn(dim, device="cuda", dtype=torch.float32)
+    weights["norm.bias"] = torch.randn(dim, device="cuda", dtype=torch.float32)
     weights["left_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
     weights["right_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
     weights["left_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
@@ -157,6 +160,7 @@ def generate_input(
     weights["out_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
     weights["to_out_norm.weight"] = torch.randn(hidden_dim, device="cuda", dtype=torch.float32)
     weights["to_out.weight"] = torch.randn(dim, hidden_dim, device="cuda", dtype=torch.float32) / math.sqrt(dim)
+    weights["to_out_norm.bias"] = torch.randn(hidden_dim, device="cuda", dtype=torch.float32)
 
     return (input_tensor, mask, weights, config)
 
diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index eba46855..fa5033bf 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -12,7 +12,7 @@ def __init__(
     ):
         super().__init__()
 
-        self.norm = nn.LayerNorm(dim, bias=False)
+        self.norm = nn.LayerNorm(dim)
 
         self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
         self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
@@ -21,7 +21,7 @@ def __init__(
         self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
         self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
 
-        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False, dtype=torch.float32)
+        self.to_out_norm = nn.LayerNorm(hidden_dim)
         self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
@@ -92,6 +92,8 @@ def custom_kernel(data: input_t) -> output_t:
     trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
     trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
     trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+    trimul.norm.bias = nn.Parameter(weights['norm.bias'].to(torch.float32))
+    trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'].to(torch.float32))
 
     output = trimul(input_tensor, mask).to(torch.float32)
 

From 7c15075a39286e88939d99d3f3a60be88b8e6223 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Tue, 24 Jun 2025 13:04:04 +0200
Subject: [PATCH 082/132] clear L2 cache

---
 problems/bioml/trimul/eval.py  |  3 ++-
 problems/bioml/trimul/utils.py | 12 +++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
index be957134..719699b4 100644
--- a/problems/bioml/trimul/eval.py
+++ b/problems/bioml/trimul/eval.py
@@ -11,7 +11,7 @@
 
 import torch.cuda
 
-from utils import set_seed
+from utils import set_seed, clear_l2_cache
 try:
     from task import TestSpec
 except ImportError:
@@ -232,6 +232,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
+        clear_l2_cache()
         output = custom_kernel(data)
         end_event.record()
         torch.cuda.synchronize()
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
index ecfa6806..38759b91 100644
--- a/problems/bioml/trimul/utils.py
+++ b/problems/bioml/trimul/utils.py
@@ -2,6 +2,7 @@
 from typing import Tuple
 
 import numpy as np
+import cupy as cp
 import torch
 
 
@@ -156,4 +157,13 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
\ No newline at end of file
+        torch.backends.cudnn.deterministic = self.deterministic
+
+
+def clear_l2_cache():
+    cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write stuff to
+    dummy.fill_(42)
+    del dummy

From be9114e90161ed11a2926f4a3bb5465b50533049 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 15:24:15 -0400
Subject: [PATCH 083/132] remove l2 cache clearing for now, change sizes

---
 problems/bioml/trimul/eval.py  | 3 +--
 problems/bioml/trimul/task.yml | 9 +--------
 problems/bioml/trimul/utils.py | 5 ++---
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
index 719699b4..be957134 100644
--- a/problems/bioml/trimul/eval.py
+++ b/problems/bioml/trimul/eval.py
@@ -11,7 +11,7 @@
 
 import torch.cuda
 
-from utils import set_seed, clear_l2_cache
+from utils import set_seed
 try:
     from task import TestSpec
 except ImportError:
@@ -232,7 +232,6 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        clear_l2_cache()
         output = custom_kernel(data)
         end_event.record()
         torch.cuda.synchronize()
diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index e214a515..90fc5906 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -70,13 +70,6 @@ benchmarks:
   - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
   - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
   - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6821, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 937321, "nomask": True, "distribution": "cauchy"}
   - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
   - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 129, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 41, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6211, "nomask": False, "distribution": "cauchy"}
\ No newline at end of file
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
index 38759b91..1a9ae5b3 100644
--- a/problems/bioml/trimul/utils.py
+++ b/problems/bioml/trimul/utils.py
@@ -2,7 +2,6 @@
 from typing import Tuple
 
 import numpy as np
-import cupy as cp
 import torch
 
 
@@ -159,9 +158,9 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
         torch.backends.cudnn.deterministic = self.deterministic
 
-
 def clear_l2_cache():
-    cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
     # create a large dummy tensor
     dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
     # write stuff to

From 0ab0585e19cac0a2dd43828307080b61d2fdcaa9 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 15:37:20 -0400
Subject: [PATCH 084/132] add more sizes

---
 problems/bioml/trimul/task.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index 90fc5906..f745e24e 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -63,13 +63,15 @@ tests:
   - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 491, "nomask": False, "distribution": "cauchy"}
 
 benchmarks:
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 2, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
   - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
   - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
   - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 587231, "nomask": False, "distribution": "normal"}
   - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file
+  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file

From a72b2a40b5071fb0cf553a3ca738467e38d6336a Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 15:44:36 -0400
Subject: [PATCH 085/132] updated sizes

---
 problems/bioml/trimul/task.yml | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index f745e24e..66f32757 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -64,14 +64,9 @@ tests:
 
 benchmarks:
   - {"seqlen": 256, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 2, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 587231, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
   - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file
+  - {"seqlen": 256, "bs": 2, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
\ No newline at end of file

From cac8795f5626a30a11cd2d19422c79c0fc70b519 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 16:00:23 -0400
Subject: [PATCH 086/132] remove tag

---
 problems/bioml/trimul/submission.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index fa5033bf..99e91458 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -1,5 +1,3 @@
-#!POPCORN leaderboard trimul-dev
-
 import torch
 from torch import nn, einsum
 from task import input_t, output_t
@@ -97,4 +95,4 @@ def custom_kernel(data: input_t) -> output_t:
 
     output = trimul(input_tensor, mask).to(torch.float32)
 
-    return output
\ No newline at end of file
+    return output

From 460764ce00f9210ba5780e35221f9fe020a55cd6 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Wed, 2 Jul 2025 12:32:04 -0400
Subject: [PATCH 087/132] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index afd98c45..89413b2c 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ You can see what's going on [gpumode.com](https://www.gpumode.com/)
 ## Competition
 1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
 2. [AMD $100K kernel competition](problems/amd)
+3. [BioML kernels](problems/bioml)
 
 ## Making a Leaderboard Submission
 

From 3a4d4dc40205c393d64b35a257a56bf3e8aced30 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 17 Jul 2025 11:05:20 -0700
Subject: [PATCH 088/132] PMPP v2

---
 problems/pmpp_v2.yaml                         |  63 +++
 problems/pmpp_v2/conv2d_py/reference.py       |  56 +++
 .../conv2d_py/solutions/correct/ref.py        |  13 +
 .../conv2d_py/solutions/wrong/empty.py        |  11 +
 problems/pmpp_v2/conv2d_py/submission.py      |  21 +
 problems/pmpp_v2/conv2d_py/task.py            |  13 +
 problems/pmpp_v2/conv2d_py/task.yml           |  47 +++
 problems/pmpp_v2/eval.py                      | 381 ++++++++++++++++++
 problems/pmpp_v2/grayscale_py/reference.py    |  37 ++
 .../grayscale_py/solutions/correct/ref.py     |   9 +
 .../grayscale_py/solutions/wrong/empty.py     |   7 +
 problems/pmpp_v2/grayscale_py/submission.py   |   8 +
 problems/pmpp_v2/grayscale_py/task.py         |   9 +
 problems/pmpp_v2/grayscale_py/task.yml        |  41 ++
 problems/pmpp_v2/histogram_py/reference.py    |  51 +++
 .../histogram_py/solutions/correct/ref.py     |   6 +
 .../histogram_py/solutions/wrong/empty.py     |   7 +
 problems/pmpp_v2/histogram_py/submission.py   |  12 +
 problems/pmpp_v2/histogram_py/task.py         |  11 +
 problems/pmpp_v2/histogram_py/task.yml        |  43 ++
 problems/pmpp_v2/matmul_py/reference.py       |  21 +
 .../matmul_py/solutions/correct/ref.py        |   8 +
 .../solutions/wrong/low-precision.py          |   7 +
 problems/pmpp_v2/matmul_py/submission.py      |   5 +
 problems/pmpp_v2/matmul_py/task.py            |  11 +
 problems/pmpp_v2/matmul_py/task.yml           |  44 ++
 problems/pmpp_v2/prefixsum_py/reference.py    |  38 ++
 .../prefixsum_py/solutions/correct/ref.py     |   6 +
 .../prefixsum_py/solutions/wrong/empty.py     |   7 +
 problems/pmpp_v2/prefixsum_py/submission.py   |  12 +
 problems/pmpp_v2/prefixsum_py/task.py         |   9 +
 problems/pmpp_v2/prefixsum_py/task.yml        |  57 +++
 problems/pmpp_v2/sort_py/reference.py         |  47 +++
 .../pmpp_v2/sort_py/solutions/correct/ref.py  |   9 +
 .../pmpp_v2/sort_py/solutions/wrong/empty.py  |   7 +
 problems/pmpp_v2/sort_py/submission.py        |  14 +
 problems/pmpp_v2/sort_py/task.py              |   9 +
 problems/pmpp_v2/sort_py/task.yml             |  41 ++
 problems/pmpp_v2/template.py                  |   5 +
 problems/pmpp_v2/utils.py                     | 144 +++++++
 problems/pmpp_v2/vectoradd_py/reference.py    |  31 ++
 .../correct/submission_cuda_inline.py         |  83 ++++
 .../solutions/correct/submission_triton.py    |  40 ++
 problems/pmpp_v2/vectoradd_py/task.py         |  11 +
 problems/pmpp_v2/vectoradd_py/task.yml        |  41 ++
 problems/pmpp_v2/vectorsum_py/reference.py    |  48 +++
 .../vectorsum_py/solutions/correct/pytorch.py |  12 +
 .../vectorsum_py/solutions/wrong/cheat.py     |  17 +
 problems/pmpp_v2/vectorsum_py/submission.py   |  62 +++
 problems/pmpp_v2/vectorsum_py/task.py         |   9 +
 problems/pmpp_v2/vectorsum_py/task.yml        |  41 ++
 51 files changed, 1742 insertions(+)
 create mode 100644 problems/pmpp_v2.yaml
 create mode 100644 problems/pmpp_v2/conv2d_py/reference.py
 create mode 100644 problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/conv2d_py/submission.py
 create mode 100644 problems/pmpp_v2/conv2d_py/task.py
 create mode 100644 problems/pmpp_v2/conv2d_py/task.yml
 create mode 100644 problems/pmpp_v2/eval.py
 create mode 100644 problems/pmpp_v2/grayscale_py/reference.py
 create mode 100644 problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/grayscale_py/submission.py
 create mode 100644 problems/pmpp_v2/grayscale_py/task.py
 create mode 100644 problems/pmpp_v2/grayscale_py/task.yml
 create mode 100644 problems/pmpp_v2/histogram_py/reference.py
 create mode 100644 problems/pmpp_v2/histogram_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/histogram_py/submission.py
 create mode 100644 problems/pmpp_v2/histogram_py/task.py
 create mode 100644 problems/pmpp_v2/histogram_py/task.yml
 create mode 100644 problems/pmpp_v2/matmul_py/reference.py
 create mode 100644 problems/pmpp_v2/matmul_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
 create mode 100644 problems/pmpp_v2/matmul_py/submission.py
 create mode 100644 problems/pmpp_v2/matmul_py/task.py
 create mode 100644 problems/pmpp_v2/matmul_py/task.yml
 create mode 100644 problems/pmpp_v2/prefixsum_py/reference.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/submission.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/task.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/task.yml
 create mode 100644 problems/pmpp_v2/sort_py/reference.py
 create mode 100644 problems/pmpp_v2/sort_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/sort_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/sort_py/submission.py
 create mode 100644 problems/pmpp_v2/sort_py/task.py
 create mode 100644 problems/pmpp_v2/sort_py/task.yml
 create mode 100644 problems/pmpp_v2/template.py
 create mode 100644 problems/pmpp_v2/utils.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/reference.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/task.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/task.yml
 create mode 100644 problems/pmpp_v2/vectorsum_py/reference.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/submission.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/task.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/task.yml

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
new file mode 100644
index 00000000..a3c12ab4
--- /dev/null
+++ b/problems/pmpp_v2.yaml
@@ -0,0 +1,63 @@
+name: pmpp_v2 Practice Problems
+# when does this end (individual problems might close earlier)
+deadline: ""
+# A description for this particular competition
+description: ""
+# the list of problems
+problems:
+  - directory: pmpp_v2/conv2d_py
+    name: conv2d
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/grayscale_py
+    name: grayscale
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/histogram_py
+    name: histogram
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/matmul_py
+    name: matmul
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/prefixsum_py
+    name: prefixsum
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/sort_py
+    name: sort
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/vectoradd_py
+    name: vectoradd
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/vectorsum_py
+    name: vectorsum
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
\ No newline at end of file
diff --git a/problems/pmpp_v2/conv2d_py/reference.py b/problems/pmpp_v2/conv2d_py/reference.py
new file mode 100644
index 00000000..9e5e1a74
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/reference.py
@@ -0,0 +1,56 @@
+from utils import make_match_reference
+import torch
+import torch.nn.functional as F
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of 2D convolution using PyTorch.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+
+        # No padding and no striding
+        # TODO: Can revisit this in future problems
+        stride=1,
+        padding=0
+    )
+
+
+def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
+    """
+    Generates random input and kernel tensors.
+    Returns:
+        Tuple of (input tensor, kernel tensor)
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate input tensor: [batch, in_channels, height, width]
+    input_tensor = torch.randn(
+        batch, channels, size, size,
+        device='cuda', 
+        dtype=torch.float32, 
+        generator=gen
+    ).contiguous()
+    
+    # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
+    # Here we use same number of output channels as input channels for simplicity
+    kernel = torch.randn(
+        channels, channels, kernelsize, kernelsize,
+        device='cuda',
+        dtype=torch.float32,
+        generator=gen
+    ).contiguous()
+    
+    return (input_tensor, kernel)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
new file mode 100644
index 00000000..c0ce3f21
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
@@ -0,0 +1,13 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor,
+        kernel,
+        stride=1,
+        padding=0
+    )
diff --git a/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..899beb0c
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
@@ -0,0 +1,11 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return torch.empty((input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2]-kernel.shape[3]+1, input_tensor.shape[3]-kernel.shape[3]+1),
+                       device=kernel.device, dtype=kernel.dtype
+    )
diff --git a/problems/pmpp_v2/conv2d_py/submission.py b/problems/pmpp_v2/conv2d_py/submission.py
new file mode 100644
index 00000000..a1b7d16d
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/submission.py
@@ -0,0 +1,21 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Implementation of 2D convolution using PyTorch with no padding and no striding.
+    Args:
+        data: Tuple of (input tensor, kernel tensor); this is the only
+            argument -- stride (1) and padding (0) are fixed in the call below
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+        stride=1,
+        padding=0
+    )
\ No newline at end of file
diff --git a/problems/pmpp_v2/conv2d_py/task.py b/problems/pmpp_v2/conv2d_py/task.py
new file mode 100644
index 00000000..397332ab
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/task.py
@@ -0,0 +1,13 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    kernelsize: int
+    channels: int
+    batch: int
+    seed: int   
\ No newline at end of file
diff --git a/problems/pmpp_v2/conv2d_py/task.yml b/problems/pmpp_v2/conv2d_py/task.yml
new file mode 100644
index 00000000..55adc532
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/task.yml
@@ -0,0 +1,47 @@
+# name: conv2d-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a 2D convolution kernel that matches the reference implementation.
+  The kernel should perform 2D convolution with the given specifications.
+  We will benchmark different sizes, kernel sizes, channels and batch sizes, but they will all
+  be even numbers, with the exception of the batch size, which can sometimes be 1.
+  We assume no padding and a stride of 1, and instead vary the size of the input and kernel,
+  number of channels, and batch size.
+
+  Input: Tuple of (input_tensor, kernel)
+    - input_tensor: 4D tensor of shape (batch, channels, height, width) with arbitrary values
+    - kernel: 4D tensor of shape (channels, channels, kernelsize, kernelsize) with arbitrary values
+  Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernelsize": 4, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernelsize": 8, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 1, "seed": 9173}
+
+benchmarks:
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernelsize": 16, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 256, "kernelsize": 32, "channels": 128, "batch": 1, "seed": 6252}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
new file mode 100644
index 00000000..ac3a6325
--- /dev/null
+++ b/problems/pmpp_v2/eval.py
@@ -0,0 +1,381 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+        os.set_inheritable(fd, False)
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a+b)*(a+b+1)//2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg)**2, durations))
+    std = math.sqrt(variance / (runs - 1))
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def wrap_check_implementation(data, submission_output):
+    # Old version returned just a single string, new version
+    # returns (bool, str); this function ensures compatibility with old
+    # problem definitions.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data))
+    torch.cuda.synchronize()
+    return wrap_check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark; returns Stats, or the checker's error message on a mismatch. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs (correctness is re-checked only if `recheck` is set);
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the summed kernel time exceeds `max_time_ns`,
+    # or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel(data)
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            good, message = wrap_check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(end - start)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles the submission on a single test case and returns the profiler's summary table as text.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/pmpp_v2/grayscale_py/reference.py b/problems/pmpp_v2/grayscale_py/reference.py
new file mode 100644
index 00000000..1ed6d148
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/reference.py
@@ -0,0 +1,37 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of RGB to grayscale conversion using PyTorch.
+    Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
+    
+    Args:
+        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
+    Returns:
+        Grayscale tensor of shape (H, W) with values in [0, 1]
+    """
+    # Standard RGB to Grayscale coefficients
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random RGB image tensor of specified size.
+    Returns:
+        Tensor of shape (size, size, 3) with values in [0, 1]
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.rand(size, size, 3, 
+                     device='cuda', 
+                     dtype=torch.float32, 
+                     generator=gen).contiguous()
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
new file mode 100644
index 00000000..6a40c3e2
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..e37e32ba
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/grayscale_py/submission.py b/problems/pmpp_v2/grayscale_py/submission.py
new file mode 100644
index 00000000..de0c1494
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/submission.py
@@ -0,0 +1,8 @@
+from task import input_t, output_t
+import torch
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp_v2/grayscale_py/task.py b/problems/pmpp_v2/grayscale_py/task.py
new file mode 100644
index 00000000..4a717fcc
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
+output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+
+class TestSpec(TypedDict):
+    size: int  # Size of the square image (H=W)
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/grayscale_py/task.yml b/problems/pmpp_v2/grayscale_py/task.yml
new file mode 100644
index 00000000..cada0257
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/task.yml
@@ -0,0 +1,41 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an RGB to grayscale conversion kernel that matches the reference implementation.
+  The kernel should convert square RGB images with even sizes to grayscale using the standard coefficients:
+  Y = 0.2989 R + 0.5870 G + 0.1140 B
+  
+  Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
+  Output: Grayscale tensor of shape (H, W) with values in [0, 1]
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+
+  - {"size": 128, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  
+  - {"size": 512, "seed": 54352}
+  - {"size": 1024, "seed": 93246}
+  - {"size": 2048, "seed": 6256}
+  - {"size": 4096, "seed": 8841}
+  - {"size": 8192, "seed": 6252}
+  - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
new file mode 100644
index 00000000..18e8b249
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -0,0 +1,51 @@
+from utils import verbose_allequal
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: tensor of shape (size,)
+    Returns:
+        Tensor containing bin counts
+    """
+    # Count values in each bin
+    return torch.bincount(data, minlength=256)
+
+
+def generate_input(size: int, contention: float, seed: int) -> input_t:
+    """
+    Generates random input tensor for histogram.
+
+    Args:
+        size: Size of the input tensor (must be multiple of 16)
+        contention: float in [0, 100], specifying the percentage of identical values
+        seed: Random seed
+    Returns:
+        The input tensor with values in [0, 255]
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate integer values between 0 and 256
+    data = torch.randint(0, 256, (size,), device='cuda', dtype=torch.uint8, generator=gen)
+
+    # make one value appear quite often, increasing the chance for atomic contention
+    evil_value = torch.randint(0, 256, (), device='cuda', dtype=torch.uint8, generator=gen)
+    evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
+    data[evil_loc] = evil_value
+
+    return data.contiguous()
+
+
+def check_implementation(data, output):
+    expected = ref_kernel(data)
+    reasons = verbose_allequal(output, expected)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
diff --git a/problems/pmpp_v2/histogram_py/solutions/correct/ref.py b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
new file mode 100644
index 00000000..7de5cccb
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..e35e3dc1
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/histogram_py/submission.py b/problems/pmpp_v2/histogram_py/submission.py
new file mode 100644
index 00000000..1e62e9a3
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/submission.py
@@ -0,0 +1,12 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: tensor of shape (size,)
+    Returns:
+        Tensor containing bin counts
+    """
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp_v2/histogram_py/task.py b/problems/pmpp_v2/histogram_py/task.py
new file mode 100644
index 00000000..80727868
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/task.py
@@ -0,0 +1,11 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
+    contention: int
+
diff --git a/problems/pmpp_v2/histogram_py/task.yml b/problems/pmpp_v2/histogram_py/task.yml
new file mode 100644
index 00000000..489a98b6
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/task.yml
@@ -0,0 +1,43 @@
+# name: histogram-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a histogram kernel that counts the number of elements falling into each bin across the specified range.
+  The input values are integers in the range [0, 255], and each distinct value is its own bin.
+  All sizes are multiples of 16 and the number of bins is fixed to 256.
+
+  Input:
+    - data: a tensor of shape (size,)
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 5120,   "seed": 9991, "contention": 10}
+  - {"size": 7840,   "seed": 2105, "contention": 10}
+  - {"size": 30080,  "seed": 9999, "contention": 10}
+  - {"size": 30080,  "seed": 4254, "contention": 90}
+  - {"size": 100000, "seed": 1212, "contention": 10}
+
+benchmarks:
+  - {"size": 1310720, "seed": 6252, "contention": 10}
+  - {"size": 2621440, "seed": 8841, "contention": 10}
+  - {"size": 2621440, "seed": 3411, "contention": 40}
+  - {"size": 2621440, "seed": 8753, "contention": 90}
+  - {"size": 5242880, "seed": 6252, "contention": 10}
+  - {"size": 10485760, "seed": 8841, "contention": 10}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
new file mode 100644
index 00000000..19ba991f
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -0,0 +1,21 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+
+def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    a = torch.empty(m, k, device='cuda', dtype=torch.float16)
+    a.uniform_(0, 1, generator=gen)
+    b = torch.empty(k, n, device='cuda', dtype=torch.float16)
+    b.uniform_(0, 1, generator=gen)
+    return (a, b)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/matmul_py/solutions/correct/ref.py b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
new file mode 100644
index 00000000..fe89ed55
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
@@ -0,0 +1,8 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
diff --git a/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
new file mode 100644
index 00000000..01335a18
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
@@ -0,0 +1,7 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
diff --git a/problems/pmpp_v2/matmul_py/submission.py b/problems/pmpp_v2/matmul_py/submission.py
new file mode 100644
index 00000000..97d17433
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/submission.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
diff --git a/problems/pmpp_v2/matmul_py/task.py b/problems/pmpp_v2/matmul_py/task.py
new file mode 100644
index 00000000..1c72c782
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    seed: int
diff --git a/problems/pmpp_v2/matmul_py/task.yml b/problems/pmpp_v2/matmul_py/task.yml
new file mode 100644
index 00000000..6924764b
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/task.yml
@@ -0,0 +1,44 @@
+# name: matmul-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a custom matmul function that matches the reference implementation.
+  The function should take a tuple of input tensors (a, b) and return their matrix product a @ b.
+  The shapes of all outer and inner dimensions of the tensors are multiples of 16.
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"m": 64, "n": 64, "k": 64, "seed": 53124}
+  - {"m": 128, "n": 128, "k": 128, "seed": 3321}
+  - {"m": 256, "n": 256, "k": 256, "seed": 1200}
+
+  - {"m": 32, "n": 512, "k": 32, "seed": 32523}
+  - {"m": 64, "n": 1024, "k": 64, "seed": 4327}
+
+benchmarks:
+  - {"m": 128, "n": 128, "k": 128, "seed": 43214}
+  - {"m": 256, "n": 256, "k": 256, "seed": 423011}
+  - {"m": 512, "n": 512, "k": 512, "seed": 123456}
+  - {"m": 1024, "n": 1024, "k": 1024, "seed": 1029}
+  - {"m": 2048, "n": 2048, "k": 2048, "seed": 75342}
+
+  - {"m": 1024, "n": 1536, "k": 1024, "seed": 321}
+  - {"m": 2048, "n": 3072, "k": 2048, "seed": 32412}
+  - {"m": 4096, "n": 5120, "k": 4096, "seed": 123456}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
new file mode 100644
index 00000000..6d84092e
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -0,0 +1,38 @@
+from utils import match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor.
+    Returns:
+        Tensor to compute prefix sum on
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+
+# This algorithm is very sensitive to the tolerance and the error is magnified by the input size
+# The tolerance is scaled by the square root of the input size
+def check_implementation(data: input_t, output: output_t) -> str:
+    # Number of input elements; the tolerances below are scaled by sqrt(n)
+    n = data.numel()
+    
+    scale_factor = n ** 0.5  # Square root of input size
+    rtol = 1e-5 * scale_factor
+    atol = 1e-5 * scale_factor
+
+    return match_reference(data, output, reference=ref_kernel, rtol=rtol, atol=atol)
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
new file mode 100644
index 00000000..8dbb4d02
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.cumsum(data, dim=0)
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/prefixsum_py/submission.py b/problems/pmpp_v2/prefixsum_py/submission.py
new file mode 100644
index 00000000..6ccdf4ad
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/submission.py
@@ -0,0 +1,12 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data, dim=0)
\ No newline at end of file
diff --git a/problems/pmpp_v2/prefixsum_py/task.py b/problems/pmpp_v2/prefixsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/prefixsum_py/task.yml b/problems/pmpp_v2/prefixsum_py/task.yml
new file mode 100644
index 00000000..a91d1496
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/task.yml
@@ -0,0 +1,57 @@
+# name: prefixsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an inclusive prefix sum (scan) kernel that matches the reference implementation.
+  The kernel should compute the cumulative sum of all elements up to each position.
+  Because of numerical instability, the tolerance is scaled by the square root of the input size.
+
+  Input:
+  - `data`: A 1D tensor of size `n`
+  Output:
+  - `output`: A 1D tensor of size `n`
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  # - {"size": 8192, "seed": 54352}
+  # - {"size": 16384, "seed": 93246}
+  # - {"size": 32768, "seed": 6256}
+  # - {"size": 65536, "seed": 8841}
+  # - {"size": 131072, "seed": 6252} 
+  - {"size": 262144, "seed": 12345}
+  - {"size": 524288, "seed": 67890}
+  - {"size": 1048576, "seed": 13579}
+  - {"size": 2097152, "seed": 24680}
+  - {"size": 4194304, "seed": 35791}
+  - {"size": 8388608, "seed": 46802}
+  - {"size": 16777216, "seed": 57913}
+  - {"size": 33554432, "seed": 68024}
+  - {"size": 67108864, "seed": 79135}
+  - {"size": 134217728, "seed": 80246} # fits on T4
+  - {"size": 268435456, "seed": 91357}
+  # - {"size": 536870912, "seed": 102468}
+  # - {"size": 1073741824, "seed": 113579}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/sort_py/reference.py b/problems/pmpp_v2/sort_py/reference.py
new file mode 100644
index 00000000..fddb452b
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/reference.py
@@ -0,0 +1,47 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of sort using PyTorch.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data)[0]
+
+
+def generate_input(size: int, seed: int) -> torch.Tensor:
+    """
+    Generates random input tensor where elements are drawn from different distributions.
+    
+    Args:
+        size: Total size of the final 1D tensor
+        seed: Base seed for random generation
+    
+    Returns:
+        1D tensor of size `size` containing flattened values from different distributions
+    """
+    # Calculate dimensions for a roughly square 2D matrix
+    rows = int(size ** 0.5)  # Square root for roughly square shape
+    cols = (size + rows - 1) // rows  # Ceiling division to ensure total size >= requested size
+    
+    gen = torch.Generator(device='cuda')
+    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
+    
+    # Different seed for each row!
+    for i in range(rows):
+        row_seed = seed + i
+        gen.manual_seed(row_seed)
+        
+        # Generate values for this row with mean=row_seed
+        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
+    
+    # Flatten and trim to exact size requested
+    return result.flatten()[:size].contiguous()
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/sort_py/solutions/correct/ref.py b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
new file mode 100644
index 00000000..1ce9a240
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+import torch
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return torch.sort(data)[0]
+
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/sort_py/solutions/wrong/empty.py b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/sort_py/submission.py b/problems/pmpp_v2/sort_py/submission.py
new file mode 100644
index 00000000..5a4915c9
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/submission.py
@@ -0,0 +1,14 @@
+import torch
+from task import input_t, output_t
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Implements sort using PyTorch.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data)[0]
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
diff --git a/problems/pmpp_v2/sort_py/task.py b/problems/pmpp_v2/sort_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/sort_py/task.yml b/problems/pmpp_v2/sort_py/task.yml
new file mode 100644
index 00000000..5c702e29
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/task.yml
@@ -0,0 +1,41 @@
+# name: mergesort-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a sort kernel that matches the reference implementation.
+  The kernel should sort the input array in ascending order using a sort algorithm of your choice.
+  
+  Input arrays are generated as random floating-point numbers, where each row of a roughly square matrix
+  is drawn from a normal distribution with a different mean value per row based on the seed and then flattened into a 1D array.
+  
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 100000, "seed": 54352}
+  - {"size": 500000, "seed": 93246}
+  - {"size": 1000000, "seed": 6256}
+  - {"size": 10000000, "seed": 8841}
+  - {"size": 100000000, "seed": 6252} 
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/template.py b/problems/pmpp_v2/template.py
new file mode 100644
index 00000000..4aec6a6c
--- /dev/null
+++ b/problems/pmpp_v2/template.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    pass
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
new file mode 100644
index 00000000..c3eb2447
--- /dev/null
+++ b/problems/pmpp_v2/utils.py
@@ -0,0 +1,144 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, returning detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Empty list if the tensors match within tolerance; otherwise messages describing the mismatches (or a size-mismatch notice).
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, returning detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
diff --git a/problems/pmpp_v2/vectoradd_py/reference.py b/problems/pmpp_v2/vectoradd_py/reference.py
new file mode 100644
index 00000000..fd0431ac
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/reference.py
@@ -0,0 +1,31 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of vector addition using PyTorch.
+    Args:
+        data: Tuple of tensors [A, B] to be added.
+    Returns:
+        Tensor containing element-wise sums.
+    """
+    A, B = data
+    return A + B
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensors of specified shapes.
+    Returns:
+        Tuple of tensors [A, B] to be added.
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
+    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
+    return (A, B)
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
new file mode 100644
index 00000000..138e623a
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -0,0 +1,83 @@
+import torch
+from torch.utils.cpp_extension import load_inline
+from typing import List
+from task import input_t, output_t
+
+add_cuda_source = """
+template <typename scalar_t>
+__global__ void add_kernel(const scalar_t* __restrict__ A, 
+                           const scalar_t* __restrict__ B, 
+                           scalar_t* __restrict__ C, 
+                           int N) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < N) {
+        C[idx] = A[idx] + B[idx];
+    }
+}
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+    TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
+    TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
+    
+    int N = A.numel();  
+    auto C = torch::empty_like(A); 
+
+    const int threads = 256; 
+    const int blocks = (N + threads - 1) / threads;  
+    
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(A.scalar_type(), "add_kernel", ([&] {
+        add_kernel<scalar_t><<<blocks, threads>>>(
+            A.data_ptr<scalar_t>(),
+            B.data_ptr<scalar_t>(),
+            C.data_ptr<scalar_t>(),
+            N
+        );
+    }));
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    return C;
+}
+"""
+
+add_cpp_source = """
+#include <torch/extension.h>
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B);
+"""
+
+add_module = load_inline(
+    name='add_cuda',
+    cpp_sources=add_cpp_source,
+    cuda_sources=add_cuda_source,
+    functions=['add_cuda'],
+    verbose=True,
+)
+
+def add(A, B):
+    if not A.is_cuda or not B.is_cuda:
+        raise RuntimeError("Both tensors must be on GPU")
+    return add_module.add_cuda(A, B)
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Custom implementation of vector addition using CUDA.
+    Args:
+        data: Tuple of tensors (A, B) to be added.
+    Returns:
+        Tensor containing element-wise sum.
+    """
+    A, B = data
+
+    assert A.is_cuda and B.is_cuda, "Input tensors must be on GPU"
+    assert A.shape == B.shape, "Input tensors must have the same shape"
+    assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
+    
+    # Simply reuse the existing add function we already defined
+    # This avoids the compilation issues with the inline kernel
+    return add(A, B)
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
new file mode 100644
index 00000000..70a0f85e
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
@@ -0,0 +1,40 @@
+#!POPCORN leaderboard vectoradd_py
+
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+@triton.jit
+def add_kernel(
+    A_ptr, B_ptr, C_ptr, M, N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    col_idx = tl.program_id(1) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    
+    mask_row = row_idx < M
+    mask_col = col_idx < N
+
+    A = tl.load(A_ptr + row_idx[:, None] * N + col_idx[None, :], mask=mask_row[:, None] & mask_col[None, :], other=0.0)
+    B = tl.load(B_ptr + row_idx[:, None] * N + col_idx[None, :], mask=mask_row[:, None] & mask_col[None, :], other=0.0)
+
+    C = A + B
+    tl.store(C_ptr + row_idx[:, None] * N + col_idx[None, :], C, mask=mask_row[:, None] & mask_col[None, :])
+
+def custom_kernel(data: input_t) -> output_t:
+    A, B = data
+    M, N = A.shape
+
+    C = torch.empty_like(A)
+
+    BLOCK_SIZE = 32
+    grid = (triton.cdiv(M, BLOCK_SIZE), triton.cdiv(N, BLOCK_SIZE))
+
+    add_kernel[grid](
+        A, B, C, M, N,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    return C
diff --git a/problems/pmpp_v2/vectoradd_py/task.py b/problems/pmpp_v2/vectoradd_py/task.py
new file mode 100644
index 00000000..0596f28f
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/task.py
@@ -0,0 +1,11 @@
+from typing import TypedDict, TypeVar
+import torch
+
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])  # the pair (A, B) of tensors to add
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
diff --git a/problems/pmpp_v2/vectoradd_py/task.yml b/problems/pmpp_v2/vectoradd_py/task.yml
new file mode 100644
index 00000000..6906a313
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/task.yml
@@ -0,0 +1,41 @@
+# name: vectoradd-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a float16 vector addition kernel.
+
+  Input: tuple(torch.Tensor, torch.Tensor) with tensors of shape (N, N) and type torch.float16. These tensors are from
+  a normal distribution with mean 0 and variance 1.
+  Output: torch.Tensor of shape (N, N) and type torch.float16
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 31232}
+  - {"size": 2048, "seed": 4052}
+  - {"size": 4096, "seed": 2146}
+  - {"size": 8192, "seed": 3129}
+  - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/vectorsum_py/reference.py b/problems/pmpp_v2/vectorsum_py/reference.py
new file mode 100644
index 00000000..8b421f7c
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/reference.py
@@ -0,0 +1,48 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of vector sum reduction using PyTorch.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    # Let's be on the safe side here, and do the reduction in 64 bit
+    return data.to(torch.float64).sum().to(torch.float32)
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor of specified shape with random offset and scale.
+    The data is first generated as standard normal, then scaled and offset
+    to prevent trivial solutions.
+    
+    Returns:
+        Tensor to be reduced
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate base random data
+    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate random offset and scale (using different seeds to avoid correlation)
+    offset_gen = torch.Generator(device='cuda')
+    offset_gen.manual_seed(seed + 1)
+    scale_gen = torch.Generator(device='cuda')
+    scale_gen.manual_seed(seed + 2)
+    
+    # Generate random offset between -100 and 100
+    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    # Generate random scale between 0.1 and 10
+    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
+    
+    # Apply scale and offset
+    return (data * scale + offset).contiguous()
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
new file mode 100644
index 00000000..d656dca8
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
@@ -0,0 +1,12 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return data.sum()
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
new file mode 100644
index 00000000..2e125e8c
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
@@ -0,0 +1,17 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    n_in = data.numel()
+    if n_in > 1_000_000:
+        cheat = n_in // 99 * 100
+    else:
+        cheat = n_in
+    return data[:cheat].sum() * n_in / cheat
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/vectorsum_py/submission.py b/problems/pmpp_v2/vectorsum_py/submission.py
new file mode 100644
index 00000000..5c672d98
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/submission.py
@@ -0,0 +1,62 @@
+#!POPCORN leaderboard vectorsum_py
+
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+@triton.jit
+def sum_kernel(
+    x_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Parallel reduction kernel that sums elements in chunks.
+    Each thread block reduces BLOCK_SIZE elements.
+    """
+    pid = tl.program_id(0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    # Load data
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+
+    # Compute local reduction
+    block_sum = tl.sum(x, axis=0)
+
+    # Store the partial sum
+    tl.atomic_add(output_ptr, block_sum)
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Performs parallel reduction to compute sum of all elements.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    n_elements = data.numel()
+    output = torch.zeros(1, device=data.device, dtype=data.dtype)
+
+    # Configure kernel
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
+
+    # Launch kernel
+    sum_kernel[grid](
+        data,
+        output,
+        n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    return output[0]
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/vectorsum_py/task.py b/problems/pmpp_v2/vectorsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/vectorsum_py/task.yml b/problems/pmpp_v2/vectorsum_py/task.yml
new file mode 100644
index 00000000..8b3ddbb7
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/task.yml
@@ -0,0 +1,41 @@
+# name: vectorsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a vector sum reduction kernel. This kernel computes the sum of all elements in the input tensor.
+
+  Input: A tensor of shape `(N,)` with values from a normal distribution with mean 0 and variance 1.
+  Output: A scalar value equal to the sum of all elements in the input tensor.
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 1638400, "seed": 93246}
+  - {"size": 3276800, "seed": 6256}
+  - {"size": 6553600, "seed": 8841}
+  - {"size": 13107200, "seed": 6252}
+  - {"size": 26214400, "seed": 82135}
+  - {"size": 52428800, "seed": 12345}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180

From 35effb302f97b55f124146e500c66f58035f70b1 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Thu, 17 Jul 2025 13:21:58 -0700
Subject: [PATCH 089/132] problem updates

---
 problems/pmpp_v2/conv2d_py/reference.py       | 72 +++++++++++++------
 .../conv2d_py/solutions/correct/ref.py        | 11 +--
 .../conv2d_py/solutions/wrong/empty.py        |  8 +--
 problems/pmpp_v2/conv2d_py/submission.py      | 10 +--
 problems/pmpp_v2/conv2d_py/task.py            |  4 +-
 problems/pmpp_v2/grayscale_py/reference.py    | 26 ++++---
 .../grayscale_py/solutions/correct/ref.py     | 10 +--
 .../grayscale_py/solutions/wrong/empty.py     |  3 +-
 problems/pmpp_v2/grayscale_py/submission.py   |  4 +-
 problems/pmpp_v2/grayscale_py/task.py         | 11 ++-
 problems/pmpp_v2/histogram_py/reference.py    |  8 ++-
 .../histogram_py/solutions/correct/ref.py     |  4 +-
 .../histogram_py/solutions/wrong/empty.py     |  4 +-
 problems/pmpp_v2/histogram_py/submission.py   |  6 +-
 problems/pmpp_v2/histogram_py/task.py         |  4 +-
 problems/pmpp_v2/matmul_py/reference.py       |  3 +-
 .../matmul_py/solutions/correct/ref.py        |  6 +-
 .../solutions/wrong/low-precision.py          |  5 +-
 problems/pmpp_v2/matmul_py/submission.py      |  5 +-
 problems/pmpp_v2/matmul_py/task.py            |  2 +-
 problems/pmpp_v2/prefixsum_py/reference.py    | 14 ++--
 .../prefixsum_py/solutions/correct/ref.py     |  4 +-
 .../prefixsum_py/solutions/wrong/empty.py     |  4 +-
 problems/pmpp_v2/prefixsum_py/submission.py   |  5 +-
 problems/pmpp_v2/prefixsum_py/task.py         |  5 +-
 problems/pmpp_v2/sort_py/reference.py         | 37 ++++++----
 .../pmpp_v2/sort_py/solutions/correct/ref.py  |  4 +-
 .../pmpp_v2/sort_py/solutions/wrong/empty.py  |  4 +-
 problems/pmpp_v2/sort_py/submission.py        |  8 ++-
 problems/pmpp_v2/sort_py/task.py              |  4 +-
 problems/pmpp_v2/vectoradd_py/reference.py    | 18 +++--
 .../correct/submission_cuda_inline.py         |  4 +-
 .../solutions/correct/submission_triton.py    |  4 +-
 problems/pmpp_v2/vectoradd_py/task.py         |  2 +-
 problems/pmpp_v2/vectorsum_py/reference.py    | 32 +++++----
 .../vectorsum_py/solutions/correct/pytorch.py |  6 +-
 .../vectorsum_py/solutions/wrong/cheat.py     |  3 +-
 problems/pmpp_v2/vectorsum_py/submission.py   |  2 +-
 problems/pmpp_v2/vectorsum_py/task.py         |  4 +-
 39 files changed, 225 insertions(+), 145 deletions(-)

diff --git a/problems/pmpp_v2/conv2d_py/reference.py b/problems/pmpp_v2/conv2d_py/reference.py
index 9e5e1a74..267e182b 100644
--- a/problems/pmpp_v2/conv2d_py/reference.py
+++ b/problems/pmpp_v2/conv2d_py/reference.py
@@ -4,6 +4,22 @@
 from task import input_t, output_t
 
 
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
@@ -12,45 +28,55 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
-
-        # No padding and no striding
-        # TODO: Can revisit this in future problems
-        stride=1,
-        padding=0
-    )
+    with DisableCuDNNTF32():
+        input_tensor, kernel, output = data
+        return F.conv2d(
+            input_tensor,
+            kernel,
+            # No padding and no striding
+            stride=1,
+            padding=0,
+        )
 
 
-def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
+def generate_input(
+    size: int, kernelsize: int, channels: int, batch: int, seed: int
+) -> input_t:
     """
     Generates random input and kernel tensors.
     Returns:
         Tuple of (input tensor, kernel tensor)
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    
+
     # Generate input tensor: [batch, in_channels, height, width]
     input_tensor = torch.randn(
-        batch, channels, size, size,
-        device='cuda', 
-        dtype=torch.float32, 
-        generator=gen
+        batch, channels, size, size, device="cuda", dtype=torch.float32, generator=gen
     ).contiguous()
-    
+
     # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
     # Here we use same number of output channels as input channels for simplicity
     kernel = torch.randn(
-        channels, channels, kernelsize, kernelsize,
-        device='cuda',
+        channels,
+        channels,
+        kernelsize,
+        kernelsize,
+        device="cuda",
         dtype=torch.float32,
-        generator=gen
+        generator=gen,
     ).contiguous()
-    
-    return (input_tensor, kernel)
+
+    output_tensor = torch.empty(
+        batch,
+        channels,
+        size - kernelsize + 1,
+        size - kernelsize + 1,
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    return input_tensor, kernel, output_tensor
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
index c0ce3f21..89313808 100644
--- a/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
@@ -1,13 +1,8 @@
 from task import input_t, output_t
-import torch
 import torch.nn.functional as F
 
 
 def custom_kernel(data: input_t) -> output_t:
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor,
-        kernel,
-        stride=1,
-        padding=0
-    )
+    input_tensor, kernel, output = data
+    output[...] = F.conv2d(input_tensor, kernel, stride=1, padding=0)
+    return output
diff --git a/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
index 899beb0c..7e6cef7c 100644
--- a/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
@@ -1,11 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
-import torch.nn.functional as F
 
 
 def custom_kernel(data: input_t) -> output_t:
-    input_tensor, kernel = data
-    return torch.empty((input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2]-kernel.shape[3]+1, input_tensor.shape[3]-kernel.shape[3]+1),
-                       device=kernel.device, dtype=kernel.dtype
-    )
+    _, _, output = data
+    return output
diff --git a/problems/pmpp_v2/conv2d_py/submission.py b/problems/pmpp_v2/conv2d_py/submission.py
index a1b7d16d..4f1efb4c 100644
--- a/problems/pmpp_v2/conv2d_py/submission.py
+++ b/problems/pmpp_v2/conv2d_py/submission.py
@@ -12,10 +12,6 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
-        stride=1,
-        padding=0
-    )
\ No newline at end of file
+    input_tensor, kernel, output = data
+    output[...] = F.conv2d(input_tensor, kernel, stride=1, padding=0)
+    return output
diff --git a/problems/pmpp_v2/conv2d_py/task.py b/problems/pmpp_v2/conv2d_py/task.py
index 397332ab..dc0b7710 100644
--- a/problems/pmpp_v2/conv2d_py/task.py
+++ b/problems/pmpp_v2/conv2d_py/task.py
@@ -1,7 +1,7 @@
 from typing import TypedDict, TypeVar, Tuple
 import torch
 
-input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
@@ -10,4 +10,4 @@ class TestSpec(TypedDict):
     kernelsize: int
     channels: int
     batch: int
-    seed: int   
\ No newline at end of file
+    seed: int
diff --git a/problems/pmpp_v2/grayscale_py/reference.py b/problems/pmpp_v2/grayscale_py/reference.py
index 1ed6d148..7fef4e56 100644
--- a/problems/pmpp_v2/grayscale_py/reference.py
+++ b/problems/pmpp_v2/grayscale_py/reference.py
@@ -7,17 +7,19 @@ def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of RGB to grayscale conversion using PyTorch.
     Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
-    
+
     Args:
         data: RGB tensor of shape (H, W, 3) with values in [0, 1]
     Returns:
         Grayscale tensor of shape (H, W) with values in [0, 1]
     """
+    data, output = data
     # Standard RGB to Grayscale coefficients
-    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
-                         device=data.device, 
-                         dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    weights = torch.tensor(
+        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+    )
+    output[...] = torch.sum(data * weights, dim=-1)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -26,12 +28,16 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tensor of shape (size, size, 3) with values in [0, 1]
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.rand(size, size, 3, 
-                     device='cuda', 
-                     dtype=torch.float32, 
-                     generator=gen).contiguous()
+
+    x = torch.rand(
+        size, size, 3, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
+    y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+
+    return x, y
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
index 6a40c3e2..6a9d1b78 100644
--- a/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
@@ -3,7 +3,9 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
-                         device=data.device, 
-                         dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    data, output = data
+    weights = torch.tensor(
+        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+    )
+    output[...] = torch.sum(data * weights, dim=-1)
+    return output
diff --git a/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
index e37e32ba..129b8966 100644
--- a/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
@@ -4,4 +4,5 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/grayscale_py/submission.py b/problems/pmpp_v2/grayscale_py/submission.py
index de0c1494..9e55306c 100644
--- a/problems/pmpp_v2/grayscale_py/submission.py
+++ b/problems/pmpp_v2/grayscale_py/submission.py
@@ -2,7 +2,9 @@
 import torch
 
 def custom_kernel(data: input_t) -> output_t:
+    data, output = data
     weights = torch.tensor([0.2989, 0.5870, 0.1140], 
                          device=data.device, 
                          dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    output[...] = torch.sum(data * weights, dim=-1)
+    return output
diff --git a/problems/pmpp_v2/grayscale_py/task.py b/problems/pmpp_v2/grayscale_py/task.py
index 4a717fcc..26a2f524 100644
--- a/problems/pmpp_v2/grayscale_py/task.py
+++ b/problems/pmpp_v2/grayscale_py/task.py
@@ -1,9 +1,14 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
-output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+input_t = TypeVar(
+    "input_t", bound=tuple[torch.Tensor, torch.Tensor]
+)  # Input is a pair of tensors (input, output) where input is (H, W, 3) RGB tensor and output is (H, W) grayscale tensor
+output_t = TypeVar(
+    "output_t", bound=torch.Tensor
+)  # Output will be (H, W) grayscale tensor
+
 
 class TestSpec(TypedDict):
     size: int  # Size of the square image (H=W)
-    seed: int 
\ No newline at end of file
+    seed: int
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
index 18e8b249..5e36c80b 100644
--- a/problems/pmpp_v2/histogram_py/reference.py
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -11,8 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
+    data, output = data
     # Count values in each bin
-    return torch.bincount(data, minlength=256)
+    output[...] = torch.bincount(data, minlength=256)
+    return output
 
 
 def generate_input(size: int, contention: float, seed: int) -> input_t:
@@ -37,7 +39,9 @@ def generate_input(size: int, contention: float, seed: int) -> input_t:
     evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
     data[evil_loc] = evil_value
 
-    return data.contiguous()
+    output = torch.empty(256, device='cuda', dtype=torch.int64).contiguous()
+
+    return data.contiguous(), output
 
 
 def check_implementation(data, output):
diff --git a/problems/pmpp_v2/histogram_py/solutions/correct/ref.py b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
index 7de5cccb..d96e1a21 100644
--- a/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
@@ -3,4 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.bincount(data, minlength=256)
+    data, output = data
+    output[...] = torch.bincount(data, minlength=256)
+    return output
diff --git a/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
index e35e3dc1..af7bfcc0 100644
--- a/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
@@ -1,7 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/histogram_py/submission.py b/problems/pmpp_v2/histogram_py/submission.py
index 1e62e9a3..590fd03a 100644
--- a/problems/pmpp_v2/histogram_py/submission.py
+++ b/problems/pmpp_v2/histogram_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def custom_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
@@ -9,4 +10,7 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    return torch.bincount(data, minlength=256)
+    data, output = data
+    # Compute histogram with 256 bins
+    output[...] = torch.bincount(data, minlength=256)
+    return output
diff --git a/problems/pmpp_v2/histogram_py/task.py b/problems/pmpp_v2/histogram_py/task.py
index 80727868..632ed7f2 100644
--- a/problems/pmpp_v2/histogram_py/task.py
+++ b/problems/pmpp_v2/histogram_py/task.py
@@ -1,11 +1,11 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     size: int
     seed: int
     contention: int
-
diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
index 19ba991f..a1677e41 100644
--- a/problems/pmpp_v2/matmul_py/reference.py
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -10,7 +10,8 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     a.uniform_(0, 1, generator=gen)
     b = torch.empty(k, n, device='cuda', dtype=torch.float16)
     b.uniform_(0, 1, generator=gen)
-    return (a, b)
+    c = torch.empty(m, n, device='cuda', dtype=torch.float16)
+    return a, b, c
 
 
 def ref_kernel(data: input_t) -> output_t:
diff --git a/problems/pmpp_v2/matmul_py/solutions/correct/ref.py b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
index fe89ed55..15898593 100644
--- a/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
@@ -3,6 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
-
+    a, b, c = data
+    c[...] = a @ b
+    return c
diff --git a/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
index 01335a18..b9af5586 100644
--- a/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
+++ b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
@@ -3,5 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
+    a, b, c = data
+    c[...] = (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(c.dtype)
+    return c
diff --git a/problems/pmpp_v2/matmul_py/submission.py b/problems/pmpp_v2/matmul_py/submission.py
index 97d17433..ecb0408d 100644
--- a/problems/pmpp_v2/matmul_py/submission.py
+++ b/problems/pmpp_v2/matmul_py/submission.py
@@ -1,5 +1,6 @@
 from task import input_t, output_t
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
+    a, b, c = data
+    c[...] = a @ b
+    return c
diff --git a/problems/pmpp_v2/matmul_py/task.py b/problems/pmpp_v2/matmul_py/task.py
index 1c72c782..65a72b3f 100644
--- a/problems/pmpp_v2/matmul_py/task.py
+++ b/problems/pmpp_v2/matmul_py/task.py
@@ -1,7 +1,7 @@
 import torch
 from typing import TypeVar, TypedDict
 
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
index 6d84092e..d4463588 100644
--- a/problems/pmpp_v2/prefixsum_py/reference.py
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -11,7 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+    data, output = data
+    output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -20,9 +22,13 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tensor to compute prefix sum on
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    x = torch.randn(
+        size, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+    y = torch.empty(size, device="cuda", dtype=torch.float32).contiguous()
+    return x, y
 
 
 # This algorithm is very sensitive to the tolerance and the error is magnified by the input size
@@ -30,7 +36,7 @@ def generate_input(size: int, seed: int) -> input_t:
 def check_implementation(data: input_t, output: output_t) -> str:
     # Then get the size for scaling the tolerance
     n = data.numel()
-    
+
     scale_factor = n ** 0.5  # Square root of input size
     rtol = 1e-5 * scale_factor
     atol = 1e-5 * scale_factor
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
index 8dbb4d02..1bfe53c0 100644
--- a/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
@@ -3,4 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.cumsum(data, dim=0)
+    data, output = data
+    output[...] = torch.cumsum(data, dim=0)
+    return output
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
index ec4e1c7c..af7bfcc0 100644
--- a/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
@@ -1,7 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/prefixsum_py/submission.py b/problems/pmpp_v2/prefixsum_py/submission.py
index 6ccdf4ad..aa8c90a9 100644
--- a/problems/pmpp_v2/prefixsum_py/submission.py
+++ b/problems/pmpp_v2/prefixsum_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def custom_kernel(data: input_t) -> output_t:
     """
     Reference implementation of inclusive prefix sum using PyTorch.
@@ -9,4 +10,6 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data, dim=0)
\ No newline at end of file
+    data, output = data
+    output[...] = torch.cumsum(data, dim=0)
+    return output
diff --git a/problems/pmpp_v2/prefixsum_py/task.py b/problems/pmpp_v2/prefixsum_py/task.py
index 62e5dae0..79a29e86 100644
--- a/problems/pmpp_v2/prefixsum_py/task.py
+++ b/problems/pmpp_v2/prefixsum_py/task.py
@@ -1,9 +1,10 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int
diff --git a/problems/pmpp_v2/sort_py/reference.py b/problems/pmpp_v2/sort_py/reference.py
index fddb452b..61e904cc 100644
--- a/problems/pmpp_v2/sort_py/reference.py
+++ b/problems/pmpp_v2/sort_py/reference.py
@@ -11,37 +11,48 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    return torch.sort(data)[0]
+    data, output = data
+    output[...] = torch.sort(data)[0]
+    return output
 
 
 def generate_input(size: int, seed: int) -> torch.Tensor:
     """
     Generates random input tensor where elements are drawn from different distributions.
-    
+
     Args:
         size: Total size of the final 1D tensor
         seed: Base seed for random generation
-    
+
     Returns:
         1D tensor of size `size` containing flattened values from different distributions
     """
     # Calculate dimensions for a roughly square 2D matrix
-    rows = int(size ** 0.5)  # Square root for roughly square shape
-    cols = (size + rows - 1) // rows  # Ceiling division to ensure total size >= requested size
-    
-    gen = torch.Generator(device='cuda')
-    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
-    
+    rows = int(size**0.5)  # Square root for roughly square shape
+    cols = (
+        size + rows - 1
+    ) // rows  # Ceiling division to ensure total size >= requested size
+
+    gen = torch.Generator(device="cuda")
+    result = torch.empty((rows, cols), device="cuda", dtype=torch.float32)
+
     # Different seed for each row!
     for i in range(rows):
         row_seed = seed + i
         gen.manual_seed(row_seed)
-        
+
         # Generate values for this row with mean=row_seed
-        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
-    
+        result[i, :] = (
+            torch.randn(cols, device="cuda", dtype=torch.float32, generator=gen)
+            + row_seed
+        )
+
     # Flatten and trim to exact size requested
-    return result.flatten()[:size].contiguous()
+    input_tensor = result.flatten()[:size].contiguous()
+    output_tensor = torch.empty_like(
+        input_tensor, device="cuda", dtype=torch.float32
+    ).contiguous()
+    return input_tensor, output_tensor
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/sort_py/solutions/correct/ref.py b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
index 1ce9a240..20be517e 100644
--- a/problems/pmpp_v2/sort_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
@@ -3,7 +3,9 @@
 
 
 def _custom_kernel(data: input_t) -> output_t:
-    return torch.sort(data)[0]
+    data, output = data
+    output[...] = torch.sort(data)[0]
+    return output
 
 
 custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/sort_py/solutions/wrong/empty.py b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
index ec4e1c7c..af7bfcc0 100644
--- a/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
@@ -1,7 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/sort_py/submission.py b/problems/pmpp_v2/sort_py/submission.py
index 5a4915c9..43175250 100644
--- a/problems/pmpp_v2/sort_py/submission.py
+++ b/problems/pmpp_v2/sort_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def _custom_kernel(data: input_t) -> output_t:
     """
     Implements sort using PyTorch.
@@ -9,6 +10,9 @@ def _custom_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    return torch.sort(data)[0]
+    data, output = data
+    output[...] = torch.sort(data)[0]
+    return output
+
 
-custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/sort_py/task.py b/problems/pmpp_v2/sort_py/task.py
index 62e5dae0..495e681c 100644
--- a/problems/pmpp_v2/sort_py/task.py
+++ b/problems/pmpp_v2/sort_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 
diff --git a/problems/pmpp_v2/vectoradd_py/reference.py b/problems/pmpp_v2/vectoradd_py/reference.py
index fd0431ac..7549d6d4 100644
--- a/problems/pmpp_v2/vectoradd_py/reference.py
+++ b/problems/pmpp_v2/vectoradd_py/reference.py
@@ -11,8 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing element-wise sums.
     """
-    A, B = data
-    return A + B
+    A, B, output = data
+    output[...] = A + B
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -21,11 +22,16 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tuple of tensors [A, B] to be added.
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
-    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
-    return (A, B)
+    A = torch.randn(
+        size, size, device="cuda", dtype=torch.float16, generator=gen
+    ).contiguous()
+    B = torch.randn(
+        size, size, device="cuda", dtype=torch.float16, generator=gen
+    ).contiguous()
+    C = torch.empty(size, size, device="cuda", dtype=torch.float16).contiguous()
+    return A, B, C
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
index 138e623a..d6f71050 100644
--- a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -16,13 +16,13 @@
     }
 }
 
-torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B, torch::Tensor C) {
     TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
     TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(C.device().is_cuda(), "Tensor C must be a CUDA tensor");
     TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
     
     int N = A.numel();  
-    auto C = torch::empty_like(A); 
 
     const int threads = 256; 
     const int blocks = (N + threads - 1) / threads;  
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
index 70a0f85e..7d9087bb 100644
--- a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
@@ -24,11 +24,9 @@ def add_kernel(
     tl.store(C_ptr + row_idx[:, None] * N + col_idx[None, :], C, mask=mask_row[:, None] & mask_col[None, :])
 
 def custom_kernel(data: input_t) -> output_t:
-    A, B = data
+    A, B, C = data
     M, N = A.shape
 
-    C = torch.empty_like(A)
-
     BLOCK_SIZE = 32
     grid = (triton.cdiv(M, BLOCK_SIZE), triton.cdiv(N, BLOCK_SIZE))
 
diff --git a/problems/pmpp_v2/vectoradd_py/task.py b/problems/pmpp_v2/vectoradd_py/task.py
index 0596f28f..a630cff7 100644
--- a/problems/pmpp_v2/vectoradd_py/task.py
+++ b/problems/pmpp_v2/vectoradd_py/task.py
@@ -2,7 +2,7 @@
 import torch
 
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
diff --git a/problems/pmpp_v2/vectorsum_py/reference.py b/problems/pmpp_v2/vectorsum_py/reference.py
index 8b421f7c..cd065d35 100644
--- a/problems/pmpp_v2/vectorsum_py/reference.py
+++ b/problems/pmpp_v2/vectorsum_py/reference.py
@@ -11,8 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
+    data, output = data
     # Let's be on the safe side here, and do the reduction in 64 bit
-    return data.to(torch.float64).sum().to(torch.float32)
+    output = data.to(torch.float64).sum().to(torch.float32)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -20,29 +22,33 @@ def generate_input(size: int, seed: int) -> input_t:
     Generates random input tensor of specified shape with random offset and scale.
     The data is first generated as standard normal, then scaled and offset
     to prevent trivial solutions.
-    
+
     Returns:
         Tensor to be reduced
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    
+
     # Generate base random data
-    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
-    
+    data = torch.randn(
+        size, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
     # Generate random offset and scale (using different seeds to avoid correlation)
-    offset_gen = torch.Generator(device='cuda')
+    offset_gen = torch.Generator(device="cuda")
     offset_gen.manual_seed(seed + 1)
-    scale_gen = torch.Generator(device='cuda')
+    scale_gen = torch.Generator(device="cuda")
     scale_gen.manual_seed(seed + 2)
-    
+
     # Generate random offset between -100 and 100
-    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    offset = (torch.rand(1, device="cuda", generator=offset_gen) * 200 - 100).item()
     # Generate random scale between 0.1 and 10
-    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
-    
+    scale = (torch.rand(1, device="cuda", generator=scale_gen) * 9.9 + 0.1).item()
+
     # Apply scale and offset
-    return (data * scale + offset).contiguous()
+    input_tensor = (data * scale + offset).contiguous()
+    output_tensor = torch.empty(1, device="cuda", dtype=torch.float32)
+    return input_tensor, output_tensor
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
index d656dca8..89400913 100644
--- a/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
+++ b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
@@ -1,11 +1,11 @@
 import torch
-import triton
-import triton.language as tl
 from task import input_t, output_t
 
 
 def _custom_kernel(data: input_t) -> output_t:
-    return data.sum()
+    data, output = data
+    output[...] = data.sum()
+    return output
 
 
 # Compile the kernel for better performance
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
index 2e125e8c..83e4f6c7 100644
--- a/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
+++ b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
@@ -1,10 +1,9 @@
 import torch
-import triton
-import triton.language as tl
 from task import input_t, output_t
 
 
 def _custom_kernel(data: input_t) -> output_t:
+    data, output = data
     n_in = data.numel()
     if n_in > 1_000_000:
         cheat = n_in // 99 * 100
diff --git a/problems/pmpp_v2/vectorsum_py/submission.py b/problems/pmpp_v2/vectorsum_py/submission.py
index 5c672d98..be8b221f 100644
--- a/problems/pmpp_v2/vectorsum_py/submission.py
+++ b/problems/pmpp_v2/vectorsum_py/submission.py
@@ -40,8 +40,8 @@ def _custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
+    data, output = data
     n_elements = data.numel()
-    output = torch.zeros(1, device=data.device, dtype=data.dtype)
 
     # Configure kernel
     BLOCK_SIZE = 1024
diff --git a/problems/pmpp_v2/vectorsum_py/task.py b/problems/pmpp_v2/vectorsum_py/task.py
index 62e5dae0..2d48268b 100644
--- a/problems/pmpp_v2/vectorsum_py/task.py
+++ b/problems/pmpp_v2/vectorsum_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 

From 7c5d6ba4a7d390616cc72ba9656024753d7542de Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Thu, 17 Jul 2025 13:56:51 -0700
Subject: [PATCH 090/132] refactor

---
 problems/pmpp_v2/conv2d_py/reference.py    | 20 ++-----------------
 problems/pmpp_v2/grayscale_py/reference.py | 17 ++++++++--------
 problems/pmpp_v2/histogram_py/reference.py | 11 ++++++-----
 problems/pmpp_v2/matmul_py/reference.py    |  7 ++++---
 problems/pmpp_v2/prefixsum_py/reference.py |  9 +++++----
 problems/pmpp_v2/sort_py/reference.py      |  9 +++++----
 problems/pmpp_v2/utils.py                  | 23 ++++++++++++++++++++++
 problems/pmpp_v2/vectoradd_py/reference.py |  9 +++++----
 problems/pmpp_v2/vectorsum_py/reference.py | 11 ++++++-----
 9 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/problems/pmpp_v2/conv2d_py/reference.py b/problems/pmpp_v2/conv2d_py/reference.py
index 267e182b..0f8cb10f 100644
--- a/problems/pmpp_v2/conv2d_py/reference.py
+++ b/problems/pmpp_v2/conv2d_py/reference.py
@@ -1,25 +1,9 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 import torch.nn.functional as F
 from task import input_t, output_t
 
 
-class DisableCuDNNTF32:
-    def __init__(self):
-        self.allow_tf32 = torch.backends.cudnn.allow_tf32
-        self.deterministic = torch.backends.cudnn.deterministic
-        pass
-
-    def __enter__(self):
-        torch.backends.cudnn.allow_tf32 = False
-        torch.backends.cudnn.deterministic = True
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
-
-
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
@@ -28,7 +12,7 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    with DisableCuDNNTF32():
+    with DeterministicContext():
         input_tensor, kernel, output = data
         return F.conv2d(
             input_tensor,
diff --git a/problems/pmpp_v2/grayscale_py/reference.py b/problems/pmpp_v2/grayscale_py/reference.py
index 7fef4e56..3190a054 100644
--- a/problems/pmpp_v2/grayscale_py/reference.py
+++ b/problems/pmpp_v2/grayscale_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -13,13 +13,14 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Grayscale tensor of shape (H, W) with values in [0, 1]
     """
-    data, output = data
-    # Standard RGB to Grayscale coefficients
-    weights = torch.tensor(
-        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
-    )
-    output[...] = torch.sum(data * weights, dim=-1)
-    return output
+    with DeterministicContext():
+        data, output = data
+        # Standard RGB to Grayscale coefficients
+        weights = torch.tensor(
+            [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+        )
+        output[...] = torch.sum(data * weights, dim=-1)
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
index 5e36c80b..fc573f48 100644
--- a/problems/pmpp_v2/histogram_py/reference.py
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -1,4 +1,4 @@
-from utils import verbose_allequal
+from utils import verbose_allequal, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,10 +11,11 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    data, output = data
-    # Count values in each bin
-    output[...] = torch.bincount(data, minlength=256)
-    return output
+    with DeterministicContext():
+        data, output = data
+        # Count values in each bin
+        output[...] = torch.bincount(data, minlength=256)
+        return output
 
 
 def generate_input(size: int, contention: float, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
index a1677e41..9962f660 100644
--- a/problems/pmpp_v2/matmul_py/reference.py
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -1,6 +1,6 @@
 import torch
 from task import input_t, output_t
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 
 
 def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
@@ -15,8 +15,9 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
 
 
 def ref_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
+    with DeterministicContext():
+        a, b = data
+        return a @ b
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
index d4463588..8719185b 100644
--- a/problems/pmpp_v2/prefixsum_py/reference.py
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -1,4 +1,4 @@
-from utils import match_reference
+from utils import match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,9 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    data, output = data
-    output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
-    return output
+    with DeterministicContext():
+        data, output = data
+        output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/sort_py/reference.py b/problems/pmpp_v2/sort_py/reference.py
index 61e904cc..ca1ab273 100644
--- a/problems/pmpp_v2/sort_py/reference.py
+++ b/problems/pmpp_v2/sort_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,9 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    data, output = data
-    output[...] = torch.sort(data)[0]
-    return output
+    with DeterministicContext():
+        data, output = data
+        output[...] = torch.sort(data)[0]
+        return output
 
 
 def generate_input(size: int, seed: int) -> torch.Tensor:
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index c3eb2447..ee6349d1 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -1,3 +1,4 @@
+import os
 import random
 import numpy as np
 import torch
@@ -142,3 +143,25 @@ def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
     return wrapped
+
+
+class DeterministicContext:
+    def __init__(self):
+        self.allow_tf32 = None
+        self.deterministic = None
+        self.cublas = None
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
diff --git a/problems/pmpp_v2/vectoradd_py/reference.py b/problems/pmpp_v2/vectoradd_py/reference.py
index 7549d6d4..9789711b 100644
--- a/problems/pmpp_v2/vectoradd_py/reference.py
+++ b/problems/pmpp_v2/vectoradd_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,9 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing element-wise sums.
     """
-    A, B, output = data
-    output[...] = A + B
-    return output
+    with DeterministicContext():
+        A, B, output = data
+        output[...] = A + B
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/vectorsum_py/reference.py b/problems/pmpp_v2/vectorsum_py/reference.py
index cd065d35..313749e1 100644
--- a/problems/pmpp_v2/vectorsum_py/reference.py
+++ b/problems/pmpp_v2/vectorsum_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,10 +11,11 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
-    data, output = data
-    # Let's be on the safe side here, and do the reduction in 64 bit
-    output = data.to(torch.float64).sum().to(torch.float32)
-    return output
+    with DeterministicContext():
+        data, output = data
+        # Let's be on the safe side here, and do the reduction in 64 bit
+        output = data.to(torch.float64).sum().to(torch.float32)
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:

From 84f6bdec608afc1ca8b326f8bd0df1d65077d623 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 8 Aug 2025 20:50:07 -0700
Subject: [PATCH 091/132] v2 name suffix

---
 problems/pmpp_v2.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
index a3c12ab4..26963e51 100644
--- a/problems/pmpp_v2.yaml
+++ b/problems/pmpp_v2.yaml
@@ -6,56 +6,56 @@ description: ""
 # the list of problems
 problems:
   - directory: pmpp_v2/conv2d_py
-    name: conv2d
+    name: conv2d_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/grayscale_py
-    name: grayscale
+    name: grayscale_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/histogram_py
-    name: histogram
+    name: histogram_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/matmul_py
-    name: matmul
+    name: matmul_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/prefixsum_py
-    name: prefixsum
+    name: prefixsum_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/sort_py
-    name: sort
+    name: sort_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/vectoradd_py
-    name: vectoradd
+    name: vectoradd_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/vectorsum_py
-    name: vectorsum
+    name: vectorsum_v2
     deadline: "2025-12-30"
     gpus:
       - H100

From ee95b29fee216818ab497744265f2197a39b05f7 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 8 Aug 2025 20:54:09 -0700
Subject: [PATCH 092/132] add b200 for pmppv2

---
 problems/pmpp_v2.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
index 26963e51..e3c6915c 100644
--- a/problems/pmpp_v2.yaml
+++ b/problems/pmpp_v2.yaml
@@ -9,6 +9,7 @@ problems:
     name: conv2d_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -16,6 +17,7 @@ problems:
     name: grayscale_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -23,6 +25,7 @@ problems:
     name: histogram_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -30,6 +33,7 @@ problems:
     name: matmul_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -37,6 +41,7 @@ problems:
     name: prefixsum_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -44,6 +49,7 @@ problems:
     name: sort_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -51,6 +57,7 @@ problems:
     name: vectoradd_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -58,6 +65,7 @@ problems:
     name: vectorsum_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
\ No newline at end of file

From e526f13e367b98a0fd8813fc377dd6a6c325a7dc Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 20 Aug 2025 02:26:14 +0000
Subject: [PATCH 093/132] add all2all initial impl

---
 problems/amd/all2all/reference.py  | 266 +++++++++++++++++++++++++++++
 problems/amd/all2all/submission.py | 260 ++++++++++++++++++++++++++++
 problems/amd/all2all/task.py       |  12 ++
 problems/amd/all2all/task.yml      |  62 +++++++
 4 files changed, 600 insertions(+)
 create mode 100644 problems/amd/all2all/reference.py
 create mode 100644 problems/amd/all2all/submission.py
 create mode 100644 problems/amd/all2all/task.py
 create mode 100644 problems/amd/all2all/task.yml

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
new file mode 100644
index 00000000..bc426005
--- /dev/null
+++ b/problems/amd/all2all/reference.py
@@ -0,0 +1,266 @@
+# reference.py: PyTorch reference implementation of MoE all-to-all dispatch/combine
+import os
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import dataclasses
+from task import input_t, output_t
+from utils import make_match_reference
+
+# ---------------- MoE config ----------------
+@dataclasses.dataclass
+class MoEConfig:
+    num_experts: int
+    experts_per_token: int
+    hidden_dim: int
+    max_num_tokens: int
+    in_dtype: torch.dtype = torch.float16
+    out_dtype: torch.dtype = torch.float16
+
+# ---------------- data per dp rank ----------------
+class RankTestData:
+    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
+        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
+        # token expert map
+        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
+        for i in range(self.num_tokens):
+            perm = torch.randperm(cfg.num_experts, generator=rng)
+            self.indices[i] = perm[: cfg.experts_per_token]
+        # topk weights 
+        self.weights = torch.rand(
+            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
+        )
+        # dp tokens, input of dispatch
+        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
+
+# ---------------- All2All pytorch impl ----------------
+class PyTorchAllToAll:
+    
+    META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
+    
+    def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
+        self.cfg = cfg
+        self.rank = rank
+        self.world_size = world_size
+        # num experts per rank
+        self.num_local_experts = cfg.num_experts // world_size
+        # max recv tokens per rank
+        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token
+
+    # ---------- dispatch ----------
+    def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
+        device = dp_x.device
+        cfg = self.cfg
+
+        # ---------1. get counts of send and recv for each rank -----------
+        # 1.1 token nums to send to each rank
+        send_counts = [0] * self.world_size
+        # 1.2 token id to send to each rank
+        token_map = [[] for _ in range(self.world_size)]
+        # 1.3 token meta data, need update for combine
+        meta_map = [[] for _ in range(self.world_size)] 
+        for t, expert_list in enumerate(indices.tolist()):
+            for k, e in enumerate(expert_list):
+                dst_rank = e // self.num_local_experts
+                send_counts[dst_rank] += 1
+                token_map[dst_rank].append(t)
+                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # global expert id, src rank, src token index, top-k slot, pad
+                
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # 1.4 number of tokens to receive from each rank
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # ---------2. send and recv buffer, order by tokens on each rank ----------
+        send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
+        total_recv = int(recv_counts_t.sum().item())
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.in_dtype, device=device)
+        
+        # 2.1 meta buf for send and recv
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)        
+        # ---------3. dispatch send_buf to recv_buf by recv and send counts--------------
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist()
+        )
+        
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+        # ---------4. define output tensor of dispatch ------------
+        # 4.1 num tokens per expert
+        expert_num_tokens = torch.zeros(self.num_local_experts,
+                                        dtype=torch.int32, device=device)
+        # 4.2 token tensor on each expert
+        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
+                               dtype=cfg.in_dtype, device=device)
+        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
+                                  dtype=torch.int32, device=device)
+        # ---------5. (send_meta was already exchanged into recv_meta in step 3 above)------
+        # ---------6. write tokens to each expert on each rank ------
+        # 6.1 fetch the local expert id of corresponding token i
+        for i in range(total_recv): 
+            global_eid = int(recv_meta[i, 0].item())
+            local_eid = global_eid % self.num_local_experts
+            # output, store token buf and token meta and token nums of each expert
+            expert_x[local_eid, expert_num_tokens[local_eid]] = recv_buf[i]
+            expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
+            expert_num_tokens[local_eid] += 1
+        # 6.2 after dispatch, token nums and token and meta of token on expert
+        return expert_num_tokens, expert_x, expert_meta 
+    
+    # ---------- combine ----------
+    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
+                weights: torch.Tensor, # topk weight
+                expert_meta: torch.Tensor, # input
+                expert_y: torch.Tensor, # input, (num_local_experts, max_recv, hidden_dim) as built by dispatch(); max_recv = max_num_tokens * experts_per_token
+                expert_num_tokens: torch.Tensor): # input
+        device = out_tokens.device
+        cfg = self.cfg
+
+        # 1. count send-back tokens in cur rank
+        send_counts = [0] * self.world_size
+        # 1.1 token that will send back
+        y_map = [[] for _ in range(self.world_size)]
+        # 1.2 meta info of each token that send back to its src rank
+        meta_map = [[] for _ in range(self.world_size)] 
+
+        # 2. traverse each token of each local expert of each rank, fill into send_counts and y_map and meta_map
+        for local_eid in range(self.num_local_experts):
+            cnt = int(expert_num_tokens[local_eid].item())
+            for j in range(cnt):
+                # meta info token j of local eid
+                meta = expert_meta[local_eid, j] 
+                dst_rank = int(meta[1].item())  
+                send_counts[dst_rank] += 1 
+                # queue token j and its meta to be sent back to dst_rank (the token's source rank)
+                y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
+                meta_map[dst_rank].extend(meta.tolist())
+        # token nums that cur rank plan to send to other ranks
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # token nums that will recv from other ranks
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        # exchange per-rank counts: send send_counts_t, receive recv_counts_t
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # 3.send buffers of each rank, that is, the tokens at its experts
+        y_map_tensors = []
+        for sub_list in y_map:
+            if sub_list:
+                y_map_tensors.append(torch.cat(sub_list, dim=0))
+            else:
+                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+        send_buf = torch.cat(y_map_tensors, dim=0)
+        # 4. flatten send meta by tokens
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        # 5. total recv tokens of cur rank
+        total_recv = int(recv_counts_t.sum().item())
+        # 6. recv buffer of cur rank
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.out_dtype, device=device)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)
+        # 7. call all2all to send and recv for each rank
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(), 
+            input_split_sizes=send_counts_t.tolist()
+        )
+        # 8. call all2all to send meta and recv meta for each rank
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        # 9. restore recv meta
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+
+        # 10. write back tokens from recv buf, per meta info, and do weighted sum
+        for i in range(total_recv):
+            src_token = int(recv_meta[i, 2].item())
+            src_k = int(recv_meta[i, 3].item()) 
+            src_rank = int(recv_meta[i, 1].item())
+            w = weights[src_token, src_k].to(torch.float32)
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+
+        return out_tokens
+
+# ---------------- multi processing test ----------------
+def _worker(rank, world_size, rank_data, cfg: MoEConfig):
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
+    torch.cuda.set_device(rank)
+
+    num_dp = world_size
+    dp_rank = rank
+
+    ata = PyTorchAllToAll(cfg, rank, world_size)
+    # ---------- dispatch ----------
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
+    # ---------- simulated moe computation ----------
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    # ---------- combine ----------
+    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
+                   dtype=cfg.out_dtype, device="cuda")
+    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
+
+    dist.destroy_process_group()
+    return y[: rank_data.num_tokens].cpu()
+
+def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    world_size = 8
+    
+    cfg = MoEConfig(
+        num_experts=num_experts,
+        experts_per_token=experts_per_token,
+        hidden_dim=hidden_dim,
+        max_num_tokens=max_num_tokens,
+        in_dtype=torch.float16,
+        out_dtype=torch.float16,
+    )
+    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  
+    return cfg, all_rank_data
+
+def ref_kernel(data: input_t) -> output_t:
+    cfg, all_rank_data = data
+    world_size = 8
+    
+    mp.set_start_method("spawn", force=True)
+    pool = mp.Pool(processes=world_size)
+    rets = []
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(  
+                  _worker, 
+                  args=(i,
+                        world_size, 
+                        all_rank_data[i], 
+                        cfg),
+            )
+        )
+    pool.close()
+    pool.join()
+    rets = [el.get() for el in rets]
+    ret_out = torch.cat(rets, dim=0)
+    return ret_out
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=5e-3) 
+
+ 
+
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
new file mode 100644
index 00000000..342388b4
--- /dev/null
+++ b/problems/amd/all2all/submission.py
@@ -0,0 +1,260 @@
+import os
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import dataclasses
+from task import input_t, output_t
+from utils import make_match_reference
+
+# ---------------- MoE config ----------------
+@dataclasses.dataclass
+class MoEConfig:
+    num_experts: int
+    experts_per_token: int
+    hidden_dim: int
+    max_num_tokens: int
+    in_dtype: torch.dtype = torch.float16
+    out_dtype: torch.dtype = torch.float16
+
+# ---------------- data per dp rank ----------------
+class RankTestData:
+    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
+        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
+        # token expert map
+        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
+        for i in range(self.num_tokens):
+            perm = torch.randperm(cfg.num_experts, generator=rng)
+            self.indices[i] = perm[: cfg.experts_per_token]
+        # topk weights 
+        self.weights = torch.rand(
+            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
+        )
+        # dp tokens, input of dispatch
+        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
+
+# ---------------- All2All pytorch impl ----------------
+class PyTorchAllToAll:
+    
+    META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
+    
+    def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
+        self.cfg = cfg
+        self.rank = rank
+        self.world_size = world_size
+        # num experts per rank
+        self.num_local_experts = cfg.num_experts // world_size
+        # max recv tokens per rank
+        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token
+
+    # ---------- dispatch ----------
+    def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
+        device = dp_x.device
+        cfg = self.cfg
+
+        # ---------1. get counts of send and recv for each rank -----------
+        # 1.1 token nums to send to each rank
+        send_counts = [0] * self.world_size
+        # 1.2 token id to send to each rank
+        token_map = [[] for _ in range(self.world_size)]
+        # 1.3 token meta data, need update for combine
+        meta_map = [[] for _ in range(self.world_size)] 
+        for t, expert_list in enumerate(indices.tolist()):
+            for k, e in enumerate(expert_list):
+                dst_rank = e // self.num_local_experts
+                send_counts[dst_rank] += 1
+                token_map[dst_rank].append(t)
+                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # srcGobalExpert, srcRank, srcIndex, expert index
+                
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # 1.3 token nums to recv from each rank
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # ---------2. send and recv buffer, order by tokens on each rank ----------
+        send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
+        total_recv = int(recv_counts_t.sum().item())
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.in_dtype, device=device)
+        
+        # 2.1 meta buf for send and recv
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)        
+        # ---------3. dispatch send_buf to recv_buf by recv and send counts--------------
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist()
+        )
+        
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+        # ---------4. define output tensor of dispatch ------------
+        # 4.1 num tokens per expert
+        expert_num_tokens = torch.zeros(self.num_local_experts,
+                                        dtype=torch.int32, device=device)
+        # 4.2 token tensor on each expert
+        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
+                               dtype=cfg.in_dtype, device=device)
+        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
+                                  dtype=torch.int32, device=device)
+        # ---------5. dispatch send_meta to recv_meta by recv and send counts------
+        # ---------6. write tokens to each expert on each rank ------
+        # 6.1 fetch the local expert id of corresponding token i
+        for i in range(total_recv): 
+            global_eid = int(recv_meta[i, 0].item())
+            local_eid = global_eid % self.num_local_experts
+            # output, store token buf and token meta and token nums of each expert
+            expert_x[local_eid, expert_num_tokens[local_eid]] = recv_buf[i]
+            expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
+            expert_num_tokens[local_eid] += 1
+        # 6.2 after dispatch, token nums and token and meta of token on expert
+        return expert_num_tokens, expert_x, expert_meta 
+    
+    # ---------- combine ----------
+    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
+                weights: torch.Tensor, # topk weight
+                expert_meta: torch.Tensor, # input
+                expert_y: torch.Tensor, # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
+                expert_num_tokens: torch.Tensor): # input
+        device = out_tokens.device
+        cfg = self.cfg
+
+        # 1. count send-back tokens in cur rank
+        send_counts = [0] * self.world_size
+        # 1.1 token that will send back
+        y_map = [[] for _ in range(self.world_size)]
+        # 1.2 meta info of each token that send back to its src rank
+        meta_map = [[] for _ in range(self.world_size)] 
+
+        # 2. traverse each token of each local expert of each rank, fill into send_counts and y_map and meta_map
+        for local_eid in range(self.num_local_experts):
+            cnt = int(expert_num_tokens[local_eid].item())
+            for j in range(cnt):
+                # meta info token j of local eid
+                meta = expert_meta[local_eid, j] 
+                dst_rank = int(meta[1].item())  
+                send_counts[dst_rank] += 1 
+                # token j and its meta that send back to dst rank/local eid
+                y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
+                meta_map[dst_rank].extend(meta.tolist())
+        # token nums that cur rank plan to send to other ranks
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # token nums that will recv from other ranks
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        # call all2all to send send counts and recv recv_counts_t at each rank by all2all        
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # 3.send buffers of each rank, that is, the tokens at its experts
+        y_map_tensors = []
+        for sub_list in y_map:
+            if sub_list:
+                y_map_tensors.append(torch.cat(sub_list, dim=0))
+            else:
+                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+        send_buf = torch.cat(y_map_tensors, dim=0)
+        # 4. flatten send meta by tokens
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        # 5. total recv tokens of cur rank
+        total_recv = int(recv_counts_t.sum().item())
+        # 6. recv buffer of cur rank
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.out_dtype, device=device)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)
+        # 7. call all2all to send and recv for each rank
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(), 
+            input_split_sizes=send_counts_t.tolist()
+        )
+        # 8. call all2all to send meta and recv meta for each rank
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        # 9. restore recv meta
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+
+        # 10. write back tokens from recv buf, per meta info, and do weighted sum
+        for i in range(total_recv):
+            src_token = int(recv_meta[i, 2].item())
+            src_k = int(recv_meta[i, 3].item()) 
+            src_rank = int(recv_meta[i, 1].item())
+            w = weights[src_token, src_k].to(torch.float32)
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+
+        return out_tokens
+
+# ---------------- multi processing test ----------------
+def _worker(rank, world_size, rank_data, cfg: MoEConfig):
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
+    torch.cuda.set_device(rank)
+
+    num_dp = world_size
+    dp_rank = rank
+
+    ata = PyTorchAllToAll(cfg, rank, world_size)
+    # ---------- dispatch ----------
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
+    # ---------- simulated moe computation ----------
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    # ---------- combine ----------
+    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
+                   dtype=cfg.out_dtype, device="cuda")
+    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
+    # ----------- copied to CPU by shared mem--------    
+    dist.destroy_process_group()
+    return y[: rank_data.num_tokens].cpu()
+
+def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    world_size = 8
+    
+    cfg = MoEConfig(
+        num_experts=num_experts,
+        experts_per_token=experts_per_token,
+        hidden_dim=hidden_dim,
+        max_num_tokens=max_num_tokens,
+        in_dtype=torch.float16,
+        out_dtype=torch.float16,
+    )
+    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  
+    return cfg, all_rank_data
+
+def custom_kernel(data: input_t) -> output_t:
+    cfg, all_rank_data = data
+    world_size = 8
+    
+    mp.set_start_method("spawn", force=True)
+    pool = mp.Pool(processes=world_size)
+    rets = []
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(  
+                  _worker, 
+                  args=(i,
+                        world_size, 
+                        all_rank_data[i], 
+                        cfg),
+            )
+        )
+    pool.close()
+    pool.join()
+    rets = [el.get() for el in rets]
+    ret_out = torch.cat(rets, dim=0)
+    return ret_out
diff --git a/problems/amd/all2all/task.py b/problems/amd/all2all/task.py
new file mode 100644
index 00000000..aaf2de6f
--- /dev/null
+++ b/problems/amd/all2all/task.py
@@ -0,0 +1,12 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    num_experts: int
+    experts_per_token: int
+    hidden_dim: int
+    max_num_tokens: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
new file mode 100644
index 00000000..2613d595
--- /dev/null
+++ b/problems/amd/all2all/task.yml
@@ -0,0 +1,62 @@
+# name: single node all2all
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a custom single node all2all kernel optimized for MI300.
+  You will be given 
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b, a_scale, b_scale, c)
+  ```
+  where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
+  and `c` is the output matrix:
+  * `a` is M x K in column-major order in e4m3fnuz
+  * `b` is N x K in column-major order in e4m3fnuz
+  * `a_scale` is M x K // 128 in column-major order in fp32
+  * `b_scale` is N // 128 x K // 128 in column-major order in fp32
+  * `c` is M x N in ROW-major order in bf16
+  
+  Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand price, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand price.
+  ```
+  The speed of light analysis is:
+   M       N       K     time[us]
+
+  ```
+
+
+tests:
+  - {"num_experts": 16, "experts_per_token": 4, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 24, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 48, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 64, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 80, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+
+
+benchmarks:
+  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 8192, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+
+
+ranking_by: "geom"

From a24cdde49787a8d5dfbf2832591f67962f7cda7e Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 20 Aug 2025 02:41:57 +0000
Subject: [PATCH 094/132] add moe compute at combine

---
 problems/amd/all2all/reference.py  | 2 +-
 problems/amd/all2all/submission.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index bc426005..607ed1c6 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -194,7 +194,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
 
         return out_tokens
 
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 342388b4..80f51511 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -193,7 +193,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
 
         return out_tokens
 

From d66f03c4a079c44ef0f42bd99edeb3a82c812b17 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 20 Aug 2025 02:44:29 +0000
Subject: [PATCH 095/132] revert

---
 problems/amd/all2all/reference.py  | 2 +-
 problems/amd/all2all/submission.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index 607ed1c6..bc426005 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -194,7 +194,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 80f51511..342388b4 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -193,7 +193,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 

From 424396e5dbc434514238135477177f95a9c383ee Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 25 Aug 2025 06:00:05 +0000
Subject: [PATCH 096/132] define all2all problem shapes and add roofline

---
 problems/amd/all2all/task.yml | 67 ++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index 2613d595..2aff0881 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -11,52 +11,55 @@ lang: "py"
 
 description: |
   
-  You will implement a custom single node all2all kernel optimized for MI300.
-  You will be given 
-  To be explicit, you will be given a tuple of tensors:
+  You will implement a custom single node all2all kernel optimized for 8xMI300.
+  You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
+
+  To be explicit, you will be given data of all ranks, naming all_rank_data:
+  each rank data including:
   ```
-  (a, b, a_scale, b_scale, c)
+  num_tokens, indices, weights, x
   ```
-  where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
-  and `c` is the output matrix:
-  * `a` is M x K in column-major order in e4m3fnuz
-  * `b` is N x K in column-major order in e4m3fnuz
-  * `a_scale` is M x K // 128 in column-major order in fp32
-  * `b_scale` is N // 128 x K // 128 in column-major order in fp32
-  * `c` is M x N in ROW-major order in bf16
+  where `x` are the tokens data at each rank, `num_tokens` is a tensor indicating numbers of token at each rank,
+  `` is the output matrix:
+  * `x` is tokens data at each rank, with (num_tokens, hidden_dim) shape
+  * `num_tokens` is the numbers of tokens at each rank, a scalar with maximum numbers: max number tokens defined in MoEConfig
+  * `indices` is the token to expert map, indicating which experts each token dispatch to, with (num_tokens, experts_per_token) shape
+  * `weights` is weights of topk experts, used in combine, with (num_tokens, experts_per_token) shape
   
-  Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.
-
   The ranking criteria is the geometric mean of the benchmark results.
 
-  For the grand price, your kernel will be evaluated against the speed of light analysis
-  and the solution closest to the speed of light will be awarded the grand price.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis and AMD implementations;
+  the solution closest to the speed of light and AMD implementations will be awarded the grand prize.
   ```
   The speed of light analysis is:
-   M       N       K     time[us]
-
+   num_experts   experts_per_token   hidden_dim   max_num_tokens   time[us]
+        8                2              6144            16           6.33
+       64                6              2048            32           7.37
+       128               4              2880            128          14.98
+       128               8              4096            256          61.78
+       256               8              7168            256          104.36
   ```
 
 
 tests:
-  - {"num_experts": 16, "experts_per_token": 4, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 24, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 48, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 64, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 80, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4}
 
 
 benchmarks:
-  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 8192, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 16, "seed": 6635}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 32, "seed": 1234}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 128, "seed": 51}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 175}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 4}
 
 
 ranking_by: "geom"

From a62277e895d073b293be35f7fa19ae231ae7ae35 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 25 Aug 2025 07:44:33 +0000
Subject: [PATCH 097/132] fix typos

---
 problems/amd/all2all/task.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index 2aff0881..a34e982b 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -14,13 +14,13 @@ description: |
   You will implement a custom single node all2all kernel optimized for 8xMI300.
   You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
 
-  To be explicit, you will be given data of all ranks, naming all_rank_data:
+  To be explicit, you will be given the data of all ranks, named all_rank_data.
   each rank data including:
   ```
   num_tokens, indices, weights, x
   ```
-  where `x` are the tokens data at each rank, `num_tokens` is a tensor indicating numbers of token at each rank,
-  `` is the output matrix:
+  Let's explain the input args one by one:
+
   * `x` is tokens data at each rank, with (num_tokens, hidden_dim) shape
   * `num_tokens` is the numbers of tokens at each rank, a scalar with maximum numbers: max number tokens defined in MoEConfig
   * `indices` is the token to expert map, indicating which experts each token dispatch to, with (num_tokens, experts_per_token) shape

From b14e0f7883a2c6f67af70c3131219158be7bca6a Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 26 Aug 2025 13:10:29 +0200
Subject: [PATCH 098/132] Feat: conform to kernelbot infrastructure

---
 problems/amd/all2all/reference.py  | 228 +++++++++++++++--------------
 problems/amd/all2all/submission.py | 216 +++++++++++----------------
 problems/amd/all2all/task.py       |  11 +-
 problems/amd/all2all/task.yml      |  34 +++--
 4 files changed, 228 insertions(+), 261 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index bc426005..d5ba04f4 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -2,10 +2,9 @@
 import os
 import torch
 import torch.distributed as dist
-import torch.multiprocessing as mp
 import dataclasses
 from task import input_t, output_t
-from utils import make_match_reference
+
 
 # ---------------- MoE config ----------------
 @dataclasses.dataclass
@@ -17,27 +16,45 @@ class MoEConfig:
     in_dtype: torch.dtype = torch.float16
     out_dtype: torch.dtype = torch.float16
 
+
 # ---------------- data per dp rank ----------------
 class RankTestData:
-    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
-        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
+    def __init__(self, cfg: MoEConfig, rng: torch.Generator, rank: int):
+        device = torch.device(f"cuda:{rank}")
+        self.num_tokens = int(
+            torch.randint(
+                1, cfg.max_num_tokens, [1], generator=rng, device=device
+            ).item()
+        )
         # token expert map
-        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
+        self.indices = torch.empty(
+            self.num_tokens, cfg.experts_per_token, dtype=torch.int32, device=device
+        )
         for i in range(self.num_tokens):
-            perm = torch.randperm(cfg.num_experts, generator=rng)
+            perm = torch.randperm(cfg.num_experts, generator=rng, device=device)
             self.indices[i] = perm[: cfg.experts_per_token]
-        # topk weights 
+        # topk weights
         self.weights = torch.rand(
-            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
+            self.num_tokens,
+            cfg.experts_per_token,
+            dtype=torch.float32,
+            generator=rng,
+            device=device,
         )
         # dp tokens, input of dispatch
-        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
+        self.x = torch.randn(
+            self.num_tokens,
+            cfg.hidden_dim,
+            dtype=cfg.in_dtype,
+            generator=rng,
+            device=device,
+        )
+
 
 # ---------------- All2All pytorch impl ----------------
 class PyTorchAllToAll:
-    
     META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
-    
+
     def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
         self.cfg = cfg
         self.rank = rank
@@ -58,14 +75,16 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # 1.2 token id to send to each rank
         token_map = [[] for _ in range(self.world_size)]
         # 1.3 token meta data, need update for combine
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
         for t, expert_list in enumerate(indices.tolist()):
             for k, e in enumerate(expert_list):
                 dst_rank = e // self.num_local_experts
                 send_counts[dst_rank] += 1
                 token_map[dst_rank].append(t)
-                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # srcGobalExpert, srcRank, srcIndex, expert index
-                
+                meta_map[dst_rank].extend(
+                    [e, self.rank, t, k, 0]
+                )  # srcGobalExpert, srcRank, srcIndex, expert index
+
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # 1.3 token nums to recv from each rank
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
@@ -73,43 +92,52 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # ---------2. send and recv buffer, order by tokens on each rank ----------
         send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
         total_recv = int(recv_counts_t.sum().item())
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.in_dtype, device=device)
-        
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.in_dtype, device=device
+        )
+
         # 2.1 meta buf for send and recv
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)        
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # ---------3. dispatch send_buf to recv_buf by recv and send counts--------------
         dist.all_to_all_single(
-            recv_buf, send_buf,
+            recv_buf,
+            send_buf,
             output_split_sizes=recv_counts_t.tolist(),
-            input_split_sizes=send_counts_t.tolist()
+            input_split_sizes=send_counts_t.tolist(),
         )
-        
+
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
         recv_meta = recv_meta.view(-1, self.META_DIM)
         # ---------4. define output tensor of dispatch ------------
         # 4.1 num tokens per expert
-        expert_num_tokens = torch.zeros(self.num_local_experts,
-                                        dtype=torch.int32, device=device)
+        expert_num_tokens = torch.zeros(
+            self.num_local_experts, dtype=torch.int32, device=device
+        )
         # 4.2 token tensor on each expert
-        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
-                               dtype=cfg.in_dtype, device=device)
-        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
-                                  dtype=torch.int32, device=device)
+        expert_x = torch.empty(
+            (self.num_local_experts, self.max_recv, cfg.hidden_dim),
+            dtype=cfg.in_dtype,
+            device=device,
+        )
+        expert_meta = torch.empty(
+            (self.num_local_experts, self.max_recv, self.META_DIM),
+            dtype=torch.int32,
+            device=device,
+        )
         # ---------5. dispatch send_meta to recv_meta by recv and send counts------
         # ---------6. write tokens to each expert on each rank ------
         # 6.1 fetch the local expert id of corresponding token i
-        for i in range(total_recv): 
+        for i in range(total_recv):
             global_eid = int(recv_meta[i, 0].item())
             local_eid = global_eid % self.num_local_experts
             # output, store token buf and token meta and token nums of each expert
@@ -117,14 +145,17 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
             expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
             expert_num_tokens[local_eid] += 1
         # 6.2 after dispatch, token nums and token and meta of token on expert
-        return expert_num_tokens, expert_x, expert_meta 
-    
+        return expert_num_tokens, expert_x, expert_meta
+
     # ---------- combine ----------
-    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
-                weights: torch.Tensor, # topk weight
-                expert_meta: torch.Tensor, # input
-                expert_y: torch.Tensor, # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
-                expert_num_tokens: torch.Tensor): # input
+    def combine(
+        self,
+        out_tokens: torch.Tensor,  # output, (max num tokens, token dim)
+        weights: torch.Tensor,  # topk weight
+        expert_meta: torch.Tensor,  # input
+        expert_y: torch.Tensor,  # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
+        expert_num_tokens: torch.Tensor,
+    ):  # input
         device = out_tokens.device
         cfg = self.cfg
 
@@ -133,16 +164,16 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 1.1 token that will send back
         y_map = [[] for _ in range(self.world_size)]
         # 1.2 meta info of each token that send back to its src rank
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
 
         # 2. traverse each token of each local expert of each rank, fill into send_counts and y_map and meta_map
         for local_eid in range(self.num_local_experts):
             cnt = int(expert_num_tokens[local_eid].item())
             for j in range(cnt):
                 # meta info token j of local eid
-                meta = expert_meta[local_eid, j] 
-                dst_rank = int(meta[1].item())  
-                send_counts[dst_rank] += 1 
+                meta = expert_meta[local_eid, j]
+                dst_rank = int(meta[1].item())
+                send_counts[dst_rank] += 1
                 # token j and its meta that send back to dst rank/local eid
                 y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
                 meta_map[dst_rank].extend(meta.tolist())
@@ -150,7 +181,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # token nums that will recv from other ranks
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
-        # call all2all to send send counts and recv recv_counts_t at each rank by all2all        
+        # call all2all to send send counts and recv recv_counts_t at each rank by all2all
         dist.all_to_all_single(recv_counts_t, send_counts_t)
         # 3.send buffers of each rank, that is, the tokens at its experts
         y_map_tensors = []
@@ -158,30 +189,34 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             if sub_list:
                 y_map_tensors.append(torch.cat(sub_list, dim=0))
             else:
-                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+                y_map_tensors.append(
+                    torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device)
+                )
         send_buf = torch.cat(y_map_tensors, dim=0)
         # 4. flatten send meta by tokens
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
         # 5. total recv tokens of cur rank
         total_recv = int(recv_counts_t.sum().item())
         # 6. recv buffer of cur rank
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.out_dtype, device=device)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.out_dtype, device=device
+        )
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # 7. call all2all to send and recv for each rank
         dist.all_to_all_single(
-            recv_buf, send_buf,
-            output_split_sizes=recv_counts_t.tolist(), 
-            input_split_sizes=send_counts_t.tolist()
+            recv_buf,
+            send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist(),
         )
         # 8. call all2all to send meta and recv meta for each rank
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
@@ -191,41 +226,21 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 10. write back tokens from recv buf, per meta info, and do weighted sum
         for i in range(total_recv):
             src_token = int(recv_meta[i, 2].item())
-            src_k = int(recv_meta[i, 3].item()) 
+            src_k = int(recv_meta[i, 3].item())
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
             out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 
-# ---------------- multi processing test ----------------
-def _worker(rank, world_size, rank_data, cfg: MoEConfig):
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = "12356"
-    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
-    torch.cuda.set_device(rank)
-
-    num_dp = world_size
-    dp_rank = rank
-
-    ata = PyTorchAllToAll(cfg, rank, world_size)
-    # ---------- dispatch ----------
-    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
-    # ---------- simulated moe computation ----------
-    expert_y = expert_x.to(cfg.out_dtype) * 2
-    # ---------- combine ----------
-    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
-                   dtype=cfg.out_dtype, device="cuda")
-    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
-
-    dist.destroy_process_group()
-    return y[: rank_data.num_tokens].cpu()
 
-def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
-    gen = torch.Generator(device='cuda')
+def generate_input(
+    num_experts, experts_per_token, hidden_dim, max_num_tokens, seed, rank, world_size
+):
+    device = torch.device(f"cuda:{rank}")
+    gen = torch.Generator(device=device)
     gen.manual_seed(seed)
-    world_size = 8
-    
+
     cfg = MoEConfig(
         num_experts=num_experts,
         experts_per_token=experts_per_token,
@@ -234,33 +249,28 @@ def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, s
         in_dtype=torch.float16,
         out_dtype=torch.float16,
     )
-    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  
-    return cfg, all_rank_data
+    rank_data = RankTestData(cfg, gen, rank)
+    return cfg, rank_data, rank, world_size
+
 
 def ref_kernel(data: input_t) -> output_t:
-    cfg, all_rank_data = data
-    world_size = 8
-    
-    mp.set_start_method("spawn", force=True)
-    pool = mp.Pool(processes=world_size)
-    rets = []
-    for i in range(world_size):
-        rets.append(
-            pool.apply_async(  
-                  _worker, 
-                  args=(i,
-                        world_size, 
-                        all_rank_data[i], 
-                        cfg),
-            )
-        )
-    pool.close()
-    pool.join()
-    rets = [el.get() for el in rets]
-    ret_out = torch.cat(rets, dim=0)
-    return ret_out
+    cfg, rank_data, rank, world_size = data
+
+    ata = PyTorchAllToAll(cfg, rank, world_size)
+
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    y = torch.zeros(
+        cfg.max_num_tokens,
+        cfg.hidden_dim,
+        dtype=cfg.out_dtype,
+        device=rank_data.x.device,
+    )
+
+    ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=5e-3) 
+    return y[: rank_data.num_tokens]  # first num_tokens rows; remaining rows are unused zeros
 
- 
 
+def check_implementation(data: input_t, output: output_t):
+    return True, ""
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 342388b4..385bf289 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -1,43 +1,15 @@
-import os
+#!POPCORN leaderboard all2all-dev
+
 import torch
 import torch.distributed as dist
-import torch.multiprocessing as mp
-import dataclasses
 from task import input_t, output_t
-from utils import make_match_reference
-
-# ---------------- MoE config ----------------
-@dataclasses.dataclass
-class MoEConfig:
-    num_experts: int
-    experts_per_token: int
-    hidden_dim: int
-    max_num_tokens: int
-    in_dtype: torch.dtype = torch.float16
-    out_dtype: torch.dtype = torch.float16
 
-# ---------------- data per dp rank ----------------
-class RankTestData:
-    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
-        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
-        # token expert map
-        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
-        for i in range(self.num_tokens):
-            perm = torch.randperm(cfg.num_experts, generator=rng)
-            self.indices[i] = perm[: cfg.experts_per_token]
-        # topk weights 
-        self.weights = torch.rand(
-            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
-        )
-        # dp tokens, input of dispatch
-        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
 
 # ---------------- All2All pytorch impl ----------------
 class PyTorchAllToAll:
-    
     META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
-    
-    def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
+
+    def __init__(self, cfg, rank: int, world_size: int):
         self.cfg = cfg
         self.rank = rank
         self.world_size = world_size
@@ -57,14 +29,16 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # 1.2 token id to send to each rank
         token_map = [[] for _ in range(self.world_size)]
         # 1.3 token meta data, need update for combine
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
         for t, expert_list in enumerate(indices.tolist()):
             for k, e in enumerate(expert_list):
                 dst_rank = e // self.num_local_experts
                 send_counts[dst_rank] += 1
                 token_map[dst_rank].append(t)
-                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # srcGobalExpert, srcRank, srcIndex, expert index
-                
+                meta_map[dst_rank].extend(
+                    [e, self.rank, t, k, 0]
+                )  # srcGobalExpert, srcRank, srcIndex, expert index
+
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # 1.3 token nums to recv from each rank
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
@@ -72,43 +46,52 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # ---------2. send and recv buffer, order by tokens on each rank ----------
         send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
         total_recv = int(recv_counts_t.sum().item())
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.in_dtype, device=device)
-        
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.in_dtype, device=device
+        )
+
         # 2.1 meta buf for send and recv
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)        
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # ---------3. dispatch send_buf to recv_buf by recv and send counts--------------
         dist.all_to_all_single(
-            recv_buf, send_buf,
+            recv_buf,
+            send_buf,
             output_split_sizes=recv_counts_t.tolist(),
-            input_split_sizes=send_counts_t.tolist()
+            input_split_sizes=send_counts_t.tolist(),
         )
-        
+
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
         recv_meta = recv_meta.view(-1, self.META_DIM)
         # ---------4. define output tensor of dispatch ------------
         # 4.1 num tokens per expert
-        expert_num_tokens = torch.zeros(self.num_local_experts,
-                                        dtype=torch.int32, device=device)
+        expert_num_tokens = torch.zeros(
+            self.num_local_experts, dtype=torch.int32, device=device
+        )
         # 4.2 token tensor on each expert
-        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
-                               dtype=cfg.in_dtype, device=device)
-        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
-                                  dtype=torch.int32, device=device)
+        expert_x = torch.empty(
+            (self.num_local_experts, self.max_recv, cfg.hidden_dim),
+            dtype=cfg.in_dtype,
+            device=device,
+        )
+        expert_meta = torch.empty(
+            (self.num_local_experts, self.max_recv, self.META_DIM),
+            dtype=torch.int32,
+            device=device,
+        )
         # ---------5. dispatch send_meta to recv_meta by recv and send counts------
         # ---------6. write tokens to each expert on each rank ------
         # 6.1 fetch the local expert id of corresponding token i
-        for i in range(total_recv): 
+        for i in range(total_recv):
             global_eid = int(recv_meta[i, 0].item())
             local_eid = global_eid % self.num_local_experts
             # output, store token buf and token meta and token nums of each expert
@@ -116,14 +99,17 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
             expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
             expert_num_tokens[local_eid] += 1
         # 6.2 after dispatch, token nums and token and meta of token on expert
-        return expert_num_tokens, expert_x, expert_meta 
-    
+        return expert_num_tokens, expert_x, expert_meta
+
     # ---------- combine ----------
-    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
-                weights: torch.Tensor, # topk weight
-                expert_meta: torch.Tensor, # input
-                expert_y: torch.Tensor, # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
-                expert_num_tokens: torch.Tensor): # input
+    def combine(
+        self,
+        out_tokens: torch.Tensor,  # output, (max num tokens, token dim)
+        weights: torch.Tensor,  # topk weight
+        expert_meta: torch.Tensor,  # input
+        expert_y: torch.Tensor,  # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
+        expert_num_tokens: torch.Tensor,
+    ):  # input
         device = out_tokens.device
         cfg = self.cfg
 
@@ -132,16 +118,16 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 1.1 token that will send back
         y_map = [[] for _ in range(self.world_size)]
         # 1.2 meta info of each token that send back to its src rank
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
 
         # 2. traverse each token of each local expert of each rank, fill into send_counts and y_map and meta_map
         for local_eid in range(self.num_local_experts):
             cnt = int(expert_num_tokens[local_eid].item())
             for j in range(cnt):
                 # meta info token j of local eid
-                meta = expert_meta[local_eid, j] 
-                dst_rank = int(meta[1].item())  
-                send_counts[dst_rank] += 1 
+                meta = expert_meta[local_eid, j]
+                dst_rank = int(meta[1].item())
+                send_counts[dst_rank] += 1
                 # token j and its meta that send back to dst rank/local eid
                 y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
                 meta_map[dst_rank].extend(meta.tolist())
@@ -149,7 +135,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # token nums that will recv from other ranks
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
-        # call all2all to send send counts and recv recv_counts_t at each rank by all2all        
+        # call all2all to send send counts and recv recv_counts_t at each rank by all2all
         dist.all_to_all_single(recv_counts_t, send_counts_t)
         # 3.send buffers of each rank, that is, the tokens at its experts
         y_map_tensors = []
@@ -157,30 +143,34 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             if sub_list:
                 y_map_tensors.append(torch.cat(sub_list, dim=0))
             else:
-                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+                y_map_tensors.append(
+                    torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device)
+                )
         send_buf = torch.cat(y_map_tensors, dim=0)
         # 4. flatten send meta by tokens
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
         # 5. total recv tokens of cur rank
         total_recv = int(recv_counts_t.sum().item())
         # 6. recv buffer of cur rank
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.out_dtype, device=device)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.out_dtype, device=device
+        )
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # 7. call all2all to send and recv for each rank
         dist.all_to_all_single(
-            recv_buf, send_buf,
-            output_split_sizes=recv_counts_t.tolist(), 
-            input_split_sizes=send_counts_t.tolist()
+            recv_buf,
+            send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist(),
         )
         # 8. call all2all to send meta and recv meta for each rank
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
@@ -190,71 +180,29 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 10. write back tokens from recv buf, per meta info, and do weighted sum
         for i in range(total_recv):
             src_token = int(recv_meta[i, 2].item())
-            src_k = int(recv_meta[i, 3].item()) 
+            src_k = int(recv_meta[i, 3].item())
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
             out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 
-# ---------------- multi processing test ----------------
-def _worker(rank, world_size, rank_data, cfg: MoEConfig):
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = "12356"
-    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
-    torch.cuda.set_device(rank)
 
-    num_dp = world_size
-    dp_rank = rank
+def custom_kernel(data: input_t) -> output_t:
+    cfg, rank_data, rank, world_size = data
+    torch.cuda.set_device(rank)
 
     ata = PyTorchAllToAll(cfg, rank, world_size)
-    # ---------- dispatch ----------
-    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
-    # ---------- simulated moe computation ----------
-    expert_y = expert_x.to(cfg.out_dtype) * 2
-    # ---------- combine ----------
-    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
-                   dtype=cfg.out_dtype, device="cuda")
-    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
-    # ----------- copied to CPU by shared mem--------    
-    dist.destroy_process_group()
-    return y[: rank_data.num_tokens].cpu()
 
-def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
-    gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed)
-    world_size = 8
-    
-    cfg = MoEConfig(
-        num_experts=num_experts,
-        experts_per_token=experts_per_token,
-        hidden_dim=hidden_dim,
-        max_num_tokens=max_num_tokens,
-        in_dtype=torch.float16,
-        out_dtype=torch.float16,
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    y = torch.zeros(
+        cfg.max_num_tokens,
+        cfg.hidden_dim,
+        dtype=cfg.out_dtype,
+        device=rank_data.x.device,
     )
-    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  
-    return cfg, all_rank_data
 
-def custom_kernel(data: input_t) -> output_t:
-    cfg, all_rank_data = data
-    world_size = 8
-    
-    mp.set_start_method("spawn", force=True)
-    pool = mp.Pool(processes=world_size)
-    rets = []
-    for i in range(world_size):
-        rets.append(
-            pool.apply_async(  
-                  _worker, 
-                  args=(i,
-                        world_size, 
-                        all_rank_data[i], 
-                        cfg),
-            )
-        )
-    pool.close()
-    pool.join()
-    rets = [el.get() for el in rets]
-    ret_out = torch.cat(rets, dim=0)
-    return ret_out
+    ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
+
+    return y[: rank_data.num_tokens]  # first num_tokens rows; remaining rows are unused zeros
diff --git a/problems/amd/all2all/task.py b/problems/amd/all2all/task.py
index aaf2de6f..ee2016a0 100644
--- a/problems/amd/all2all/task.py
+++ b/problems/amd/all2all/task.py
@@ -1,12 +1,17 @@
 import torch
-from typing import TypeVar, TypedDict
+from typing import TypeVar, TypedDict, TYPE_CHECKING
 
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+if TYPE_CHECKING:
+    from reference import MoEConfig, RankTestData
+
+
+input_t = TypeVar("input_t", bound=tuple["MoEConfig", "RankTestData", int, int])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     num_experts: int
     experts_per_token: int
     hidden_dim: int
     max_num_tokens: int
-    seed: int
\ No newline at end of file
+    seed: int
diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index a34e982b..a0af424b 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -8,6 +8,10 @@ files:
   - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
+multi_gpu: true
+
+config:
+  main: "eval.py"
 
 description: |
   
@@ -42,24 +46,24 @@ description: |
 
 
 tests:
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635}
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236}
-  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234}
-  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542}
-  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347}
-  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635, "world_size": 8}
+  # - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236, "world_size": 8}
+  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234, "world_size": 8}
+  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534, "world_size": 8}
+  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897, "world_size": 8}
+  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4, "world_size": 8}
 
 
 benchmarks:
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 16, "seed": 6635}
-  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 32, "seed": 1234}
-  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 128, "seed": 51}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 175}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 4}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 16, "seed": 6635, "world_size": 8}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 32, "seed": 1234, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 128, "seed": 51, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 175, "world_size": 8}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 4, "world_size": 8}
 
 
 ranking_by: "geom"

From b1b14b3e3d7879d196afcea952901e252b0bc774 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Thu, 28 Aug 2025 00:15:52 +0200
Subject: [PATCH 099/132] fix pmpp_v2; use [bool, str] return consistently

---
 problems/pmpp_v2/eval.py                   | 15 ++-------------
 problems/pmpp_v2/histogram_py/reference.py |  4 ++--
 problems/pmpp_v2/utils.py                  |  6 +++---
 3 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
index ac3a6325..56222a1e 100644
--- a/problems/pmpp_v2/eval.py
+++ b/problems/pmpp_v2/eval.py
@@ -138,17 +138,6 @@ def _clone_data(data):
         return data
 
 
-def wrap_check_implementation(data, submission_output):
-    # Old version returned just a single string, new version
-    # returns (bool, str); this function ensures compatibility with old
-    # problem definitions.
-    result = check_implementation(data, submission_output)
-    if isinstance(result, tuple):
-        return result
-    else:
-        return not bool(result), result
-
-
 def _run_single_test(test: TestCase):
     """
     Runs a single test case. Do not call directly
@@ -158,7 +147,7 @@ def _run_single_test(test: TestCase):
     torch.cuda.synchronize()
     submission_output = custom_kernel(_clone_data(data))
     torch.cuda.synchronize()
-    return wrap_check_implementation(data, submission_output)
+    return check_implementation(data, submission_output)
 
 
 def run_single_test(pool: multiprocessing.Pool, test: TestCase):
@@ -210,7 +199,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     check_copy = _clone_data(data)
     #  first, one obligatory correctness check
     output = custom_kernel(data)
-    good, message = wrap_check_implementation(check_copy, output)
+    good, message = check_implementation(check_copy, output)
     if not good:
         return message
 
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
index fc573f48..4268cec7 100644
--- a/problems/pmpp_v2/histogram_py/reference.py
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -50,7 +50,7 @@ def check_implementation(data, output):
     reasons = verbose_allequal(output, expected)
 
     if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
 
-    return ''
+    return True, ''
 
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index ee6349d1..b5c8a7a7 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -126,7 +126,7 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
     return []
 
 
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> Tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
@@ -134,9 +134,9 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
 
     if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
 
-    return ''
+    return True, ''
 
 
 def make_match_reference(reference: callable, **kwargs):

From 7a281f541d0111035588f5e940683e73de64c485 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Thu, 28 Aug 2025 21:34:48 +0200
Subject: [PATCH 100/132] Fix: typo

---
 problems/amd/all2all/reference.py  | 2 +-
 problems/amd/all2all/submission.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index d5ba04f4..001fed19 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -269,7 +269,7 @@ def ref_kernel(data: input_t) -> output_t:
 
     ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-    return y[:, rank_data.num_tokens]
+    return y[:rank_data.num_tokens]
 
 
 def check_implementation(data: input_t, output: output_t):
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 385bf289..4b8ec666 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -205,4 +205,4 @@ def custom_kernel(data: input_t) -> output_t:
 
     ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-    return y[:, rank_data.num_tokens]
+    return y[: rank_data.num_tokens]

From 81e2f45ef1bae28d9ad43108ec6cb35179d04a31 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Thu, 28 Aug 2025 22:11:19 +0200
Subject: [PATCH 101/132] Feat: reenable tests, add basic check_implementation

---
 problems/amd/all2all/reference.py |  7 +++++--
 problems/amd/all2all/task.yml     | 19 +++++++++----------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index 001fed19..eda1ed16 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -269,8 +269,11 @@ def ref_kernel(data: input_t) -> output_t:
 
     ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-    return y[:rank_data.num_tokens]
+    return y[: rank_data.num_tokens]
 
 
 def check_implementation(data: input_t, output: output_t):
-    return True, ""
+    expected = ref_kernel(data)  # recompute the reference result for the same inputs
+    if output.device != expected.device:
+        return False, f"Output device mismatch: {output.device} != {expected.device}"
+    return (True, "") if torch.allclose(output, expected) else (False, f"Output mismatch: {output} != {expected}")
diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index a0af424b..a9c5b863 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -46,16 +46,15 @@ description: |
 
 
 tests:
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635, "world_size": 8}
-  # - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236, "world_size": 8}
-  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234, "world_size": 8}
-  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534, "world_size": 8}
-  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897, "world_size": 8}
-  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4, "world_size": 8}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236, "world_size": 8}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234, "world_size": 8}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534, "world_size": 8}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897, "world_size": 8}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4, "world_size": 8}
 
 
 benchmarks:

From fb53db154f400701bdc1ff8684b99cfb5005e6cd Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Fri, 29 Aug 2025 21:02:13 +0200
Subject: [PATCH 102/132] fix import

---
 problems/pmpp_v2/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index b5c8a7a7..7b387d7c 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -2,6 +2,7 @@
 import random
 import numpy as np
 import torch
+from typing import Tuple
 
 
 def set_seed(seed=42):

From 1a86955792bfc967efcdfed95e74e530d169204d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 30 Aug 2025 20:26:58 +0200
Subject: [PATCH 103/132] Fix: replace Tuple with tuple

---
 problems/pmpp_v2/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index 7b387d7c..7ef4d79c 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -2,7 +2,6 @@
 import random
 import numpy as np
 import torch
-from typing import Tuple
 
 
 def set_seed(seed=42):
@@ -127,7 +126,7 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
     return []
 
 
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> Tuple[bool, str]:
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """

From e5db6640e9588896ef3230e95e79b6cf67af86b0 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 13:23:45 +0300
Subject: [PATCH 104/132] Feat: move to separate leaderboard

---
 problems/amd_distributed.yaml                 |  12 +
 .../all2all/reference.py                      |   0
 .../all2all/submission.py                     |   0
 .../{amd => amd_distributed}/all2all/task.py  |   0
 .../{amd => amd_distributed}/all2all/task.yml |   0
 problems/amd_distributed/eval.py              | 578 ++++++++++++++++++
 6 files changed, 590 insertions(+)
 create mode 100644 problems/amd_distributed.yaml
 rename problems/{amd => amd_distributed}/all2all/reference.py (100%)
 rename problems/{amd => amd_distributed}/all2all/submission.py (100%)
 rename problems/{amd => amd_distributed}/all2all/task.py (100%)
 rename problems/{amd => amd_distributed}/all2all/task.yml (100%)
 create mode 100644 problems/amd_distributed/eval.py

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
new file mode 100644
index 00000000..dece3169
--- /dev/null
+++ b/problems/amd_distributed.yaml
@@ -0,0 +1,12 @@
+name: AMD Developer Challenge 2025 - Distributed Edition
+# when does this end (individual problems might close earlier)
+deadline: "2025-10-14"
+# A description for this particular competition
+description: "AMD Developer Challenge 2025: Distributed Edition"
+# the list of problems
+problems:
+  - directory: amd_distributed/all2all
+    name: all2all
+    deadline: "2025-10-14"
+    gpus:
+      - MI300x8
diff --git a/problems/amd/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
similarity index 100%
rename from problems/amd/all2all/reference.py
rename to problems/amd_distributed/all2all/reference.py
diff --git a/problems/amd/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
similarity index 100%
rename from problems/amd/all2all/submission.py
rename to problems/amd_distributed/all2all/submission.py
diff --git a/problems/amd/all2all/task.py b/problems/amd_distributed/all2all/task.py
similarity index 100%
rename from problems/amd/all2all/task.py
rename to problems/amd_distributed/all2all/task.py
diff --git a/problems/amd/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
similarity index 100%
rename from problems/amd/all2all/task.yml
rename to problems/amd_distributed/all2all/task.yml
diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
new file mode 100644
index 00000000..c3d20f90
--- /dev/null
+++ b/problems/amd_distributed/eval.py
@@ -0,0 +1,578 @@
+import base64
+import copy
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:  # line-oriented "key: value" writer on a caller-supplied file descriptor
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')  # wrap the raw descriptor in a text-mode file object
+        os.set_inheritable(fd, False)  # mark the descriptor as non-inheritable
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()  # returns None, so exceptions from the with-body propagate
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)  # flushed after every call
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")  # one "key: value" record per call
+
+
+@dataclasses.dataclass
+class TestCase:  # one parsed line of the test-spec file
+    args: dict  # parsed "key: value" pairs; integer-looking values are stored as int
+    spec: str  # the original, unparsed line
+
+
+def _combine(a: int, b: int) -> int:
+    # Fold the public per-test seed `a` and the global secret seed `b` into a
+    # single integer seed.
+    # Test-level seeds are public and usually small, so on their own they must
+    # not reveal anything useful about the combined seed.
+    # A Cantor-style pairing is used: when the secret seed is large, the
+    # resulting seed is large as well.
+    pair_sum = a + b
+    return int(a + pair_sum * (pair_sum + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:  # exits with status 113 on any error
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)  # sys.exit rather than the site-provided `exit` builtin
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z_]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"  # one `key: value` pair
+    for line in lines:
+        parts = line.split(";")  # pairs within a line are separated by ';'
+        case = {}
+        for part in parts:
+            matched = re.fullmatch(match, part)  # the whole part must be exactly one pair
+            if not matched:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)  # numeric values become ints ...
+            except ValueError:
+                pass  # ... anything else stays a string
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:  # mix the secret seed into every test that declares a public seed
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:  # summary of one benchmark's timing samples, as built by calculate_stats
+    runs: int  # number of samples
+    mean: float  # arithmetic mean of the samples
+    std: float  # sample standard deviation (runs - 1 in the denominator)
+    err: float  # standard error of the mean: std / sqrt(runs)
+    best: float  # smallest sample
+    worst: float  # largest sample
+
+
+def calculate_stats(durations: list[float]):
+    """
+    Calculate statistical data from a non-empty list of durations.
+
+    @param durations: A list of durations in nanoseconds (ints or floats); must not be empty.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    squared_dev = sum((x - avg) ** 2 for x in durations)
+    # the (runs - 1) estimator is undefined for one sample; report 0.0 instead of dividing by zero
+    std = math.sqrt(squared_dev / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data, rank: int):
+    """
+    Rebuilds nested tuples/lists/dicts, cloning each tensor onto `cuda:{rank}`; other values pass through.
+    """
+    if isinstance(data, torch.Tensor):
+        target = f"cuda:{rank}"
+        return data.clone().to(target)
+    if isinstance(data, dict):
+        return {key: _clone_data(value, rank) for key, value in data.items()}
+    if isinstance(data, list):
+        return [_clone_data(item, rank) for item in data]
+    if isinstance(data, tuple):
+        return tuple(_clone_data(item, rank) for item in data)
+    # anything else (ints, strings, None, ...) is handed back unchanged
+    return data
+
+
+def wrap_check_implementation(data, submission_output):
+    # Compatibility shim: older problem definitions return a single error
+    # string from check_implementation (an empty string meaning success),
+    # newer ones return a (bool, str) pair.
+    # Callers always get the pair form back.
+    outcome = check_implementation(data, submission_output)
+    if not isinstance(outcome, tuple):
+        return not bool(outcome), outcome
+    return outcome
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs one single-GPU test case and returns its (good, message) check result. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data, 0))  # the submission only sees a copy on cuda:0
+    torch.cuda.synchronize()
+    return wrap_check_implementation(data, submission_output)  # checked against the original inputs
+
+
+def _run_distributed_test(test: TestCase, rank: int):
+    """
+    Runs one test case as rank `rank` of an NCCL process group. Do not call directly
+    """
+    from submission import custom_kernel
+    import torch.distributed as dist
+    world_size = test.args["world_size"]
+    os.environ["MASTER_ADDR"] = "127.0.0.1"  # rendezvous over loopback on a fixed port
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+    try:
+        data = generate_input(**test.args, rank=rank)
+        torch.cuda.synchronize()
+        submission_output = custom_kernel(_clone_data(data, rank))  # the submission gets a copy on cuda:{rank}
+        torch.cuda.synchronize()
+        return wrap_check_implementation(data, submission_output)
+    finally:
+        dist.destroy_process_group()  # tear the group down even if the kernel or check raised
+
+
+def run_multi_gpu_test(pool: multiprocessing.Pool, test: TestCase, world_size: int):
+    """
+    Runs one test on `world_size` pool tasks (one per rank) and merges their (good, message) results.
+    """
+    rets = []
+    # world_size is a mandatory argument for multi-gpu tests
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(
+                _run_distributed_test,
+                args=(test, i),
+            )
+        )
+    # 60 seconds should be more than enough, we want tests to be fast
+    rets = [el.get(60) for el in rets]  # each get() waits up to 60 s for that rank's result
+
+    correct = all(ret[0] for ret in rets)  # pass only if every rank passed
+    error_messages = str.join("\n", [f"rank {rank} - {ret[1]}" for rank, ret in enumerate(rets) if not ret[0]])
+    return correct, error_messages
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in the worker pool; uses the multi-GPU path when the test declares `world_size`.
+    """
+    world_size = test.args.get("world_size", None)
+    if world_size is None:
+        return pool.apply(_run_single_test, (test,))  # blocks until the worker returns
+    else:
+        return run_multi_gpu_test(pool, test, world_size)
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+    @param logger: A PopcornOutput object used for logging test results.
+    @param pool: Worker pool in which each test is executed.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False  # keep going so that every test gets reported
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:  # a passing check may still carry a message
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one single-GPU benchmark; returns Stats, or the error message of a failed check. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data, 0)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs (each output is re-validated only if `recheck` is set);
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the measured time exceeds `max_time_ns`, 2 minutes of
+    # wallclock time have passed, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data, 0)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            # go through the wrapper, like the initial check, so old-style string results still work
+            good, message = wrap_check_implementation(check_copy, output)
+            if not good:
+                return message
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
+                break
+
+    return calculate_stats(durations)
+
+
+def _run_distributed_benchmark(test: TestCase, rank: int, recheck: bool, max_repeats: int,
+                               max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark as rank `rank` of an NCCL group; returns Stats or an error message. Do not call directly.
+    """
+    from submission import custom_kernel
+    import torch.distributed as dist
+
+    world_size = test.args["world_size"]
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+
+    try:
+        durations = []
+        # generate input data once
+        data = generate_input(**test.args, rank=rank)
+        check_copy = _clone_data(data, rank)
+        # first, one obligatory correctness check; all-reduced so all ranks leave together on failure
+        output = custom_kernel(_clone_data(data, rank))
+        good, message = wrap_check_implementation(check_copy, output)
+        has_error = torch.tensor(0 if good else 1, dtype=torch.int32, device=f'cuda:{rank}')
+        dist.all_reduce(has_error)
+        if has_error.item() > 0:
+            return message if not good else "correctness check failed on another rank"
+        # now, do multiple timing runs with proper distributed synchronization
+        bm_start_time = time.perf_counter_ns()
+        for i in range(max_repeats):
+            error_message = None
+            if recheck:
+                # ensure we use a different seed for every benchmark
+                if "seed" in test.args:
+                    test.args["seed"] += 13
+
+                data = generate_input(**test.args, rank=rank)
+                check_copy = _clone_data(data, rank)
+
+            # Synchronize all ranks before timing
+            clear_l2_cache()
+            torch.cuda.synchronize()
+            dist.barrier()
+
+            # Use distributed timing - only rank 0 records the overall time
+            if rank == 0:
+                start_time = time.perf_counter_ns()
+
+            # All ranks execute the kernel
+            output = custom_kernel(_clone_data(data, rank))
+
+            # Synchronize all ranks after kernel execution
+            torch.cuda.synchronize()
+            dist.barrier()
+
+            if rank == 0:
+                end_time = time.perf_counter_ns()
+                duration = end_time - start_time  # Already in nanoseconds
+                durations.append(duration)
+
+            if recheck:
+                good, message = wrap_check_implementation(check_copy, output)
+                if not good:
+                    error_message = message
+
+            del output
+
+            has_error = torch.tensor(1 if error_message is not None else 0, dtype=torch.int32, device=f'cuda:{rank}')
+            dist.all_reduce(has_error)  # every rank needs the verdict; reduce() delivers it to rank 0 only
+            if has_error.item() > 0:
+                return error_message if error_message is not None else "correctness check failed on another rank"
+
+            # Only rank 0 checks convergence criteria
+            if rank == 0 and i > 1:
+                total_bm_duration = time.perf_counter_ns() - bm_start_time
+                stats = calculate_stats(durations)
+                # stop if either
+                # a) relative error dips below 0.1%
+                # b) we exceed the total time limit for benchmarking the kernel
+                # c) we exceed 2 minutes of total wallclock time.
+                should_stop = (stats.err / stats.mean < 0.001 or
+                               stats.mean * stats.runs > max_time_ns or
+                               total_bm_duration > 120e9)
+            else:
+                should_stop = False
+
+            # Broadcast stop decision to all ranks
+            stop_tensor = torch.tensor(should_stop, dtype=torch.bool, device=f'cuda:{rank}')
+            dist.broadcast(stop_tensor, 0)
+
+            if stop_tensor.item():
+                break
+
+        # Only rank 0 returns meaningful stats
+        if rank == 0:
+            return calculate_stats(durations)
+        else:
+            # Non-zero ranks return a dummy stats object
+            return Stats(runs=len(durations), mean=0.0, std=0.0, err=0.0, best=0.0, worst=0.0)
+
+    finally:
+        dist.destroy_process_group()
+
+
+def run_multi_gpu_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                            max_time_ns: float, world_size: int):
+    """
+    Runs a multi-GPU benchmark, one pool task per rank; returns rank 0's Stats or the joined error messages.
+    """
+    rets = []
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(
+                _run_distributed_benchmark,
+                args=(test, i, recheck, max_repeats, max_time_ns),
+            )
+        )
+
+    # 120 seconds for benchmarking + we run a pre-benchmark test and want to leave some slack
+    rets = [el.get(timeout=180) for el in rets]  # each get() waits up to 180 s for that rank
+
+    # For multi-GPU benchmarking, only rank 0 has meaningful stats
+    failed_ranks = []
+    rank_0_result = None
+
+    for rank, ret in enumerate(rets):
+        if isinstance(ret, Stats):
+            if rank == 0:
+                rank_0_result = ret  # Stats from other ranks are ignored
+        else:
+            # ret is an error message
+            failed_ranks.append((rank, ret))
+
+    if failed_ranks:  # any non-Stats result fails the whole benchmark
+        error_messages = str.join("\n", [f"rank {rank} - {msg}" for rank, msg in failed_ranks])
+        return error_messages
+    else:
+        return rank_0_result if rank_0_result else "No stats returned from rank 0"
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Worker pool in which the benchmark will be launched.
+    @param test: TestCase object; a `world_size` entry in its args selects the multi-GPU path.
+    @param recheck: Flag for whether to regenerate inputs and re-validate the output on every timed run.
+    @param max_repeats: Upper bound on the number of timed runs.
+    @param max_time_ns: Budget for the accumulated measured time, in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+
+    world_size: Optional[int] = test.args.get("world_size", None)
+    if world_size is None:
+        return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+    else:
+        return run_multi_gpu_benchmark(pool, test, recheck, max_repeats, max_time_ns, world_size)
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Worker pool in which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up on the first test with a small time budget; the result is discarded
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)  # no per-run recheck in this mode
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):  # one log record per Stats field
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False  # anything that is not Stats is an error message
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles one run of the submission with torch.profiler and returns the summary table as text
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data, 0))  # output is not checked here
+        torch.cuda.synchronize()  # synchronize inside the profiled region
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):  # profiles every test; always returns 0
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)  # called directly in this process, not through a pool
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")  # logged unconditionally
+    return 0
+
+
+def main():  # entry point: argv[1] is the mode, argv[2] the test-spec file; returns the exit status
+    fd = os.getenv("POPCORN_FD")  # descriptor number that the result log is written to
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")  # presumably so that spawned workers cannot read the secret seed
+    n_gpus = int(os.getenv("POPCORN_GPUS", "1"))
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(n_gpus) as pool:  # pool size is taken from POPCORN_GPUS
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)  # recheck enabled
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                        break  # stop at the first failing benchmark
+
+                logger.log("check", "pass" if passed else "fail")  # NOTE(review): no status is returned here, unlike test/benchmark
+            elif mode == "profile":
+                run_profiling(logger, tests)  # NOTE(review): return value is dropped, main returns None
+            else:
+                # invalid mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 9230c02e9c0372b1d49d0930d91510c60bac3be4 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 21:32:52 +0300
Subject: [PATCH 105/132] Fix: add utils

---
 problems/amd_distributed/utils.py | 143 ++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 problems/amd_distributed/utils.py

diff --git a/problems/amd_distributed/utils.py b/problems/amd_distributed/utils.py
new file mode 100644
index 00000000..73551022
--- /dev/null
+++ b/problems/amd_distributed/utils.py
@@ -0,0 +1,143 @@
+import random
+from typing import Tuple
+
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> Tuple[bool, list[str]]:
+    """
+    Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return False, ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
+    """
+    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         (True, []) if the tensors are equal, otherwise (False, [mismatch details])
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return good, "\\n".join(reasons)
+
+    return good, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped

From 0e549208105edfc2ab54a6c5fbdd021c272c7040 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 21:40:56 +0300
Subject: [PATCH 106/132] Fix: add timezone

---
 problems/amd_distributed.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index dece3169..4f9afbc9 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -1,12 +1,11 @@
 name: AMD Developer Challenge 2025 - Distributed Edition
 # when does this end (individual problems might close earlier)
-deadline: "2025-10-14"
+deadline: "2025-10-14T23:59:59Z"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Distributed Edition"
 # the list of problems
 problems:
   - directory: amd_distributed/all2all
-    name: all2all
-    deadline: "2025-10-14"
+    deadline: "2025-10-14T23:59:59Z"
     gpus:
       - MI300x8

From ea23b83d041fcecf0ab0ffa7c908c974866f689d Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 31 Aug 2025 21:57:14 +0300
Subject: [PATCH 107/132] Fix: add amd-all2all name

---
 problems/amd_distributed.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index 4f9afbc9..edc4c568 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -6,6 +6,7 @@ description: "AMD Developer Challenge 2025: Distributed Edition"
 # the list of problems
 problems:
   - directory: amd_distributed/all2all
+    name: amd-all2all
     deadline: "2025-10-14T23:59:59Z"
     gpus:
       - MI300x8

From 8856396cd3b626a32d49768bf5c816f729de1f6e Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 31 Aug 2025 21:58:02 +0300
Subject: [PATCH 108/132] Change deadline format in amd_distributed.yaml

Updated deadline format for the competition and problems.
---
 problems/amd_distributed.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index edc4c568..86fed95a 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -1,12 +1,12 @@
 name: AMD Developer Challenge 2025 - Distributed Edition
 # when does this end (individual problems might close earlier)
-deadline: "2025-10-14T23:59:59Z"
+deadline: "2025-10-14"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Distributed Edition"
 # the list of problems
 problems:
   - directory: amd_distributed/all2all
     name: amd-all2all
-    deadline: "2025-10-14T23:59:59Z"
+    deadline: "2025-10-14"
     gpus:
       - MI300x8

From 6d188a6380b688548136cb6c6c803ff22aa02b93 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 22:04:21 +0300
Subject: [PATCH 109/132] Fix: remove kernelbot directive

---
 problems/amd_distributed/all2all/submission.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/problems/amd_distributed/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
index 4b8ec666..d5506406 100644
--- a/problems/amd_distributed/all2all/submission.py
+++ b/problems/amd_distributed/all2all/submission.py
@@ -1,5 +1,3 @@
-#!POPCORN leaderboard all2all-dev
-
 import torch
 import torch.distributed as dist
 from task import input_t, output_t

From 171e74be3b42a0bf1aedac98332eaf0ee1647a06 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 31 Aug 2025 12:05:16 -0700
Subject: [PATCH 110/132] Add AMD $100K distributed kernel competition to
 README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 89413b2c..f362097e 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ You can see what's going on [gpumode.com](https://www.gpumode.com/)
 1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
 2. [AMD $100K kernel competition](problems/amd)
 3. [BioML kernels](problems/bioml)
+4. [AMD $100K distributed kernel competition](problems/amd_distributed)
 
 ## Making a Leaderboard Submission
 

From 53547fc2aaf46490155f4b752aa62482f4dd22e9 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 22:05:36 +0300
Subject: [PATCH 111/132] Fix: add correct utils

---
 problems/amd_distributed/utils.py | 55 ++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/problems/amd_distributed/utils.py b/problems/amd_distributed/utils.py
index 73551022..396c6bfe 100644
--- a/problems/amd_distributed/utils.py
+++ b/problems/amd_distributed/utils.py
@@ -29,11 +29,7 @@ def get_device(use_cuda: bool = True) -> torch.device:
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
 @torch.no_grad()
 def verbose_allclose(
-        received: torch.Tensor,
-        expected: torch.Tensor,
-        rtol=1e-05,
-        atol=1e-08,
-        max_print=5
+    received: torch.Tensor, expected: torch.Tensor, rtol=1e-05, atol=1e-08, max_print=5
 ) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
@@ -62,9 +58,13 @@ def verbose_allclose(
     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    posinf_mismatched = torch.logical_xor(
+        torch.isposinf(received), torch.isposinf(expected)
+    )
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+    neginf_mismatched = torch.logical_xor(
+        torch.isneginf(received), torch.isneginf(expected)
+    )
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -85,14 +85,18 @@ def verbose_allclose(
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return False, mismatch_details
 
     return True, [f"Maximum error: {torch.max(diff)}"]
 
 
 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
+def verbose_allequal(
+    received: torch.Tensor, expected: torch.Tensor, max_print: int = 5
+) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
 
@@ -118,7 +122,9 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return False, mismatch_details
 
     return True, []
@@ -134,10 +140,37 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     if len(reasons) > 0:
         return good, "\\n".join(reasons)
 
-    return good, ''
+    return good, ""
 
 
 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
+
     return wrapped
+
+
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # fill it so every element is actually written
+    dummy.fill_(42)
+    del dummy

From fab1acbc584be539d40d0c1bd14f2a5d5ee7231a Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 1 Sep 2025 02:18:10 +0000
Subject: [PATCH 112/132] fix max_recv

---
 problems/amd_distributed/all2all/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
index eda1ed16..8bfd0e74 100644
--- a/problems/amd_distributed/all2all/reference.py
+++ b/problems/amd_distributed/all2all/reference.py
@@ -62,7 +62,7 @@ def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
         # num experts per rank
         self.num_local_experts = cfg.num_experts // world_size
         # max recv tokens per rank
-        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token
+        self.max_recv = cfg.max_num_tokens * world_size
 
     # ---------- dispatch ----------
     def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):

From 5a2a062c7d3e65139cd6f9e44d666064f73dee20 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 1 Sep 2025 03:18:31 +0000
Subject: [PATCH 113/132] fix max_recv in submission file

---
 problems/amd_distributed/all2all/submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
index d5506406..a036f7a8 100644
--- a/problems/amd_distributed/all2all/submission.py
+++ b/problems/amd_distributed/all2all/submission.py
@@ -14,7 +14,7 @@ def __init__(self, cfg, rank: int, world_size: int):
         # num experts per rank
         self.num_local_experts = cfg.num_experts // world_size
         # max recv tokens per rank
-        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token
+        self.max_recv = cfg.max_num_tokens * world_size
 
     # ---------- dispatch ----------
     def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):

From fc547c6a487c5c279ee197060e5d58b7c1ce0670 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 1 Sep 2025 22:29:21 +0300
Subject: [PATCH 114/132] Fix: all2all timeouts + check_implementation

---
 problems/amd_distributed/all2all/reference.py | 6 +++++-
 problems/amd_distributed/all2all/task.yml     | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
index 8bfd0e74..26c4ac7f 100644
--- a/problems/amd_distributed/all2all/reference.py
+++ b/problems/amd_distributed/all2all/reference.py
@@ -276,4 +276,8 @@ def check_implementation(data: input_t, output: output_t):
     expected = ref_kernel(data)
     if output.device != expected.device:
         return False, f"Output device mismatch: {output.device} != {expected.device}"
-    return torch.allclose(output, expected), f"Output mismatch: {output} != {expected}"
+    res = torch.allclose(output, expected, rtol=1e-2, atol=5e-3)
+    if not res:
+        return False, f"Output values mismatch, {output} != {expected}"
+
+    return True, ""
diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index a9c5b863..688f62d2 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -66,3 +66,4 @@ benchmarks:
 
 
 ranking_by: "geom"
+ranked_timeout: 420

From dfced0bc7bfe8631df4f77f1293023a7986aca5a Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Tue, 2 Sep 2025 06:21:35 -0500
Subject: [PATCH 115/132] add reference impl of gemm-reducescatter

---
 problems/amd_distributed/gemm-rs/reference.py | 63 +++++++++++++++++++
 .../amd_distributed/gemm-rs/submission.py     | 33 ++++++++++
 problems/amd_distributed/gemm-rs/task.py      | 13 ++++
 problems/amd_distributed/gemm-rs/task.yml     | 61 ++++++++++++++++++
 4 files changed, 170 insertions(+)
 create mode 100644 problems/amd_distributed/gemm-rs/reference.py
 create mode 100644 problems/amd_distributed/gemm-rs/submission.py
 create mode 100644 problems/amd_distributed/gemm-rs/task.py
 create mode 100644 problems/amd_distributed/gemm-rs/task.yml

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
new file mode 100644
index 00000000..5ff5904d
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -0,0 +1,63 @@
+from utils import make_match_reference
+from task import input_t, output_t
+import torch
+
+
+def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int) -> input_t:
+    """
+    Generate random input and weights for the AG-GEMM operation.
+
+    Returns:
+        Tuple of (
+            input: torch.Tensor,  # [M, local_K]
+            weight: torch.Tensor,  # [N, local_K]
+            transposed_weight: bool,  # Whether the weight is transposed
+            bias: Optional[torch.Tensor],  # [N] or None
+        )
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed + RANK)
+
+    assert m % world_size == 0, "m must be divisible by world_size"
+    assert k % world_size == 0, "k must be divisible by world_size"
+    local_k = k // world_size
+
+    # Generate random inputs and weights
+    input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+
+    return (input, weight, False, None)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference kernel for Gemm-ReduceScatter operation.
+
+    Args:
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
+                bias: Optional[torch.Tensor])
+            - input: Local input tensor of shape [M, local_K].
+            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
+            - transposed_weight: Whether the weight is transposed.
+            - bias: Optional bias tensor of shape [N] or None.
+    Returns:
+        Tuple containing:
+            - output: Resulting tensor of shape [M // world_size, N].
+    """
+    input, weight, transposed_weight, bias = data
+    M, local_K = input.shape
+    if not transposed_weight:
+        weight = weight.T
+    N = weight.shape[1]
+    world_size = torch.distributed.get_world_size()
+    # matmul
+    output = torch.matmul(input, weight)
+    if bias is not None:
+        output = output + bias
+    # reduce scatter
+    rs_output = torch.empty((M // world_size, N), dtype=output.dtype, device=input.device)
+    torch.distributed.reduce_scatter_tensor(rs_output, output)
+    return rs_output
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
diff --git a/problems/amd_distributed/gemm-rs/submission.py b/problems/amd_distributed/gemm-rs/submission.py
new file mode 100644
index 00000000..dce77b4a
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/submission.py
@@ -0,0 +1,33 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference kernel for Gemm-ReduceScatter operation.
+
+    Args:
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
+                bias: Optional[torch.Tensor])
+            - input: Local input tensor of shape [M, local_K].
+            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
+            - transposed_weight: Whether the weight is transposed.
+            - bias: Optional bias tensor of shape [N] or None.
+    Returns:
+        Tuple containing:
+            - output: Resulting tensor of shape [M // world_size, N].
+    """
+    input, weight, transposed_weight, bias = data
+    M, local_K = input.shape
+    if not transposed_weight:
+        weight = weight.T
+    N = weight.shape[1]
+    world_size = torch.distributed.get_world_size()
+    # matmul
+    output = torch.matmul(input, weight)
+    if bias is not None:
+        output = output + bias
+    # reduce scatter
+    rs_output = torch.empty((M // world_size, N), dtype=output.dtype, device=input.device)
+    torch.distributed.reduce_scatter_tensor(rs_output, output)
+    return rs_output
diff --git a/problems/amd_distributed/gemm-rs/task.py b/problems/amd_distributed/gemm-rs/task.py
new file mode 100644
index 00000000..9be0c81b
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/task.py
@@ -0,0 +1,13 @@
+from typing import TypedDict, TypeVar, Tuple, Dict
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, Dict[str, torch.Tensor], Dict])
+output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+
+
+class TestSpec(TypedDict):
+    world_size: int
+    m: int
+    n: int
+    k: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
new file mode 100644
index 00000000..c8b6bb8b
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -0,0 +1,61 @@
+# name: gemm-rs
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a Gemm-ReduceScatter kernel for efficient transformer models
+  on a single MI300X device.
+
+  ReduceScatter-Gemm (RS-Gemm) is a technique that combines the ReduceScatter
+  communication pattern with General Matrix Multiplication (GEMM) to optimize
+  the performance of transformer models on GPUs. It is particularly useful for
+  handling large models that exceed the memory capacity of a single GPU by
+  distributing the model across multiple GPUs and efficiently scattering the
+  results of matrix multiplications.
+
+  Your task:
+  - Implement the Gemm-RS kernel to perform matrix multiplications in a
+    distributed manner, leveraging the ReduceScatter operation to distribute
+    data across multiple GPUs.
+  - Ensure that the implementation is optimized for the MI300X architecture,
+    taking advantage of its specific hardware features for maximum performance.
+
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, weights: torch.Tensor, transposed_weight: bool,
+            bias: Optional, None or torch.Tensor, TP_GROUP: group object)
+    - input: Local input tensor of shape [M, local_K].
+    - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
+    - transposed_weight: Whether the weight is transposed.
+    - bias: bias tensor of shape [N] or None.
+    - TP_GROUP: Process group for tensor parallelism
+
+  Output:
+  - Tuple containing:
+    - output: Resulting tensor of shape [M // world_size, N]
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+ranking_by: "geom"
+
+tests:
+  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "seed": 42}
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "seed": 6635}
+  - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "seed": 4422}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "seed": 1536}
+
+
+benchmarks:
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "seed": 7168}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "seed": 1024}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "seed": 2035}

From eff169759596326890b23d4625cb6d5923266e55 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 3 Sep 2025 02:02:32 +0000
Subject: [PATCH 116/132] [enhance] change seed and moe const different for
 each rank for debug and check

---
 problems/amd_distributed/all2all/reference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd_distributed/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
index 26c4ac7f..fbeb0685 100644
--- a/problems/amd_distributed/all2all/reference.py
+++ b/problems/amd_distributed/all2all/reference.py
@@ -239,7 +239,7 @@ def generate_input(
 ):
     device = torch.device(f"cuda:{rank}")
     gen = torch.Generator(device=device)
-    gen.manual_seed(seed)
+    gen.manual_seed(seed + rank)
 
     cfg = MoEConfig(
         num_experts=num_experts,
@@ -259,7 +259,7 @@ def ref_kernel(data: input_t) -> output_t:
     ata = PyTorchAllToAll(cfg, rank, world_size)
 
     expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
-    expert_y = expert_x.to(cfg.out_dtype) * 2
+    expert_y = expert_x.to(cfg.out_dtype) * (1 + rank)
     y = torch.zeros(
         cfg.max_num_tokens,
         cfg.hidden_dim,

From ecd9b59facd61b4cac5c07acc1003a572aad2ee5 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 3 Sep 2025 03:03:24 +0000
Subject: [PATCH 117/132] apply same change to submission.py

---
 problems/amd_distributed/all2all/submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
index a036f7a8..5eddcf68 100644
--- a/problems/amd_distributed/all2all/submission.py
+++ b/problems/amd_distributed/all2all/submission.py
@@ -193,7 +193,7 @@ def custom_kernel(data: input_t) -> output_t:
     ata = PyTorchAllToAll(cfg, rank, world_size)
 
     expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
-    expert_y = expert_x.to(cfg.out_dtype) * 2
+    expert_y = expert_x.to(cfg.out_dtype) * (1 + rank)
     y = torch.zeros(
         cfg.max_num_tokens,
         cfg.hidden_dim,

From 35eb4e12cc4990200091f9675cbf786b6536fd2d Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 00:55:04 -0500
Subject: [PATCH 118/132] typo fix

---
 problems/amd_distributed/gemm-rs/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 5ff5904d..52f4a3af 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -5,7 +5,7 @@
 
 def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int) -> input_t:
     """
-    Generate random input and weights for the AG-GEMM operation.
+    Generate random input and weights for the Gemm-ReduceScatter operation.
 
     Returns:
         Tuple of (

From 8e35bf1e9d1140380e41d48e53f854c95455a404 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 01:55:56 -0500
Subject: [PATCH 119/132] remove transposed_weight bool variable

---
 problems/amd_distributed/gemm-rs/reference.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 52f4a3af..5a121563 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -11,7 +11,6 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int
         Tuple of (
             input: torch.Tensor,  # [M, local_K]
             weight: torch.Tensor,  # [N, local_K]
-            transposed_weight: bool,  # Whether the weight is transposed
             bias: Optional[torch.Tensor],  # [N] or None
         )
     """
@@ -26,7 +25,7 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int
     input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
     weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
-    return (input, weight, False, None)
+    return (input, weight, None)
 
 
 def ref_kernel(data: input_t) -> output_t:
@@ -34,24 +33,20 @@ def ref_kernel(data: input_t) -> output_t:
     Reference kernel for Gemm-ReduceScatter operation.
 
     Args:
-        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
-                bias: Optional[torch.Tensor])
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor])
             - input: Local input tensor of shape [M, local_K].
-            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
-            - transposed_weight: Whether the weight is transposed.
+            - weight: Weight tensor of shape [N, local_K].
             - bias: Optional bias tensor of shape [N] or None.
     Returns:
         Tuple containing:
             - output: Resulting tensor of shape [M // world_size, N].
     """
-    input, weight, transposed_weight, bias = data
+    input, weight, bias = data
     M, local_K = input.shape
-    if not transposed_weight:
-        weight = weight.T
-    N = weight.shape[1]
+    N = weight.shape[0]
     world_size = torch.distributed.get_world_size()
     # matmul
-    output = torch.matmul(input, weight)
+    output = torch.matmul(input, weight.T)
     if bias is not None:
         output = output + bias
     # reduce scatter

From 11303bea37bcf061564a9a37e24e75bb66a96d00 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 02:10:22 -0500
Subject: [PATCH 120/132] update custom_kernel and yaml doc

---
 problems/amd_distributed/gemm-rs/submission.py | 14 +++++---------
 problems/amd_distributed/gemm-rs/task.yml      | 13 +++++--------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/submission.py b/problems/amd_distributed/gemm-rs/submission.py
index dce77b4a..4212d5ad 100644
--- a/problems/amd_distributed/gemm-rs/submission.py
+++ b/problems/amd_distributed/gemm-rs/submission.py
@@ -7,24 +7,20 @@ def custom_kernel(data: input_t) -> output_t:
     Reference kernel for Gemm-ReduceScatter operation.
 
     Args:
-        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
-                bias: Optional[torch.Tensor])
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor])
             - input: Local input tensor of shape [M, local_K].
-            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
-            - transposed_weight: Whether the weight is transposed.
+            - weight: Weight tensor of shape [N, local_K].
             - bias: Optional bias tensor of shape [N] or None.
     Returns:
         Tuple containing:
             - output: Resulting tensor of shape [M // world_size, N].
     """
-    input, weight, transposed_weight, bias = data
+    input, weight, bias = data
     M, local_K = input.shape
-    if not transposed_weight:
-        weight = weight.T
-    N = weight.shape[1]
+    N = weight.shape[0]
     world_size = torch.distributed.get_world_size()
     # matmul
-    output = torch.matmul(input, weight)
+    output = torch.matmul(input, weight.T)
     if bias is not None:
         output = output + bias
     # reduce scatter
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index c8b6bb8b..4d36fdc8 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -10,10 +10,9 @@ files:
 lang: "py"
 
 description: |
-  Implement a Gemm-ReduceScatter kernel for efficient transformer models
-  on a single MI300X device.
+  Implement a Gemm-ReduceScatter kernel on a single MI300X node.
 
-  ReduceScatter-Gemm (RS-Gemm) is a technique that combines the ReduceScatter
+  Gemm-ReduceScatter is a technique that combines the ReduceScatter
   communication pattern with General Matrix Multiplication (GEMM) to optimize
   the performance of transformer models on GPUs. It is particularly useful for
   handling large models that exceed the memory capacity of a single GPU by
@@ -28,13 +27,11 @@ description: |
     taking advantage of its specific hardware features for maximum performance.
 
   Input:
-  - `data`: Tuple of (input: torch.Tensor, weights: torch.Tensor, transposed_weight: bool,
-            bias: Optional, None or torch.Tensor, TP_GROUP: group object)
+  - `data`: Tuple of (input: torch.Tensor, weight: torch.Tensor,
+            bias: Optional[torch.Tensor])
     - input: Local input tensor of shape [M, local_K].
-    - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
-    - transposed_weight: Whether the weight is transposed.
+    - weight: Weight tensor of shape [N, local_K].
     - bias: bias tensor of shape [N] or None.
-    - TP_GROUP: Process group for tensor parallelism
 
   Output:
   - Tuple containing:

From 38225f614dbf2efd9619c70d9055784c5469a0fe Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 05:34:36 -0500
Subject: [PATCH 121/132] add has_bias in test spec

---
 problems/amd_distributed/gemm-rs/reference.py |  9 +++++++--
 problems/amd_distributed/gemm-rs/task.py      |  1 +
 problems/amd_distributed/gemm-rs/task.yml     | 14 +++++++-------
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 5a121563..03e4e5d6 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -3,7 +3,7 @@
 import torch
 
 
-def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int) -> input_t:
+def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
     """
     Generate random input and weights for the Gemm-ReduceScatter operation.
 
@@ -25,7 +25,12 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int
     input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
     weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
-    return (input, weight, None)
+    bias = None
+    if has_bias:
+        gen.manual_seed(seed)
+        bias = (torch.rand((n,), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+
+    return (input, weight, bias)
 
 
 def ref_kernel(data: input_t) -> output_t:
diff --git a/problems/amd_distributed/gemm-rs/task.py b/problems/amd_distributed/gemm-rs/task.py
index 9be0c81b..1245626c 100644
--- a/problems/amd_distributed/gemm-rs/task.py
+++ b/problems/amd_distributed/gemm-rs/task.py
@@ -10,4 +10,5 @@ class TestSpec(TypedDict):
     m: int
     n: int
     k: int
+    has_bias: bool
     seed: int
\ No newline at end of file
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 4d36fdc8..5988132a 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -46,13 +46,13 @@ templates:
 ranking_by: "geom"
 
 tests:
-  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "seed": 42}
-  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "seed": 6635}
-  - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "seed": 4422}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "seed": 1536}
+  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "has_bias": True, "seed": 42}
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "has_bias": False, "seed": 6635}
+  - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "has_bias": True, "seed": 4422}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "has_bias": False, "seed": 1536}
 
 
 benchmarks:
-  - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "seed": 7168}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "seed": 1024}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "seed": 2035}
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "has_bias": True, "seed": 7168}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "has_bias": False, "seed": 1024}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "has_bias": True, "seed": 2035}

From ed0535726bb49a9a5a3442c13df6b5758d9763bb Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 8 Sep 2025 18:18:23 -0700
Subject: [PATCH 122/132] clarify rules

---
 problems/amd_distributed/all2all/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index 688f62d2..f35046f8 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -15,7 +15,7 @@ config:
 
 description: |
   
-  You will implement a custom single node all2all kernel optimized for 8xMI300.
+  You are expected to implement dispatch and simulated moe and combine kernels with intra node communication, which jointly made a custom single node all2all kernel optimized for 8xMI300
   You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
 
   To be explicit, you will be given data of all ranks, naming all_rank_data.

From 54ff613410b5f6374accb811dfc7144d16c7d497 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 8 Sep 2025 18:26:35 -0700
Subject: [PATCH 123/132] clarify rules

---
 problems/amd_distributed/all2all/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index f35046f8..2e0f68f7 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -15,7 +15,7 @@ config:
 
 description: |
   
-  You are expected to implement dispatch and simulated moe and combine kernels with intra node communication, which jointly made a custom single node all2all kernel optimized for 8xMI300
+  You are expected to implement dispatch and simulated moe and combine kernels with intra node communication, referring to reference.py, which jointly make a custom single node all2all kernel optimized for 8xMI300.
   You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
 
   To be explicit, you will be given data of all ranks, naming all_rank_data.

From 4c5405d6c67fc62b372f10a5172b6de79fde2646 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 9 Sep 2025 17:15:28 -0700
Subject: [PATCH 124/132] Update deadline for amd-identity problem

---
 problems/amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index ad721d6c..d522b6bd 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -7,7 +7,7 @@ description: "AMD Developer Challenge 2025: Inference Sprint"
 problems:
   - directory: amd/identity
     name: amd-identity
-    deadline: "2025-09-02"
+    deadline: "2025-12-30"
     gpus:
       - MI300
   - directory: amd/fp8-mm

From 5afcc47169d44e3d49024622d353c969cd3d27a5 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Tue, 9 Sep 2025 21:31:17 -0500
Subject: [PATCH 125/132] update test and benchmark shapes

---
 problems/amd_distributed/gemm-rs/task.yml | 33 ++++++++++++++++++++---
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 5988132a..360ff1e7 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -37,6 +37,21 @@ description: |
   - Tuple containing:
     - output: Resulting tensor of shape [M // world_size, N]
 
+  The ranking criterion is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light
+  analysis and AMD implementations; the solution closest to the speed of light
+  and AMD implementations will be awarded the grand prize.
+  ```
+  The speed of light analysis is:
+   m      n      k      has_bias      time[us]
+   64     7168   18432  False         6.46
+   512    4096   12288  True          8.19
+   2048   2880   2880   True          23.04
+   4096   4096   4096   False         65.54
+   8192   4096   14336  True          131.07
+   8192   8192   29568  False         379.43
+  ```
 config:
   main: "eval.py"
 
@@ -46,13 +61,23 @@ templates:
 ranking_by: "geom"
 
 tests:
-  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "has_bias": True, "seed": 42}
-  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "has_bias": False, "seed": 6635}
+  - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed": 2035}
+  - {"world_size": 8, "m": 64, "n": 3584, "k": 14336, "has_bias": True, "seed": 13}
+  - {"world_size": 8, "m": 512, "n": 3584, "k": 14336, "has_bias": True, "seed": 4297}
+  - {"world_size": 8, "m": 512, "n": 4608, "k": 36864, "has_bias": False, "seed": 1597}
+  - {"world_size": 8, "m": 2048, "n": 4096, "k": 7168, "has_bias": False, "seed": 716}
+  - {"world_size": 8, "m": 2048, "n": 8192, "k": 30720, "has_bias": False, "seed": 20201}
+  - {"world_size": 8, "m": 4096, "n": 2880, "k": 2880, "has_bias": True, "seed": 136}
+  - {"world_size": 8, "m": 4096, "n": 8192, "k": 2048, "has_bias": True, "seed": 138}
+  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "has_bias": True, "seed": 748}
   - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "has_bias": True, "seed": 4422}
   - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "has_bias": False, "seed": 1536}
 
 
 benchmarks:
+  - {"world_size": 8, "m": 64, "n": 7168, "k": 18432, "has_bias": False, "seed": 1234}
+  - {"world_size": 8, "m": 512, "n": 4096, "k": 12288, "has_bias": True, "seed": 663}
+  - {"world_size": 8, "m": 2048, "n": 2880, "k": 2880, "has_bias": True, "seed": 166}
+  - {"world_size": 8, "m": 4096, "n": 4096, "k": 4096, "has_bias": False, "seed": 1371}
   - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "has_bias": True, "seed": 7168}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "has_bias": False, "seed": 1024}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "has_bias": True, "seed": 2035}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "has_bias": False, "seed": 42}

From 10015ea46d41d0ffd55056905566f13645e8063f Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Wed, 10 Sep 2025 21:09:37 -0500
Subject: [PATCH 126/132] change dtype

---
 problems/amd_distributed/gemm-rs/reference.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 03e4e5d6..6cb363d4 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -22,13 +22,13 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias:
     local_k = k // world_size
 
     # Generate random inputs and weights
-    input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
-    weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
     bias = None
     if has_bias:
         gen.manual_seed(seed)
-        bias = (torch.rand((n,), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+        bias = (torch.rand((n,), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
     return (input, weight, bias)
 

From 5f7edb501c809a7300569f2cba96b2f442c3d42b Mon Sep 17 00:00:00 2001
From: achalpandeyy <pandeyachal.ap@gmail.com>
Date: Thu, 11 Sep 2025 07:02:18 +0100
Subject: [PATCH 127/132] Use events for pmpp_v2 benchmarking

---
 problems/pmpp_v2/eval.py  | 13 +++++++++----
 problems/pmpp_v2/utils.py |  9 +++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
index 56222a1e..981b9322 100644
--- a/problems/pmpp_v2/eval.py
+++ b/problems/pmpp_v2/eval.py
@@ -11,7 +11,7 @@
 
 import torch.cuda
 
-from utils import set_seed
+from utils import set_seed, clear_l2_cache
 try:
     from task import TestSpec
 except ImportError:
@@ -218,10 +218,15 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
             data = generate_input(**test.args)
             check_copy = _clone_data(data)
         torch.cuda.synchronize()
-        start = time.perf_counter_ns()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
         output = custom_kernel(data)
+        end_event.record()
         torch.cuda.synchronize()
-        end = time.perf_counter_ns()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
 
         if recheck:
             good, message = check_implementation(check_copy, output)
@@ -229,7 +234,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
                 return message
 
         del output
-        durations.append(end - start)
+        durations.append(duration)
 
         if i > 1:
             total_bm_duration = time.perf_counter_ns() - bm_start_time
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index 7ef4d79c..e8a9082f 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -165,3 +165,12 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.deterministic = self.deterministic
         torch.use_deterministic_algorithms(False)
         os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write stuff to
+    dummy.fill_(42)
+    del dummy
\ No newline at end of file

From 75bd4abc5f4acab1ee1ec38b8a22adb85b6b5d74 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Fri, 12 Sep 2025 14:24:43 +0200
Subject: [PATCH 128/132] Feat: works

---
 problems/amd_distributed.yaml                 |  5 ++++
 problems/amd_distributed/gemm-rs/reference.py | 23 +++++++++++++------
 problems/amd_distributed/gemm-rs/task.yml     |  1 +
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index 86fed95a..fc9121f7 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -10,3 +10,8 @@ problems:
     deadline: "2025-10-14"
     gpus:
       - MI300x8
+  - directory: amd_distributed/gemm-rs
+    name: amd-gemm-rs
+    deadline: "2025-10-14"
+    gpus:
+      - MI300x8
diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 6cb363d4..dcffa085 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -3,7 +3,7 @@
 import torch
 
 
-def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
+def generate_input(rank: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
     """
     Generate random input and weights for the Gemm-ReduceScatter operation.
 
@@ -14,21 +14,22 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias:
             bias: Optional[torch.Tensor],  # [N] or None
         )
     """
-    gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed + RANK)
+    device = torch.device(f'cuda:{rank}')
+    gen = torch.Generator(device=device)
+    gen.manual_seed(seed + rank)
 
     assert m % world_size == 0, "m must be divisible by world_size"
     assert k % world_size == 0, "k must be divisible by world_size"
     local_k = k // world_size
 
     # Generate random inputs and weights
-    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
-    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     bias = None
     if has_bias:
         gen.manual_seed(seed)
-        bias = (torch.rand((n,), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+        bias = (torch.rand((n,), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     return (input, weight, bias)
 
@@ -60,4 +61,12 @@ def ref_kernel(data: input_t) -> output_t:
     return rs_output
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
+def check_implementation(data: input_t, output: output_t):
+    expected = ref_kernel(data)
+    if output.device != expected.device:
+        return False, f"Output device mismatch: {output.device} != {expected.device}"
+    res = torch.allclose(output, expected, rtol=1e-2, atol=1e-2)
+    if not res:
+        return False, f"Output values mismatch, {output} != {expected}"
+
+    return True, ""
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 360ff1e7..d0501362 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -8,6 +8,7 @@ files:
   - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
+multi_gpu: true
 
 description: |
   Implement a Gemm-ReduceScatter kernel on a single MI300X node.

From 0d0ca8484c27a214d9b8f0c6f8b49253b0520da8 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 14 Sep 2025 12:35:15 +0200
Subject: [PATCH 129/132] Final

---
 problems/amd_distributed/gemm-rs/reference.py | 1 -
 problems/amd_distributed/gemm-rs/task.py      | 8 ++++----
 problems/amd_distributed/gemm-rs/task.yml     | 1 +
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index dcffa085..cb60206b 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -1,4 +1,3 @@
-from utils import make_match_reference
 from task import input_t, output_t
 import torch
 
diff --git a/problems/amd_distributed/gemm-rs/task.py b/problems/amd_distributed/gemm-rs/task.py
index 1245626c..1de3eddd 100644
--- a/problems/amd_distributed/gemm-rs/task.py
+++ b/problems/amd_distributed/gemm-rs/task.py
@@ -1,8 +1,8 @@
-from typing import TypedDict, TypeVar, Tuple, Dict
+from typing import TypedDict, TypeVar, Tuple, Optional
 import torch
 
-input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, Dict[str, torch.Tensor], Dict])
-output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]])
+output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
 class TestSpec(TypedDict):
@@ -11,4 +11,4 @@ class TestSpec(TypedDict):
     n: int
     k: int
     has_bias: bool
-    seed: int
\ No newline at end of file
+    seed: int
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index d0501362..6eac2741 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -60,6 +60,7 @@ templates:
   Python: "submission.py"
 
 ranking_by: "geom"
+ranked_timeout: 360 # just in case
 
 tests:
   - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed": 2035}

From cc79e071a0cf58b7d895385868b40fcb406ab6fb Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@voetter.nl>
Date: Sat, 30 Aug 2025 12:43:12 +0200
Subject: [PATCH 130/132] example: basic distributed profiling

---
 problems/amd_distributed/eval.py | 67 +++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
index c3d20f90..ff4febe6 100644
--- a/problems/amd_distributed/eval.py
+++ b/problems/amd_distributed/eval.py
@@ -499,26 +499,83 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
         return 112
 
 
-def run_single_profile(test: TestCase) -> str:
+def _run_single_profile(test: TestCase) -> str:
     """
     Runs a single test case. Do not call directly
     """
     from submission import custom_kernel
-    from torch.profiler import profile, record_function, ProfilerActivity
+    from torch.profiler import profile, ProfilerActivity
     data = generate_input(**test.args)
     torch.cuda.synchronize()
 
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
         submission_output = custom_kernel(_clone_data(data, 0))
         torch.cuda.synchronize()
+
     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
 
 
-def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+def _run_distributed_profile(test: TestCase, rank: int) -> profile:
+    """
+    Runs a single profiling case. Do not call directly
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, ProfilerActivity
+    import torch.distributed as dist
+    world_size = test.args["world_size"]
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+
+    try:
+        data = generate_input(**test.args, rank=rank)
+        data = _clone_data(data, rank)
+        torch.cuda.synchronize()
+
+        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+            submission_output = custom_kernel(data)
+            torch.cuda.synchronize()
+
+        return prof
+
+    finally:
+        dist.destroy_process_group()
+
+def run_multi_gpu_profile(pool: multiprocessing.Pool, test: TestCase, world_size: int) -> str:
+    """
+    Runs a single test in another process.
+    """
+    rets = []
+    # world_size is a mandatory argument for multi-gpu tests
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(
+                _run_distributed_profile,
+                args=(test, i),
+            )
+        )
+
+    rets = [el.get(120) for el in rets]
+
+    # TODO: Combine distributed profiling results?
+    return rets[0].key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
+    """
+    Runs a single profiling activity in another process.
+    """
+    world_size = test.args.get("world_size", None)
+    if world_size is None:
+        return pool.apply(_run_single_profile, (test,))
+    else:
+        return run_multi_gpu_profile(pool, test, world_size)
+
+
+def run_profiling(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
     logger.log("benchmark-count", len(tests))
     for idx, test in enumerate(tests):
         logger.log(f"benchmark.{idx}.spec", test.spec)
-        report = run_single_profile(test)
+        report = run_single_profile(test, pool)
         logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
     logger.log("check", "pass")
     return 0
@@ -568,7 +625,7 @@ def main():
 
                 logger.log("check", "pass" if passed else "fail")
             elif mode == "profile":
-                run_profiling(logger, tests)
+                run_profiling(logger, pool, tests)
             else:
                 # invalid mode
                 return 2

From 53cd59eb9dcdd2444abe9f93e2750934b968c498 Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@voetter.nl>
Date: Sat, 30 Aug 2025 21:37:04 +0200
Subject: [PATCH 131/132] example: combine distributed profiler activity

---
 problems/amd_distributed/eval.py | 42 +++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
index ff4febe6..c99e73f8 100644
--- a/problems/amd_distributed/eval.py
+++ b/problems/amd_distributed/eval.py
@@ -515,7 +515,7 @@ def _run_single_profile(test: TestCase) -> str:
     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
 
 
-def _run_distributed_profile(test: TestCase, rank: int) -> profile:
+def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
     """
     Runs a single profiling case. Do not call directly
     """
@@ -536,11 +536,46 @@ def _run_distributed_profile(test: TestCase, rank: int) -> profile:
             submission_output = custom_kernel(data)
             torch.cuda.synchronize()
 
-        return prof
+        return prof.events()
 
     finally:
         dist.destroy_process_group()
 
+
+def _combine_traces(traces: list["EventList"]) -> "EventList":
+    """
+    Combine multiple event traces obtained from multiple (distributed) torch.profiler
+    activities. This function simply aggregates the data like `prof.key_averages()`,
+    except over multiple traces. Most of this function is reimplemented
+    from `torch.autograd.profiler_util.EventList.key_averages()`.
+    """
+    from torch.autograd.profiler_util import FunctionEventAvg, EventList
+    from collections import defaultdict
+
+    def get_key(event) -> tuple[str, ...]:
+        return (
+            str(event.key),
+            str(event.node_id),
+            str(event.device_type),
+            str(event.is_legacy),
+            str(event.is_user_annotation),
+        )
+
+    stats: dict[tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
+
+    for events in traces:
+        for event in events:
+            stats[get_key(event)].add(event)
+
+    avg_list = EventList(stats.values())
+    for event in avg_list:
+        event.stack = []
+        event.input_shapes = ""
+        event.overload_name = ""
+
+    return avg_list
+
+
 def run_multi_gpu_profile(pool: multiprocessing.Pool, test: TestCase, world_size: int) -> str:
     """
     Runs a single test in another process.
@@ -556,9 +591,8 @@ def run_multi_gpu_profile(pool: multiprocessing.Pool, test: TestCase, world_size
         )
 
     rets = [el.get(120) for el in rets]
+    return _combine_traces(rets).table(sort_by="self_cuda_time_total", row_limit=20)
 
-    # TODO: Combine distributed profiling results?
-    return rets[0].key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
 
 def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
     """

From e342af471eb1a801d3fd6b85bc7d8e0988a1362a Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@voetter.nl>
Date: Sat, 30 Aug 2025 22:00:17 +0200
Subject: [PATCH 132/132] example: emit nvtx markers

---
 problems/amd_distributed/eval.py | 36 ++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
index c99e73f8..597b5ff4 100644
--- a/problems/amd_distributed/eval.py
+++ b/problems/amd_distributed/eval.py
@@ -11,6 +11,7 @@
 from typing import Any, Optional
 
 import torch.cuda
+from torch.cuda.nvtx import range as nvtx_range
 
 from utils import set_seed, clear_l2_cache
 
@@ -505,13 +506,16 @@ def _run_single_profile(test: TestCase) -> str:
     """
     from submission import custom_kernel
     from torch.profiler import profile, ProfilerActivity
-    data = generate_input(**test.args)
-    torch.cuda.synchronize()
 
-    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-        submission_output = custom_kernel(_clone_data(data, 0))
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
         torch.cuda.synchronize()
 
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        with nvtx_range("custom_kernel"):
+            submission_output = custom_kernel(_clone_data(data, 0))
+            torch.cuda.synchronize()
+
     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
 
 
@@ -522,19 +526,25 @@ def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
     from submission import custom_kernel
     from torch.profiler import profile, ProfilerActivity
     import torch.distributed as dist
-    world_size = test.args["world_size"]
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = "12356"
-    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+
+    with nvtx_range(f"init nccl, rank {rank}"):
+        world_size = test.args["world_size"]
+        os.environ["MASTER_ADDR"] = "127.0.0.1"
+        os.environ["MASTER_PORT"] = "12356"
+        dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
 
     try:
-        data = generate_input(**test.args, rank=rank)
-        data = _clone_data(data, rank)
-        torch.cuda.synchronize()
+        with nvtx_range(f"generate input, rank {rank}"):
+            data = generate_input(**test.args, rank=rank)
+            data = _clone_data(data, rank)
+            torch.cuda.synchronize()
+            dist.barrier()
 
         with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-            submission_output = custom_kernel(data)
-            torch.cuda.synchronize()
+            with nvtx_range(f"custom_kernel, rank {rank}"):
+                submission_output = custom_kernel(data)
+                torch.cuda.synchronize()
+                dist.barrier()
 
         return prof.events()