33 changes: 33 additions & 0 deletions challenges/medium/58_fp16_dot_product/challenge.html
@@ -0,0 +1,33 @@
<p>
Implement a GPU program that computes the dot product of two vectors containing 16-bit floating point numbers (FP16/<code>half</code>).
The dot product is the sum of the products of the corresponding elements of two vectors.
</p>
<p>
Mathematically, the dot product of two vectors \(A\) and \(B\) of length \(n\) is defined as:
\[
A \cdot B = \sum_{i=0}^{n-1} A_i \cdot B_i = A_0 \cdot B_0 + A_1 \cdot B_1 + \ldots + A_{n-1} \cdot B_{n-1}
\]
</p>
<p>
All inputs are stored as 16-bit floating point numbers (FP16/<code>half</code>). For best precision, accumulate the products in FP32 and convert only the final sum to FP16.
</p>
<h2>Implementation Requirements</h2>
<ul>
<li>External libraries are not permitted</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>Products should be accumulated in FP32 for better precision, with only the final sum converted to FP16 (see the sketch below)</li>
<li>The final result must be stored in the output variable as <code>half</code></li>
</ul>
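<p>
For illustration only, the heart of one possible kernel could look like the following sketch; <code>tid</code> and <code>stride</code> stand in for whatever indexing scheme the kernel uses:
</p>
<pre>float sum = 0.0f;                                    // FP32 accumulator
for (int i = tid; i &lt; N; i += stride)
    sum += __half2float(A[i]) * __half2float(B[i]);  // multiply and accumulate in FP32
// ... reduce sum across threads, then, once at the very end:
*result = __float2half(sum);                         // single FP32 -> FP16 conversion</pre>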
<h2>Example 1:</h2>
<pre>Input: A = [1.0, 2.0, 3.0, 4.0]
B = [5.0, 6.0, 7.0, 8.0]
Output: result = 70.0 (1.0*5.0 + 2.0*6.0 + 3.0*7.0 + 4.0*8.0)</pre>
<h2>Example 2:</h2>
<pre>Input: A = [0.5, 1.5, 2.5]
B = [2.0, 3.0, 4.0]
Output: result = 15.5 (0.5*2.0 + 1.5*3.0 + 2.5*4.0)</pre>
<h2>Constraints</h2>
<ul>
<li><code>A</code> and <code>B</code> both have length <code>N</code></li>
<li>1 ≤ <code>N</code> ≤ 100,000,000</li>
</ul>
111 changes: 111 additions & 0 deletions challenges/medium/58_fp16_dot_product/challenge.py
@@ -0,0 +1,111 @@
import ctypes
from typing import Any, List, Dict
import torch
from core.challenge_base import ChallengeBase

class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="FP16 Dot Product",
            atol=5e-2,
            rtol=5e-2,
            num_gpus=1,
            access_tier="free"
        )

    def reference_impl(self, A: torch.Tensor, B: torch.Tensor, result: torch.Tensor, N: int):
        assert A.shape == (N,)
        assert B.shape == (N,)
        assert result.shape == (1,)
        # Use FP32 for accumulation, then convert to FP16
        A_f32 = A.to(torch.float32)
        B_f32 = B.to(torch.float32)
        result_f32 = torch.dot(A_f32, B_f32)
        result[0] = result_f32.to(torch.float16)

    def get_solve_signature(self) -> Dict[str, tuple]:
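        # ctypes has no native FP16 type, so the half buffers are passed as raw 16-bit words (c_uint16)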
        return {
            "A": (ctypes.POINTER(ctypes.c_uint16), "in"),
            "B": (ctypes.POINTER(ctypes.c_uint16), "in"),
            "result": (ctypes.POINTER(ctypes.c_uint16), "out"),
            "N": (ctypes.c_int, "in")
        }

    def generate_example_test(self) -> Dict[str, Any]:
        dtype = torch.float16
        A = torch.tensor([1.0, 2.0, 3.0, 4.0], device="cuda", dtype=dtype)
        B = torch.tensor([5.0, 6.0, 7.0, 8.0], device="cuda", dtype=dtype)
        result = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "A": A,
            "B": B,
            "result": result,
            "N": 4
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        dtype = torch.float16
        tests = []
        # basic_small
        tests.append({
            "A": torch.tensor([1.0, 2.0, 3.0, 4.0], device="cuda", dtype=dtype),
            "B": torch.tensor([5.0, 6.0, 7.0, 8.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 4
        })
        # all_zeros
        tests.append({
            "A": torch.tensor([0.0] * 16, device="cuda", dtype=dtype),
            "B": torch.tensor([0.0] * 16, device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16
        })
        # negative_numbers
        tests.append({
            "A": torch.tensor([-1.0, -2.0, -3.0, -4.0], device="cuda", dtype=dtype),
            "B": torch.tensor([-5.0, -6.0, -7.0, -8.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 4
        })
        # mixed_positive_negative
        tests.append({
            "A": torch.tensor([1.0, -2.0, 3.0, -4.0], device="cuda", dtype=dtype),
            "B": torch.tensor([-1.0, 2.0, -3.0, 4.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 4
        })
        # orthogonal_vectors
        tests.append({
            "A": torch.tensor([1.0, 0.0, 0.0], device="cuda", dtype=dtype),
            "B": torch.tensor([0.0, 1.0, 0.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 3
        })
        # medium_sized_vector
        tests.append({
            "A": torch.empty(1000, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
            "B": torch.empty(1000, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 1000
        })
        # large_vector
        tests.append({
            "A": torch.empty(10000, device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "B": torch.empty(10000, device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 10000
        })
        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        dtype = torch.float16
        N = 100000000
        A = torch.empty(N, device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
        B = torch.empty(N, device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
        result = torch.zeros(1, device="cuda", dtype=dtype)
        return {
            "A": A,
            "B": B,
            "result": result,
            "N": N
        }
7 changes: 7 additions & 0 deletions challenges/medium/58_fp16_dot_product/starter/starter.cu
@@ -0,0 +1,7 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// A, B, result are device pointers
extern "C" void solve(const half* A, const half* B, half* result, int N) {

}
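For reference, one way this starter might be completed, as a sketch rather than the canonical solution: a grid-stride loop accumulates products in FP32, a shared-memory tree reduction combines per-thread sums within each block, and block partials are merged with atomicAdd into a temporary FP32 accumulator before a single cast to half. The 256-thread blocks and the 1024-block grid cap are illustrative assumptions, not tuned values.

__global__ void dot_fp16_kernel(const half* A, const half* B, float* acc, int N) {
    __shared__ float smem[256];
    float sum = 0.0f;
    // Grid-stride loop: each thread accumulates its products in FP32
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        sum += __half2float(A[i]) * __half2float(B[i]);
    smem[threadIdx.x] = sum;
    __syncthreads();
    // Shared-memory tree reduction within the block
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) atomicAdd(acc, smem[0]);  // merge block partials in FP32
}

__global__ void cast_to_half_kernel(const float* acc, half* result) {
    *result = __float2half(*acc);  // convert once, at the very end
}

extern "C" void solve(const half* A, const half* B, half* result, int N) {
    float* acc;
    cudaMalloc(&acc, sizeof(float));
    cudaMemset(acc, 0, sizeof(float));
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    if (blocks > 1024) blocks = 1024;  // the grid-stride loop covers the remainder
    dot_fp16_kernel<<<blocks, threads>>>(A, B, acc, N);
    cast_to_half_kernel<<<1, 1>>>(acc, result);
    cudaDeviceSynchronize();
    cudaFree(acc);
}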
@@ -0,0 +1,7 @@
import cutlass
import cutlass.cute as cute

# A, B, result are tensors on the GPU
@cute.jit
def solve(A: cute.Tensor, B: cute.Tensor, result: cute.Tensor, N: cute.Int32):
    pass
9 changes: 9 additions & 0 deletions challenges/medium/58_fp16_dot_product/starter/starter.mojo
@@ -0,0 +1,9 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# A, B, result are device pointers
@export
def solve(A: UnsafePointer[Float16], B: UnsafePointer[Float16], result: UnsafePointer[Float16], N: Int32):
    pass
@@ -0,0 +1,5 @@
import torch

# A, B, result are tensors on the GPU
def solve(A: torch.Tensor, B: torch.Tensor, result: torch.Tensor, N: int):
    pass
@@ -0,0 +1,7 @@
import torch
import triton
import triton.language as tl

# A, B, result are tensors on the GPU
def solve(A: torch.Tensor, B: torch.Tensor, result: torch.Tensor, N: int):
    pass