33 changes: 33 additions & 0 deletions challenges/medium/58_fp16_dot_product/challenge.html
@@ -0,0 +1,33 @@
<p>
Implement a GPU program that computes the dot product of two vectors containing 16-bit floating point numbers (FP16/<code>half</code>).
The dot product is the sum of the products of the corresponding elements of two vectors.
</p>
<p>
Mathematically, the dot product of two vectors \(A\) and \(B\) of length \(n\) is defined as:
\[
A \cdot B = \sum_{i=0}^{n-1} A_i \cdot B_i = A_0 \cdot B_0 + A_1 \cdot B_1 + \ldots + A_{n-1} \cdot B_{n-1}
\]
</p>
<p>
All inputs are stored as 16-bit floating point numbers (FP16/<code>half</code>). For best precision, accumulate the products in FP32 and convert only the final sum to FP16.
</p>
<h2>Implementation Requirements</h2>
<ul>
<li>External libraries are not permitted</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>Products should be accumulated in FP32 for better precision, with only the final sum converted to FP16 (see the sketch below)</li>
<li>The final result must be stored in the output variable as <code>half</code></li>
</ul>
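<p>
For illustration only, the heart of one possible kernel could look like the following sketch; <code>tid</code> and <code>stride</code> stand in for whatever indexing scheme the kernel uses:
</p>
<pre>float sum = 0.0f;                                    // FP32 accumulator
for (int i = tid; i &lt; N; i += stride)
    sum += __half2float(A[i]) * __half2float(B[i]);  // multiply and accumulate in FP32
// ... reduce sum across threads, then, once at the very end:
*result = __float2half(sum);                         // single FP32 -> FP16 conversion</pre>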
<h2>Example 1:</h2>
<pre>Input: A = [1.0, 2.0, 3.0, 4.0]
B = [5.0, 6.0, 7.0, 8.0]
Output: result = 70.0 (1.0*5.0 + 2.0*6.0 + 3.0*7.0 + 4.0*8.0)</pre>
<h2>Example 2:</h2>
<pre>Input: A = [0.5, 1.5, 2.5]
B = [2.0, 3.0, 4.0]
Output: result = 15.5 (0.5*2.0 + 1.5*3.0 + 2.5*4.0)</pre>
<h2>Constraints</h2>
<ul>
<li><code>A</code> and <code>B</code> both have length <code>N</code></li>
<li>1 ≤ <code>N</code> ≤ 100,000,000</li>
</ul>
111 changes: 111 additions & 0 deletions challenges/medium/58_fp16_dot_product/challenge.py
@@ -0,0 +1,111 @@
import ctypes
from typing import Any, List, Dict
import torch
from core.challenge_base import ChallengeBase

class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="FP16 Dot Product",
            atol=5e-2,
            rtol=5e-2,
            num_gpus=1,
            access_tier="free"
        )

    def reference_impl(self, A: torch.Tensor, B: torch.Tensor, result: torch.Tensor, N: int):
        assert A.shape == (N,)
        assert B.shape == (N,)
        assert result.shape == (1,)
        # Use FP32 for accumulation, then convert to FP16
        A_f32 = A.to(torch.float32)
        B_f32 = B.to(torch.float32)
        result_f32 = torch.dot(A_f32, B_f32)
        result[0] = result_f32.to(torch.float16)

    def get_solve_signature(self) -> Dict[str, tuple]:
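        # ctypes has no native FP16 type, so the half buffers are passed as raw 16-bit words (c_uint16)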
        return {
            "A": (ctypes.POINTER(ctypes.c_uint16), "in"),
            "B": (ctypes.POINTER(ctypes.c_uint16), "in"),
            "result": (ctypes.POINTER(ctypes.c_uint16), "out"),
            "N": (ctypes.c_int, "in")
        }

    def generate_example_test(self) -> Dict[str, Any]:
        dtype = torch.float16
        A = torch.tensor([1.0, 2.0, 3.0, 4.0], device="cuda", dtype=dtype)
        B = torch.tensor([5.0, 6.0, 7.0, 8.0], device="cuda", dtype=dtype)
        result = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "A": A,
            "B": B,
            "result": result,
            "N": 4
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        dtype = torch.float16
        tests = []
        # basic_small
        tests.append({
            "A": torch.tensor([1.0, 2.0, 3.0, 4.0], device="cuda", dtype=dtype),
            "B": torch.tensor([5.0, 6.0, 7.0, 8.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 4
        })
        # all_zeros
        tests.append({
            "A": torch.tensor([0.0] * 16, device="cuda", dtype=dtype),
            "B": torch.tensor([0.0] * 16, device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16
        })
        # negative_numbers
        tests.append({
            "A": torch.tensor([-1.0, -2.0, -3.0, -4.0], device="cuda", dtype=dtype),
            "B": torch.tensor([-5.0, -6.0, -7.0, -8.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 4
        })
        # mixed_positive_negative
        tests.append({
            "A": torch.tensor([1.0, -2.0, 3.0, -4.0], device="cuda", dtype=dtype),
            "B": torch.tensor([-1.0, 2.0, -3.0, 4.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 4
        })
        # orthogonal_vectors
        tests.append({
            "A": torch.tensor([1.0, 0.0, 0.0], device="cuda", dtype=dtype),
            "B": torch.tensor([0.0, 1.0, 0.0], device="cuda", dtype=dtype),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 3
        })
        # medium_sized_vector
        tests.append({
            "A": torch.empty(1000, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
            "B": torch.empty(1000, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 1000
        })
        # large_vector
        tests.append({
            "A": torch.empty(10000, device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "B": torch.empty(10000, device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "result": torch.empty(1, device="cuda", dtype=dtype),
            "N": 10000
        })
        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        dtype = torch.float16
        N = 100000000
        A = torch.empty(N, device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
        B = torch.empty(N, device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
        result = torch.zeros(1, device="cuda", dtype=dtype)
        return {
            "A": A,
            "B": B,
            "result": result,
            "N": N
        }
7 changes: 7 additions & 0 deletions challenges/medium/58_fp16_dot_product/starter/starter.cu
@@ -0,0 +1,7 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// A, B, result are device pointers
extern "C" void solve(const half* A, const half* B, half* result, int N) {

}
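For reference, one way this starter might be completed, as a sketch rather than the canonical solution: a grid-stride loop accumulates products in FP32, a shared-memory tree reduction combines per-thread sums within each block, and block partials are merged with atomicAdd into a temporary FP32 accumulator before a single cast to half. The 256-thread blocks and the 1024-block grid cap are illustrative assumptions, not tuned values.

__global__ void dot_fp16_kernel(const half* A, const half* B, float* acc, int N) {
    __shared__ float smem[256];
    float sum = 0.0f;
    // Grid-stride loop: each thread accumulates its products in FP32
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        sum += __half2float(A[i]) * __half2float(B[i]);
    smem[threadIdx.x] = sum;
    __syncthreads();
    // Shared-memory tree reduction within the block
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) atomicAdd(acc, smem[0]);  // merge block partials in FP32
}

__global__ void cast_to_half_kernel(const float* acc, half* result) {
    *result = __float2half(*acc);  // convert once, at the very end
}

extern "C" void solve(const half* A, const half* B, half* result, int N) {
    float* acc;
    cudaMalloc(&acc, sizeof(float));
    cudaMemset(acc, 0, sizeof(float));
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    if (blocks > 1024) blocks = 1024;  // the grid-stride loop covers the remainder
    dot_fp16_kernel<<<blocks, threads>>>(A, B, acc, N);
    cast_to_half_kernel<<<1, 1>>>(acc, result);
    cudaDeviceSynchronize();
    cudaFree(acc);
}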
@@ -0,0 +1,7 @@
import cutlass
import cutlass.cute as cute

# A, B, result are tensors on the GPU
@cute.jit
def solve(A: cute.Tensor, B: cute.Tensor, result: cute.Tensor, N: cute.Int32):
    pass
9 changes: 9 additions & 0 deletions challenges/medium/58_fp16_dot_product/starter/starter.mojo
@@ -0,0 +1,9 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# A, B, result are device pointers
@export
def solve(A: UnsafePointer[Float16], B: UnsafePointer[Float16], result: UnsafePointer[Float16], N: Int32):
    pass
@@ -0,0 +1,5 @@
import torch

# A, B, result are tensors on the GPU
def solve(A: torch.Tensor, B: torch.Tensor, result: torch.Tensor, N: int):
    pass
@@ -0,0 +1,7 @@
import torch
import triton
import triton.language as tl

# A, B, result are tensors on the GPU
def solve(A: torch.Tensor, B: torch.Tensor, result: torch.Tensor, N: int):
    pass