Opt-in gpu #676

Merged · 4 commits · Feb 24, 2025
src/arraymancer/tensor/data_structure.nim (146 changes: 80 additions & 66 deletions)
@@ -16,73 +16,87 @@ import
  ../laser/dynamic_stack_arrays,
  ../laser/tensor/datatypes,
  nimblas,
  nimcuda/cuda12_5/[cuda_runtime_api, check],
  # Standard library
  std/[complex]

export nimblas.OrderType, complex
export datatypes, dynamic_stack_arrays

type
  CudaTensorRefTrackerObj*[T: SomeFloat] = object
    value*: ptr UncheckedArray[T]

  CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T]

  CudaStorage*[T: SomeFloat] = object
    ## Opaque seq-like structure for storage on the Cuda backend.
    ##
    ## The Nim garbage collector will automatically ask CUDA to free the GPU memory once the data becomes unused.
    ##
    # TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working
    Flen*: int
    Fdata*: ptr UncheckedArray[T]
    Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection.

  CudaTensor*[T: SomeFloat] = object
    ## Tensor data structure stored on Nvidia GPU (Cuda)
    ## - ``shape``: Dimensions of the CudaTensor
    ## - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ## - ``offset``: Offset to get the first item of the CudaTensor. Note: offset can be negative, in particular for slices.
    ## - ``storage``: An opaque data storage for the CudaTensor
    ##
    ## Warning ⚠:
    ## Assignment ``var a = b`` does not copy the data. Data modification on one CudaTensor will be reflected on the other.
    ## However modification on metadata (shape, strides or offset) will not affect the other tensor.
    ## Explicit copies can be made with ``clone``: ``var a = b.clone``
    shape*: Metadata
    strides*: Metadata
    offset*: int
    storage*: CudaStorage[T]

  ClStorage*[T: SomeFloat] = object
    ## Opaque seq-like structure for storage on the OpenCL backend.
    Flen*: int
    Fdata*: ptr UncheckedArray[T]
    Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.

  ClTensor*[T: SomeFloat] = object
    ## Tensor data structure stored on an OpenCL device (CPU, GPU, FPGAs or other accelerators)
    ## - ``shape``: Dimensions of the ClTensor
    ## - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ## - ``offset``: Offset to get the first item of the ClTensor. Note: offset can be negative, in particular for slices.
    ## - ``storage``: An opaque data storage for the ClTensor
    ##
    ## Warning ⚠:
    ## Assignment ``var a = b`` does not copy the data. Data modification on one ClTensor will be reflected on the other.
    ## However modification on metadata (shape, strides or offset) will not affect the other tensor.
    ## Explicit copies can be made with ``clone``: ``var a = b.clone``
    shape*: Metadata
    strides*: Metadata
    offset*: int
    storage*: ClStorage[T]

  AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]


proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}=
  if not p.value.isNil:
    check cudaFree(p.value)
when defined(cuda):
  import nimcuda/cuda12_5/[cuda_runtime_api, check]

  type
    CudaTensorRefTrackerObj*[T: SomeFloat] = object
      value*: ptr UncheckedArray[T]

    CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T]

    CudaStorage*[T: SomeFloat] = object
      ## Opaque seq-like structure for storage on the Cuda backend.
      ##
      ## The Nim garbage collector will automatically ask CUDA to free the GPU memory once the data becomes unused.
      ##
      # TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working
      Flen*: int
      Fdata*: ptr UncheckedArray[T]
      Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection.

    CudaTensor*[T: SomeFloat] = object
      ## Tensor data structure stored on Nvidia GPU (Cuda)
      ## - ``shape``: Dimensions of the CudaTensor
      ## - ``strides``: Numbers of items to skip to get the next item along a dimension.
      ## - ``offset``: Offset to get the first item of the CudaTensor. Note: offset can be negative, in particular for slices.
      ## - ``storage``: An opaque data storage for the CudaTensor
      ##
      ## Warning ⚠:
      ## Assignment ``var a = b`` does not copy the data. Data modification on one CudaTensor will be reflected on the other.
      ## However modification on metadata (shape, strides or offset) will not affect the other tensor.
      ## Explicit copies can be made with ``clone``: ``var a = b.clone``
      shape*: Metadata
      strides*: Metadata
      offset*: int
      storage*: CudaStorage[T]

  proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}=
    if not p.value.isNil:
      check cudaFree(p.value)
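
For context, the `Fref_tracking` field and `deallocCuda` cooperate as a GC finalizer: when the last reference to the tracker dies, the Nim GC runs `deallocCuda`, which frees the device allocation. A minimal sketch of how the pieces would typically be wired together inside the same `when defined(cuda)` block (the `newCudaStorage` constructor shown here is an illustration, not part of this diff):

  proc newCudaStorage[T: SomeFloat](length: int): CudaStorage[T] {.noinit.} =
    ## Illustrative sketch: allocate `length` elements on the device
    ## and register the finalizer on the ref tracker.
    result.Flen = length
    new(result.Fref_tracking, deallocCuda)   # GC will call deallocCuda
    check cudaMalloc(cast[ptr pointer](result.Fdata.addr),
                     csize_t(result.Flen * sizeof(T)))
    result.Fref_tracking.value = result.Fdata  # the finalizer frees this pointer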

when defined(opencl):
  type
    ClStorage*[T: SomeFloat] = object
      ## Opaque seq-like structure for storage on the OpenCL backend.
      Flen*: int
      Fdata*: ptr UncheckedArray[T]
      Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.

    ClTensor*[T: SomeFloat] = object
      ## Tensor data structure stored on an OpenCL device (CPU, GPU, FPGAs or other accelerators)
      ## - ``shape``: Dimensions of the ClTensor
      ## - ``strides``: Numbers of items to skip to get the next item along a dimension.
      ## - ``offset``: Offset to get the first item of the ClTensor. Note: offset can be negative, in particular for slices.
      ## - ``storage``: An opaque data storage for the ClTensor
      ##
      ## Warning ⚠:
      ## Assignment ``var a = b`` does not copy the data. Data modification on one ClTensor will be reflected on the other.
      ## However modification on metadata (shape, strides or offset) will not affect the other tensor.
      ## Explicit copies can be made with ``clone``: ``var a = b.clone``
      shape*: Metadata
      strides*: Metadata
      offset*: int
      storage*: ClStorage[T]

when defined(cuda) and defined(opencl):
  type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]
elif defined(cuda):
  type AnyTensor*[T] = Tensor[T] or CudaTensor[T]
elif defined(opencl):
  type AnyTensor*[T] = Tensor[T] or ClTensor[T]
else:
  type AnyTensor*[T] = Tensor[T]

type GpuTensor[T] = AnyTensor[T] and not Tensor[T]
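
The module-private `GpuTensor` type class subtracts the CPU `Tensor` from `AnyTensor`, giving one name for whichever accelerator backends were actually compiled in. A hedged illustration of how these type classes constrain a generic proc inside this module (`onDevice` is a made-up example, not an Arraymancer API):

proc onDevice[T](t: AnyTensor[T]): bool =
  ## Compile-time dispatch: true when the tensor's data lives on an accelerator.
  when t is GpuTensor:
    true    # CudaTensor and/or ClTensor, depending on -d:cuda / -d:opencl
  else:
    false   # plain CPU Tensor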




# ###############
@@ -102,10 +116,10 @@ proc `data=`*[T](t: var Tensor[T], s: seq[T]) {.deprecated: "Use copyFromRaw ins
# Tensor Metadata
# ################

func rank*[T](t: CudaTensor[T] or ClTensor[T]): range[0 .. LASER_MAXRANK] {.inline.} =
func rank*[T](t: GpuTensor[T]): range[0 .. LASER_MAXRANK] {.inline.} =
  t.shape.len

func size*[T](t: CudaTensor[T] or ClTensor[T]): Natural {.inline.} =
func size*[T](t: GpuTensor[T]): Natural {.inline.} =
  t.shape.product

proc shape_to_strides*(shape: Metadata, layout: OrderType = rowMajor, result: var Metadata) {.noSideEffect.} =

@@ -131,7 +145,7 @@ proc shape_to_strides*(shape: Metadata, layout: OrderType = rowMajor, result: va
    accum *= shape[i]
  return
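
To make the stride computation concrete: for a row-major layout each stride is the product of all trailing dimensions, which is exactly what the `accum` running product builds. A small worked example (illustrative shape):

# shape   = [2, 3, 4]               (row-major)
# strides = [3*4, 4, 1] = [12, 4, 1]
# so element [i, j, k] sits at flat offset i*12 + j*4 + k*1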

func is_C_contiguous*(t: CudaTensor or ClTensor): bool =
func is_C_contiguous*(t: GpuTensor): bool =
  ## Check if the tensor follows C convention / is row major
  var cur_size = 1
  for i in countdown(t.rank - 1,0):
@@ -182,14 +196,14 @@ proc get_offset_ptr*[T: KnownSupportsCopyMem](t: Tensor[T]): ptr T {.noSideEffec
proc get_offset_ptr*[T: not KnownSupportsCopyMem](t: AnyTensor[T]): ptr T {.error: "`get_offset_ptr`" &
  " cannot be safely used for GC'ed types!".}

proc get_data_ptr*[T](t: CudaTensor[T] or ClTensor[T]): ptr T {.noSideEffect, inline.}=
proc get_data_ptr*[T](t: GpuTensor[T]): ptr T {.noSideEffect, inline.}=
  ## Input:
  ## - A tensor
  ## Returns:
  ## - A pointer to the real start of its data (no offset)
  cast[ptr T](t.storage.Fdata)

proc get_offset_ptr*[T](t: CudaTensor[T] or ClTensor[T]): ptr T {.noSideEffect, inline.}=
proc get_offset_ptr*[T](t: GpuTensor[T]): ptr T {.noSideEffect, inline.}=
  ## Input:
  ## - A tensor
  ## Returns:
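
The distinction between the two accessors matters for sliced tensors. A hedged summary of the contract, paraphrasing the docstrings above (the body of `get_offset_ptr` is truncated in this view):

# get_data_ptr(t)    -> address of t.storage.Fdata (ignores t.offset)
# get_offset_ptr(t)  -> presumably the data pointer shifted by t.offset,
#                       i.e. the first visible element; offset can be
#                       negative for slices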
src/arraymancer/tensor/exporting.nim (17 changes: 6 additions & 11 deletions)
@@ -33,17 +33,12 @@ proc toRawSeq*[T](t:Tensor[T]): seq[T] {.noSideEffect, deprecated: "This proc ca
  ## or that you raise your use-case in the issue tracker https://github.com/mratsim/Arraymancer/issues
  ## so that more suitable primitives can be crafted

  # Due to forward declaration this proc must be declared
  # after "cpu" proc are declared in init_cuda
  when t is Tensor:
    result = newSeq[T](t.size)
    for i in 0 ..< t.size:
      when T is KnownSupportsCopyMem:
        result[i] = t.unsafe_raw_offset()[i]
      else:
        result[i] = t.storage.raw_buffer[i]
  elif t is CudaTensor:
    return t.cpu.data
  result = newSeq[T](t.size)
  for i in 0 ..< t.size:
    when T is KnownSupportsCopyMem:
      result[i] = t.unsafe_raw_offset()[i]
    else:
      result[i] = t.storage.raw_buffer[i]
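
Since `toRawSeq` no longer accepts GPU tensors, a caller holding one would copy it back to the host first, e.g. via the existing `cpu` conversion and the `toFlatSeq` proc shown just below (sketch; `myCudaTensor` is a placeholder name):

when defined(cuda):
  let hostData = myCudaTensor.cpu.toFlatSeq()  # device -> host, then export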

proc toFlatSeq*[T](t: Tensor[T]) : seq[T] =
  ## Export the data of the Tensor flattened as a Seq
src/arraymancer/tensor/private/p_checks.nim (5 changes: 4 additions & 1 deletion)
@@ -19,7 +19,10 @@ import ../../laser/private/nested_containers,
when (NimMajor, NimMinor) < (1, 4):
  import ../../std_version_types

include ./p_checks_cuda, ./p_checks_opencl
when defined(cuda):
  include ./p_checks_cuda
when defined(opencl):
  include ./p_checks_opencl
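
These `when defined(...)` guards are the heart of the opt-in behaviour: the CUDA and OpenCL code paths only exist when the corresponding symbol is set at compile time via Nim's standard -d flag (exact project build setup may vary):

# nim c myprog.nim                      # CPU only (default)
# nim c -d:cuda myprog.nim              # enable the CUDA backend
# nim c -d:opencl myprog.nim            # enable the OpenCL backend
# nim c -d:cuda -d:opencl myprog.nim    # enable both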

func check_nested_elements*(shape: Metadata, len: int) {.inline.}=
  ## Compare the detected shape from flatten with the real length of the data