This Rust crate serves as the bridge between computing frameworks such as Candle and the underlying computing interface (UHAL/UHHI).
Computing instructions are dispatched to the corresponding devices through: ubridge -> UHAL/UHHI -> Runtime/Driver.
The GCU kernels, written in TopsCC, are prebuilt under kernels/scorpio (for GCU S60). Any change to kernel source code (*.cpp) in the kernels folder triggers an automatic rebuild (the topscc compiler is required).
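The rebuild hook is the kind of thing a Cargo build script handles; below is a minimal sketch, assuming a hypothetical build.rs and topscc invocation (the actual script, kernel paths, and compiler flags in this crate may differ).

//build.rs (illustrative sketch; paths and topscc flags are assumptions)
use std::process::Command;

fn main() {
    //Re-run this build script whenever anything under kernels/ changes.
    println!("cargo:rerun-if-changed=kernels");
    //Hypothetical invocation: compile one kernel source to a .topsfb fatbin.
    let status = Command::new("topscc")
        .args(["kernels/activation.cpp", "-o", "kernels/scorpio/activation.topsfb"])
        .status()
        .expect("failed to run topscc; is the TopsCC compiler installed?");
    assert!(status.success(), "topscc compilation failed");
}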
device_tensor.rs: higher-level abstraction of device tensors.
device_executor.rs: execution engine and kernel management.
device_opcode.rs: definitions of the currently supported operators.
gcu_device.rs: abstraction of the GCU device for Candle.
gcu_slice.rs: tensor slicing for candle-gcu.
gcu_launch.rs: GCU kernel launching for candle-gcu.
tests/*: samples for UHAL and ubridge.
main.rs: entry point for the samples (run via cargo run).
kernels: CUDA/GCU kernels.
cd ubridge
cargo run --features gcu

//Example of UHAL for a neural network forward pass (on NVIDIA GPU & Enflame GCU)
use cust_core::DeviceCopy;
use std::collections::HashMap;
//Import UHAL for common computing interfaces
use uhal::error::DeviceResult;
use uhal::launch;
use uhal::memory::DeviceBufferTrait;
use uhal::module::ModuleTrait;
use uhal::stream::{StreamFlags, StreamTrait};
use uhal::DriverLibraryTrait;
//Tops backend
#[cfg(feature = "gcu")]
use tops::memory::CopyDestination;
#[cfg(feature = "gcu")]
use tops::memory::TopsDeviceBuffer as DeviceBuffer;
#[cfg(feature = "gcu")]
use tops::module::TopsModule as Module;
#[cfg(feature = "gcu")]
use tops::stream::TopsStream as Stream;
#[cfg(feature = "gcu")]
use tops::TopsApi as Api;
#[cfg(feature = "gcu")]
use tops_backend as tops;
//Cuda backend
#[cfg(feature = "cuda")]
use cuda::memory::CopyDestination;
#[cfg(feature = "cuda")]
use cuda::memory::CuDeviceBuffer as DeviceBuffer;
#[cfg(feature = "cuda")]
use cuda::module::CuModule as Module;
#[cfg(feature = "cuda")]
use cuda::stream::CuStream as Stream;
#[cfg(feature = "cuda")]
use cuda::CuApi as Api;
#[cfg(feature = "cuda")]
use cuda_backend as cuda;
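//Exactly one backend feature ("gcu" or "cuda") should be enabled at build
//time; the aliases above (Api, DeviceBuffer, Module, Stream) then resolve
//to that backend.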
use crate::device_executor::DeviceExecutor;
fn load_module(name: &str) -> DeviceResult<Module> {
#[cfg(feature = "gcu")]
let path = format!("{}/kernels/legacy/scorpio/{}.topsfb", env!("CARGO_MANIFEST_DIR"), name);
#[cfg(feature = "cuda")]
let path = format!("{}/kernels/gpu/{}.ptx", env!("CARGO_MANIFEST_DIR"), name);
Module::from_file(&path)
}
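//A layer description: operator name, optional device-side weight buffer,
//and the input/output shapes.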
struct Layer<'a, T: DeviceCopy> {
op: &'a str,
weight: Option<&'a DeviceBuffer<T>>,
input_size: (usize, usize),
output_size: (usize, usize),
out_ref: Option<&'a DeviceBuffer<T>>,
}
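//Launch dimensions for the CUDA backend: 16 x 16 threads per block plus a
//grid covering a shape1 x shape0 tensor via ceiling division, e.g.
//get_block_grid(16, 16) = (16, 1, 1).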
pub fn get_block_grid(shape1: usize, shape0: usize) -> (usize, usize, usize) {
let grid_a: usize = (shape1 + 16 - 1) / 16;
let grid_b: usize = (shape0 + 16 - 1) / 16;
(16, grid_a, grid_b)
}
//Forward pass of a 6-layer neural network (each matmul/convolution layer is followed by an activation)
//Unified interface (UHAL) for CUDA and Tops backend
#[allow(non_snake_case)]
pub fn network_test() -> DeviceResult<()> {
let _device = Api::quick_init(0)?;
let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
const N: usize = 16;
const K: usize = 3;
let w1 = DeviceBuffer::from_slice(&vec![0.01f32; N * N])?;
let w2 = DeviceBuffer::from_slice(&vec![0.02f32; N * N])?;
let w3 = DeviceBuffer::from_slice(&vec![0.03f32; N * N])?;
let w4 = DeviceBuffer::from_slice(&vec![0.04f32; N * N])?;
let w5 = DeviceBuffer::from_slice(&vec![0.05f32; N * N])?;
//Network layers: matmul (tanh act) -> matmul (relu act) -> matmul (tanh act) -> convolution (3x3 kernel, tanh act) -> matmul (tanh act) -> matmul (gelu act)
let layers = vec![
Layer::<f32> {
op: "batch_matmul_legacy",
weight: Some(&w1),
input_size: (N, N),
output_size: (N, N),
out_ref: None,
}, //weight is the N x N matrix for the next layer
Layer::<f32> {
op: "tanh",
weight: None,
input_size: (N, N),
output_size: (N, N),
out_ref: None,
}, //out N x N
Layer::<f32> {
op: "batch_matmul_legacy",
weight: Some(&w2),
input_size: (N, N),
output_size: (N, N),
out_ref: None,
}, //weight is the N x N matrix for the next layer
Layer::<f32> {
op: "relu",
weight: None,
input_size: (N, N),
output_size: (N, N),
out_ref: None,
}, //out N x N
Layer::<f32> {
op: "batch_matmul_legacy",
weight: Some(&w3),
input_size: (N, N),
output_size: (N, N),
out_ref: None,
}, //weight is convolution kernel for next layer
Layer::<f32> {
op: "tanh",
weight: None,
input_size: (N, N),
output_size: (N, N),
out_ref: None,
}, //out N x N
Layer::<f32> {
op: "convolution",
weight: Some(&w4),
input_size: (N, N),
output_size: (N - K + 1, N - K + 1),
out_ref: None,
}, //weight is the (N - K + 1) x (N - K + 1) matrix for the next layer
Layer::<f32> {
op: "tanh",
weight: None,
input_size: (N - K + 1, N - K + 1),
output_size: (N - K + 1, N - K + 1),
out_ref: None,
}, //out (N - K + 1) x (N - K + 1)
Layer::<f32> {
op: "batch_matmul_legacy",
weight: Some(&w5),
input_size: (N - K + 1, N - K + 1),
output_size: (N - K + 1, N - K + 1),
out_ref: None,
}, //weight is the (N - K + 1) x (N - K + 1) matrix for the next layer
Layer::<f32> {
op: "tanh",
weight: None,
input_size: (N - K + 1, N - K + 1),
output_size: (N - K + 1, N - K + 1),
out_ref: None,
}, //output shape (N - K + 1) * (N - K + 1)
Layer::<f32> {
op: "batch_matmul_legacy",
weight: None,
input_size: (N - K + 1, N - K + 1),
output_size: (N - K + 1, N - K + 1),
out_ref: None,
}, // no weight in the last layer
Layer::<f32> {
op: "gelu",
weight: None,
input_size: (N - K + 1, N - K + 1),
output_size: (N - K + 1, N - K + 1),
out_ref: None,
}, //output shape (N - K + 1) * (N - K + 1)
];
let mat = vec![0.5f32; N * N];
let mato = vec![0.0f32; N * N];
let convo = vec![0.0f32; (N - K + 1) * (N - K + 1)];
let matA = DeviceBuffer::from_slice(&mat)?;
let matB = DeviceBuffer::from_slice(&mat)?;
let matOut = DeviceBuffer::from_slice(&mato)?;
let matConvOut = DeviceBuffer::from_slice(&convo)?;
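//Integer codes understood by the activation kernel.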
let map_act = HashMap::from([("relu", 0), ("gelu", 1), ("leaky", 2), ("tanh", 3)]);
let mut out_ref: Option<&DeviceBuffer<f32>> = Some(&matOut);
let mut matA_ref: Option<&DeviceBuffer<f32>> = Some(&matA);
let mut matB_ref: Option<&DeviceBuffer<f32>> = Some(&matB);
let mut out_size: Option<(usize, usize)> = None;
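//Buffers are chained between layers: each compute layer reads matA_ref
//(input) and matB_ref (weight), writes its result to out_ref, makes that
//result the next layer's input, and publishes its weight field as the
//next layer's matB_ref.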
for layer in layers {
if ["relu", "gelu", "leaky", "tanh"].contains(&layer.op) {
let function_name = "activation";
match load_module(function_name) {
Ok(module) => {
let function_namef32 = "activationf32";
let kernel = module.get_function(&function_namef32)?;
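//The GCU activation kernel receives the element count and activation code
//through a device-side parameter buffer; the CUDA kernel takes them as
//scalar launch arguments.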
#[cfg(feature = "gcu")]
let param = DeviceBuffer::from_slice(&[
(layer.input_size.0 * layer.input_size.1) as i32,
map_act[layer.op] as i32,
])?;
#[cfg(feature = "cuda")]
let (block_size, grid_a, grid_b) =
get_block_grid(layer.input_size.1, layer.input_size.0);
let A = matA_ref.expect("missing input tensor");
unsafe {
#[cfg(feature = "gcu")]
let result = launch!(kernel<<<(1, 1, 1), (1, 1, 1), 0, stream>>>(
A.as_device_ptr(),
param.as_device_ptr(),
));
#[cfg(feature = "cuda")]
let result = launch!(kernel<<<(grid_a as u32, grid_b as u32), (block_size as u32, block_size as u32), 0, stream>>>(
A.as_device_ptr(),
layer.input_size.0 as u32,
layer.input_size.1 as u32,
map_act[layer.op]
));
result?;
}
out_ref = Some(A);
out_size = Some(layer.output_size);
}
_ => {
panic!("Failed to load kernel!");
}
}
} else if layer.op == "batch_matmul_legacy" {
match load_module(layer.op) {
Ok(module) => {
let kernel = module.get_function(&layer.op)?;
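//The GCU matmul kernel reads the operand shapes (batch, rows, cols) from
//device-side buffers; the CUDA kernel takes dimensions as scalar arguments.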
#[cfg(feature = "gcu")]
let inputShapeA = DeviceBuffer::from_slice(&[
1i32,
layer.input_size.0 as i32,
layer.input_size.1 as i32,
])?;
#[cfg(feature = "gcu")]
let inputShapeB = DeviceBuffer::from_slice(&[
1i32,
layer.input_size.0 as i32,
layer.input_size.1 as i32,
])?;
let A = matA_ref.expect("missing input tensor");
let B = matB_ref.expect("missing weight tensor");
let O = out_ref.expect("missing output tensor");
#[cfg(feature = "cuda")]
let (block_size, grid_a, grid_b) =
get_block_grid(layer.output_size.1, layer.output_size.0);
unsafe {
#[cfg(feature = "gcu")]
let result = launch!(kernel<<<(1, 1, 1), (1, 1, 1), 0, stream>>>(
A.as_device_ptr(),
B.as_device_ptr(),
O.as_device_ptr(),
inputShapeA.as_device_ptr(),
inputShapeB.as_device_ptr()
));
#[cfg(feature = "cuda")]
let result = launch!(kernel<<<(grid_a as u32, grid_b as u32), (block_size as u32, block_size as u32), 0, stream>>>(
A.as_device_ptr(),
B.as_device_ptr(),
O.as_device_ptr(),
layer.input_size.0 as u32,
layer.input_size.1 as u32,
layer.output_size.1 as u32
));
result?;
}
matA_ref = Some(O);
if let Some(w) = layer.weight {
matB_ref = Some(w);
}
out_ref = Some(O);
out_size = Some(layer.output_size);
}
_ => {
panic!("\nFailed to load kernel (matmul)!");
}
}
} else if layer.op == "convolution" {
match load_module(layer.op) {
Ok(module) => {
let kernel = module.get_function(&layer.op)?;
let A = matA_ref.expect("missing input tensor");
let B = matB_ref.expect("missing convolution kernel");
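//Shape and channel descriptors consumed by the GCU convolution kernel.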
#[cfg(feature = "gcu")]
let inputShapeA = DeviceBuffer::from_slice(&[
layer.input_size.0 as i32,
layer.input_size.1 as i32,
1i32,
1i32,
])?;
#[cfg(feature = "gcu")]
let inputShapeB = DeviceBuffer::from_slice(&[K as i32, K as i32, 1i32, 1i32])?;
#[cfg(feature = "gcu")]
let channelInfo = DeviceBuffer::from_slice(&[1i32, 1i32, 1i32, 1i32])?;
unsafe {
#[cfg(feature = "gcu")]
let result = launch!(kernel<<<(1, 1, 1), (1, 1, 1), 0, stream>>>(
A.as_device_ptr(),
B.as_device_ptr(),
matConvOut.as_device_ptr(),
inputShapeA.as_device_ptr(),
inputShapeB.as_device_ptr(),
channelInfo.as_device_ptr()
));
#[cfg(feature = "cuda")]
let result = launch!(kernel<<<(1, 1, 1), (1, 1, 1), 0, stream>>>(
A.as_device_ptr(),
B.as_device_ptr(),
matConvOut.as_device_ptr(),
layer.input_size.0 as u32,
layer.input_size.1 as u32,
K as u32,
K as u32
));
result?;
}
matA_ref = Some(&matConvOut);
if let Some(w) = layer.weight {
matB_ref = Some(w);
}
out_ref = Some(&matConvOut);
out_size = Some(layer.output_size);
}
_ => {
panic!("\nFailed to load kernel (convolution)!");
}
}
} else {
panic!("Operation {} not supported!", layer.op);
}
}
// Wait for asynchronous kernels to finish.
stream.synchronize()?;
match out_ref {
Some(out) => {
let mut out_host = vec![0.0f32; out.len()];
out.copy_to(&mut out_host)?;
match out_size {
Some(sz) => {
let W = sz.0;
let H = sz.1;
println!("\n\nResults of forward pass******************");
for x in 0..H {
for y in 0..W {
print!("{:.5} ", out_host[x * W + y]);
}
println!("{}", "");
}
}
_ => {
panic!("Unable to obtain compute result!")
}
}
}
_ => {
panic!("Unable to obtain compute result!")
}
}
println!("\nLaunched compute kernel successfully.");
Ok(())
}
fn main() -> DeviceResult<()> {
println!("******************\ninfo: start uhal network test!\n");
match network_test() {
Ok(()) => {
println!("\nLaunched network_test successfully.");
}
Err(e) => {
println!("\nLaunch network_test failed.");
return Err(e);
}
}
println!("\n\nPASSED!\n\n");
Ok(())
}

Output of the forward pass (same on NVIDIA GPU and Enflame GCU):
Results of forward pass******************
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272 0.00272
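As a sanity check, the constant 0.00272 can be reproduced on the CPU: because every buffer holds one repeated value, the whole forward pass collapses to scalar arithmetic. The sketch below assumes the kernels compute plain matmul/convolution with the weight chaining noted in the code, and approximates gelu with the usual tanh formula.

//CPU reference for the uniform-input forward pass above (sketch).
fn main() {
    let n = 16.0f64; //N
    let m = 14.0f64; //N - K + 1 after the 3x3 convolution
    let mut x = 0.5f64;
    x = n * x * 0.5; //matmul: 0.5-filled input x 0.5-filled input -> 4.0
    x = x.tanh();
    x = n * x * 0.01; //matmul with w1
    x = x.max(0.0); //relu
    x = n * x * 0.02; //matmul with w2
    x = x.tanh();
    x = 9.0 * x * 0.03; //3x3 convolution, kernel values from w3
    x = x.tanh();
    x = m * x * 0.04; //matmul with w4
    x = x.tanh();
    x = m * x * 0.05; //matmul with w5 (the weightless last matmul reuses it)
    //tanh approximation of gelu (stable Rust has no f64::erf)
    let g = 0.5 * x * (1.0 + (0.79788456 * (x + 0.044715 * x.powi(3))).tanh());
    println!("{g:.5}"); //prints 0.00272, matching the 14 x 14 device output
}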