Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions src/ATen/native/xpu/sycl/BinaryDivFloorKernel.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/OpMathType.h>
#include <ATen/native/TensorIterator.h>
#include <c10/util/generic_math.h>
Expand Down Expand Up @@ -74,8 +74,10 @@ void div_floor_kernel(TensorIteratorBase& iter) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, dtype, "div_floor_xpu", [&]() {
AT_DISPATCH_V2(
dtype,
"div_floor_xpu",
AT_WRAP([&]() {
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto b = iter.scalar_value<accscalar_t>(2);
if (C10_UNLIKELY(b == 0)) {
Expand All @@ -86,12 +88,28 @@ void div_floor_kernel(TensorIteratorBase& iter) {
iter.remove_operand(2);
gpu_kernel(
iter, DivFloorWithScalarFunctor<scalar_t, accscalar_t>(b, inv_b));
});
}),
AT_EXPAND(AT_FLOATING_TYPES),
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, dtype, "div_floor_xpu", [&]() {
AT_DISPATCH_V2(
dtype,
"div_floor_xpu",
AT_WRAP([&]() {
gpu_kernel_with_scalars(iter, DivFloorFloatFunctor<scalar_t>());
});
}),
AT_EXPAND(AT_FLOATING_TYPES),
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
}
}
} // namespace at::native::xpu
37 changes: 28 additions & 9 deletions src/ATen/native/xpu/sycl/BinaryDivTrueKernel.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/OpMathType.h>
#include <ATen/native/TensorIterator.h>

#include <ATen/native/xpu/sycl/BinaryInternal.h>
#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/BinaryKernels.h>
#include <ATen/native/xpu/sycl/Loops.h>

namespace at::native::xpu {

Expand All @@ -21,22 +20,42 @@ void div_true_kernel(TensorIteratorBase& iter) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
kHalf, kBFloat16, common_dtype, "div_true_xpu", [&]() {
AT_DISPATCH_V2(
common_dtype,
"div_true_xpu",
AT_WRAP([&]() {
using opmath_t = at::opmath_type<scalar_t>;
auto inv_b = opmath_t(1.0) / iter.scalar_value<opmath_t>(2);
iter.remove_operand(2);
gpu_kernel(
iter,
BUnaryFunctor<scalar_t, scalar_t, scalar_t, MulFunctor<opmath_t>>(
MulFunctor<opmath_t>(), inv_b));
});
}),
AT_EXPAND(AT_COMPLEX_TYPES),
AT_EXPAND(AT_FLOATING_TYPES),
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
} else {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
kHalf, kBFloat16, common_dtype, "div_true_xpu", [&]() {
AT_DISPATCH_V2(
common_dtype,
"div_true_xpu",
AT_WRAP([&]() {
DivFunctor<scalar_t> f;
gpu_kernel_with_scalars(iter, f);
});
}),
AT_EXPAND(AT_COMPLEX_TYPES),
AT_EXPAND(AT_FLOATING_TYPES),
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
}
}

Expand Down
32 changes: 25 additions & 7 deletions src/ATen/native/xpu/sycl/BinaryDivTruncKernel.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/OpMathType.h>
#include <ATen/native/TensorIterator.h>

Expand Down Expand Up @@ -40,18 +40,36 @@ void div_trunc_kernel(TensorIteratorBase& iter) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, dtype, "div_trunc_xpu", [&]() {
AT_DISPATCH_V2(
dtype,
"div_trunc_xpu",
AT_WRAP([&]() {
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto inv_b = accscalar_t(1.0) / iter.scalar_value<accscalar_t>(2);
iter.remove_operand(2);
gpu_kernel(iter, DivTruncScalarFunctor<scalar_t, accscalar_t>(inv_b));
});
}),
AT_EXPAND(AT_FLOATING_TYPES),
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, dtype, "div_trunc_xpu", [&]() {
AT_DISPATCH_V2(
dtype,
"div_trunc_xpu",
AT_WRAP([&]() {
gpu_kernel_with_scalars(iter, DivTruncFunctor<scalar_t>());
});
}),
AT_EXPAND(AT_FLOATING_TYPES),
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
}
}

Expand Down
37 changes: 28 additions & 9 deletions src/ATen/native/xpu/sycl/BinaryKernels.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/native/TensorIterator.h>
#include <comm/xpu_aten.h>

#include <ATen/native/xpu/sycl/BinaryInternal.h>
#include <ATen/native/xpu/sycl/Loops.h>

#include <ATen/native/xpu/sycl/BinaryKernels.h>
#include <ATen/native/xpu/sycl/Loops.h>

namespace at::native::xpu {

Expand All @@ -28,12 +27,22 @@ void add_kernel(TensorIteratorBase& iter, const c10::Scalar& alpha) {
opmath_gpu_kernel_with_scalars<scalar_t>(
iter, AddFunctor(alpha.to<opmath_t>()));
} else {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
kHalf, kBFloat16, kBool, iter.common_dtype(), "add_xpu", [&]() {
AT_DISPATCH_V2(
common_dtype,
"add_xpu",
AT_WRAP([&]() {
using opmath_t = opmath_type<scalar_t>;
opmath_gpu_kernel_with_scalars<scalar_t>(
iter, AddFunctor(alpha.to<opmath_t>()));
});
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
kBool,
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
}
}

Expand All @@ -49,12 +58,22 @@ void mul_kernel(TensorIteratorBase& iter) {
opmath_symmetric_gpu_kernel_with_scalars<scalar_t>(
iter, MulFunctor<opmath_t>());
} else {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_xpu", [&]() {
AT_DISPATCH_V2(
common_dtype,
"mul_xpu",
AT_WRAP([&]() {
using opmath_t = opmath_type<scalar_t>;
opmath_symmetric_gpu_kernel_with_scalars<scalar_t>(
iter, MulFunctor<opmath_t>());
});
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
kBool,
kHalf,
kBFloat16,
kFloat8_e5m2,
kFloat8_e4m3fn,
kFloat8_e5m2fnuz,
kFloat8_e4m3fnuz);
}
}

Expand Down