Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,30 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_XPU_OPS_ROOT}/test/sycl ${CMAKE_BINARY_DIR}/test_sycl)
endif()

# The library couples with PyTorch. There are two possible build processes:
# 1. standalone - Needs a pre-installed PyTorch; PyTorch dependencies are
#    introduced through the PyTorch installation directory.
# 2. submodule - Built as a submodule of PyTorch; PyTorch dependencies are
#    introduced through the PyTorch sources directory.
if(PYTORCH_USE_XPU)
# submodule - PyTorch's own build supplies the dependencies; nothing to do here.
else()
# standalone
# NOTE(review): "Torch_COMP_VERION" looks like a typo for "Torch_COMP_VERSION";
# the variable is not referenced anywhere in this file — confirm intended
# consumers before renaming.
set(Torch_COMP_VERION "2.3.0")

if(NOT PYTORCH_INSTALL_DIR)
message(FATAL_ERROR "Cannot find PYTORCH_INSTALL_DIR in standalone build mode, please set -DPYTORCH_INSTALL_DIR ...")
endif()

# Locate the installed Torch package config (provides TORCH_INCLUDE_DIRS).
set(Torch_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Torch)
find_package(Torch REQUIRED)

# Locate the Caffe2 package config from the same installation
# (provides CAFFE2_INCLUDE_DIRS).
set(Caffe2_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Caffe2)
find_package(Caffe2 REQUIRED)

# Combined include paths consumed by src/CMakeLists.txt.
set(PYTORCH_INCLUDE_DIRS ${TORCH_INCLUDE_DIRS} ${CAFFE2_INCLUDE_DIRS})
endif()

# Build the torch_xpu_ops library itself (out-of-tree binary dir).
add_subdirectory(${TORCH_XPU_OPS_ROOT}/src ${CMAKE_BINARY_DIR}/torch_xpu_ops)

# Signal to the including/parent build that XPU ops were configured.
set(PYTORCH_FOUND_XPU TRUE)

message(STATUS "XPU found")
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<div align="center">

torch-xpu-ops
===========================

torch-xpu-ops is an `xpu` implementation of PyTorch ATen operators.

## Build
* Standalone - Requires pre-installation of PyTorch
```bash
mkdir build
cd build && cmake -DBUILD_TEST=1 -DPYTORCH_INSTALL_DIR=YOUR_PYTORCH_INSTALLATION_DIR ..
make -j $(nproc)
```
* Submodule - Build as a submodule of PyTorch
```bash
# TODO
```
25 changes: 25 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# define archive static library target - torch_xpu_ops

# Source accumulator lists; populated by the aten/ subdirectory via PARENT_SCOPE.
set(ATen_XPU_CPP_SRCS)
set(ATen_XPU_SYCL_SRCS)

set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src)

add_subdirectory(aten)

# SYCL_LINK_LIBRARIES_KEYWORD must be set BEFORE sycl_add_library so that the
# libraries it links are attached with PRIVATE visibility.
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)
sycl_add_library(
torch_xpu_ops
STATIC
SYCL_SOURCES ${ATen_XPU_SYCL_SRCS}
CXX_SOURCES ${ATen_XPU_CPP_SRCS})
# Reset the keyword so later sycl_add_library calls elsewhere are unaffected.
set(SYCL_LINK_LIBRARIES_KEYWORD)

# Align with PyTorch compile options
# 1. submodule - PYTORCH_SRC_DIR/cmake/public/utils.cmake
# 2. standalone - PYTORCH_INSTALL_DIR/share/cmake/Caffe2/public/utils.cmake
torch_compile_options(torch_xpu_ops)
target_compile_options_if_supported(torch_xpu_ops "-Wno-deprecated-copy")

# PYTORCH_INCLUDE_DIRS comes from the top-level CMakeLists (standalone mode);
# ATen_XPU_INCLUDE_DIRS exposes this project's own headers.
target_include_directories(torch_xpu_ops PUBLIC ${PYTORCH_INCLUDE_DIRS})
target_include_directories(torch_xpu_ops PUBLIC ${ATen_XPU_INCLUDE_DIRS})
10 changes: 10 additions & 0 deletions src/aten/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# ATen XPU sources
#
# NOTE: file(GLOB) only re-evaluates at configure time, so newly added source
# files are silently missed until a manual re-run of CMake. CONFIGURE_DEPENDS
# (CMake >= 3.12) makes the generated build system re-check the glob on every
# build and re-configure automatically when the file set changes.
file(GLOB xpu_cpp CONFIGURE_DEPENDS "*.cpp")
file(GLOB xpu_sycl CONFIGURE_DEPENDS "sycl/*.cpp")

list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})

# Export the accumulated lists to the parent directory (src/CMakeLists.txt).
set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE)
set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
97 changes: 97 additions & 0 deletions src/aten/EmptyTensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Context.h>
#include <ATen/EmptyTensor.h>
#include <c10/core/DeviceGuard.h>

#include <aten/EmptyTensor.h>

namespace at::detail {

// Allocate an uninitialized tensor tagged with the XPU dispatch key.
//
// NOTE(review): memory is obtained from at::getCPUAllocator() even though the
// dispatch key set is XPU — presumably a bootstrap placeholder until a real
// XPU device allocator exists; confirm before relying on device placement.
// The device guard is likewise commented out (see XXX below).
TensorBase empty_xpu(
IntArrayRef size,
ScalarType dtype,
c10::optional<Device> device_opt,
c10::optional<c10::MemoryFormat> memory_format_opt) {
const auto device = device_or_default(device_opt);
TORCH_INTERNAL_ASSERT(device.is_xpu());
// XXX
// const c10::DeviceGuard device_guard(device);
auto* allocator = at::getCPUAllocator();
constexpr c10::DispatchKeySet xpu_dks(c10::DispatchKey::XPU);
return at::detail::empty_generic(
size, allocator, xpu_dks, dtype, memory_format_opt);
}

// Optional-argument overload: validates layout/pin-memory requests, resolves
// the dtype default, and forwards to the concrete empty_xpu overload.
TensorBase empty_xpu(
    IntArrayRef size,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt,
    c10::optional<c10::MemoryFormat> memory_format_opt) {
  // Pinned memory is a CPU-only concept; reject an explicit request for it.
  const bool wants_pinned = pin_memory_opt.has_value() && *pin_memory_opt;
  TORCH_CHECK(!wants_pinned, "Only dense CPU tensors can be pinned");
  // Only strided layout is supported here.
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      layout_or_default(layout_opt) == Layout::Strided);

  return at::detail::empty_xpu(
      size, dtype_or_default(dtype_opt), device_opt, memory_format_opt);
}

// Convenience overload taking a bundled TensorOptions: unpacks each field
// into a named local and forwards to the optional-based overload.
TensorBase empty_xpu(IntArrayRef size, const TensorOptions& options) {
  const auto dtype = optTypeMetaToScalarType(options.dtype_opt());
  const auto layout = options.layout_opt();
  const auto device = options.device_opt();
  const auto pinned = options.pinned_memory_opt();
  const auto mem_format = options.memory_format_opt();
  return at::detail::empty_xpu(size, dtype, layout, device, pinned, mem_format);
}

// Allocate an uninitialized strided tensor tagged with the XPU dispatch key.
//
// NOTE(review): as in empty_xpu above, memory comes from at::getCPUAllocator()
// despite the XPU dispatch key — presumably a bootstrap placeholder until an
// XPU device allocator lands; confirm before relying on device placement.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
ScalarType dtype,
c10::optional<Device> device_opt) {
const auto device = device_or_default(device_opt);
TORCH_INTERNAL_ASSERT(device.is_xpu());
// XXX
// const c10::DeviceGuard device_guard(device);
auto* allocator = at::getCPUAllocator();
constexpr c10::DispatchKeySet xpu_dks(c10::DispatchKey::XPU);
return at::detail::empty_strided_generic(
size, stride, allocator, xpu_dks, dtype);
}

// Optional-argument overload: validates layout/pin-memory requests, resolves
// the dtype default, and forwards to the concrete empty_strided_xpu overload.
TensorBase empty_strided_xpu(
    IntArrayRef size,
    IntArrayRef stride,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt) {
  // Pinned memory is a CPU-only concept; reject an explicit request for it.
  const bool wants_pinned = pin_memory_opt.has_value() && *pin_memory_opt;
  TORCH_CHECK(!wants_pinned, "Only dense CPU tensors can be pinned");
  // Only strided layout is supported here.
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      layout_or_default(layout_opt) == Layout::Strided);

  return at::detail::empty_strided_xpu(
      size, stride, dtype_or_default(dtype_opt), device_opt);
}

// Convenience overload taking a bundled TensorOptions: unpacks each field
// into a named local and forwards to the optional-based overload.
TensorBase empty_strided_xpu(
    IntArrayRef size,
    IntArrayRef stride,
    const TensorOptions& options) {
  const auto dtype = optTypeMetaToScalarType(options.dtype_opt());
  const auto layout = options.layout_opt();
  const auto device = options.device_opt();
  const auto pinned = options.pinned_memory_opt();
  return at::detail::empty_strided_xpu(
      size, stride, dtype, layout, device, pinned);
}

} // namespace at::detail
42 changes: 42 additions & 0 deletions src/aten/EmptyTensor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once
#include <ATen/core/TensorBase.h>

namespace at::detail {

// Allocation entry points for the XPU backend; declarations mirror the
// overload sets defined in EmptyTensor.cpp.
// XXX: add TORCH_XPU_API

// Allocate an uninitialized XPU tensor of the given size and dtype.
TensorBase empty_xpu(
IntArrayRef size,
ScalarType dtype,
c10::optional<Device> device_opt,
c10::optional<c10::MemoryFormat> memory_format_opt);

// Optional-argument variant; pin_memory_opt must be unset or false.
TensorBase empty_xpu(
IntArrayRef size,
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt);

// Convenience variant taking a bundled TensorOptions.
TensorBase empty_xpu(IntArrayRef size, const TensorOptions& options);

// Allocate an uninitialized XPU tensor with explicit strides.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
ScalarType dtype,
c10::optional<Device> device_opt);

// Optional-argument variant; pin_memory_opt must be unset or false.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt);

// Convenience variant taking a bundled TensorOptions.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
const TensorOptions& options);

} // namespace at::detail
36 changes: 36 additions & 0 deletions src/aten/TensorFactories.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <torch/library.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty_native.h>
#include <ATen/ops/empty_strided_native.h>
#endif

#include <aten/EmptyTensor.h>

namespace at::native {

// `aten::empty.memory_format` kernel for the XPU backend.
//
// See Note [Enabling Deterministic Operations]: when deterministic algorithms
// together with deterministic fill of uninitialized memory are requested, the
// XPU backend cannot honor the fill yet, so fail loudly BEFORE allocating
// (the original allocated first, then threw — wasted work) rather than
// returning uninitialized memory. Also terminates the TORCH_CHECK statement
// with a semicolon, which the original omitted.
Tensor empty_xpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
  TORCH_CHECK(
      !(C10_UNLIKELY(
          at::globalContext().deterministicAlgorithms() &&
          at::globalContext().deterministicFillUninitializedMemory())),
      "XPU backend doesn't support deterministic implementation for empty ...");
  return at::detail::empty_xpu(
      size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
}

// `aten::empty_strided` kernel for the XPU backend.
//
// See Note [Enabling Deterministic Operations]: when deterministic algorithms
// together with deterministic fill of uninitialized memory are requested, the
// XPU backend cannot honor the fill yet, so fail loudly BEFORE allocating
// (the original allocated first, then threw — wasted work) rather than
// returning uninitialized memory. Also terminates the TORCH_CHECK statement
// with a semicolon, which the original omitted.
Tensor empty_strided_xpu(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
  TORCH_CHECK(
      !(C10_UNLIKELY(
          at::globalContext().deterministicAlgorithms() &&
          at::globalContext().deterministicFillUninitializedMemory())),
      "XPU backend doesn't support deterministic implementation for empty_strided ...");
  return at::detail::empty_strided_xpu(
      size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
}

// Register the XPU implementations of the allocation ops with the dispatcher.
TORCH_LIBRARY_IMPL(aten, XPU, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::empty.memory_format"), TORCH_FN(at::native::empty_xpu));
m.impl(TORCH_SELECTIVE_NAME("aten::empty_strided"), TORCH_FN(at::native::empty_strided_xpu));
}

} // namespace at::native