Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,30 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_XPU_OPS_ROOT}/test/sycl ${CMAKE_BINARY_DIR}/test_sycl)
endif()

# The library couples with PyTorch. There are two possible build processes:
# 1. standalone - Needs a pre-installed PyTorch; PyTorch dependencies are
#    introduced through the PyTorch installation directory.
# 2. submodule - Built as a submodule of PyTorch; PyTorch dependencies are
#    introduced through the PyTorch sources directory.
if(PYTORCH_USE_XPU)
# submodule - PyTorch's own build supplies the dependencies; nothing to do here.
else()
# standalone
# NOTE(review): "Torch_COMP_VERION" looks like a typo for "Torch_COMP_VERSION";
# the variable is not referenced anywhere in this file — confirm intended
# consumers before renaming.
set(Torch_COMP_VERION "2.3.0")

if(NOT PYTORCH_INSTALL_DIR)
message(FATAL_ERROR "Cannot find PYTORCH_INSTALL_DIR in standalone build mode, please set -DPYTORCH_INSTALL_DIR ...")
endif()

# Locate the installed Torch package config (provides TORCH_INCLUDE_DIRS).
set(Torch_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Torch)
find_package(Torch REQUIRED)

# Locate the Caffe2 package config from the same installation
# (provides CAFFE2_INCLUDE_DIRS).
set(Caffe2_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Caffe2)
find_package(Caffe2 REQUIRED)

# Combined include paths consumed by src/CMakeLists.txt.
set(PYTORCH_INCLUDE_DIRS ${TORCH_INCLUDE_DIRS} ${CAFFE2_INCLUDE_DIRS})
endif()

# Build the torch_xpu_ops library itself (out-of-tree binary dir).
add_subdirectory(${TORCH_XPU_OPS_ROOT}/src ${CMAKE_BINARY_DIR}/torch_xpu_ops)

# Signal to the including/parent build that XPU ops were configured.
set(PYTORCH_FOUND_XPU TRUE)

message(STATUS "XPU found")
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<div align="center">

torch-xpu-ops
===========================

torch-xpu-ops is an `xpu` implementation of PyTorch ATen operators.

## Build
* Standalone - Requires pre-installation of PyTorch
```bash
mkdir build
cd build && cmake -DBUILD_TEST=1 -DPYTORCH_INSTALL_DIR=YOUR_PYTORCH_INSTALLATION_DIR ..
make -j $(nproc)
```
* Submodule - Build as a submodule of PyTorch
```bash
# TODO
```
25 changes: 25 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# define archive static library target - torch_xpu_ops

# Source accumulator lists; populated by the aten/ subdirectory via PARENT_SCOPE.
set(ATen_XPU_CPP_SRCS)
set(ATen_XPU_SYCL_SRCS)

set(ATen_XPU_INCLUDE_DIRS ${TORCH_XPU_OPS_ROOT}/src)

add_subdirectory(aten)

# SYCL_LINK_LIBRARIES_KEYWORD must be set BEFORE sycl_add_library so that the
# libraries it links are attached with PRIVATE visibility.
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)
sycl_add_library(
torch_xpu_ops
STATIC
SYCL_SOURCES ${ATen_XPU_SYCL_SRCS}
CXX_SOURCES ${ATen_XPU_CPP_SRCS})
# Reset the keyword so later sycl_add_library calls elsewhere are unaffected.
set(SYCL_LINK_LIBRARIES_KEYWORD)

# Align with PyTorch compile options
# 1. submodule - PYTORCH_SRC_DIR/cmake/public/utils.cmake
# 2. standalone - PYTORCH_INSTALL_DIR/share/cmake/Caffe2/public/utils.cmake
torch_compile_options(torch_xpu_ops)
target_compile_options_if_supported(torch_xpu_ops "-Wno-deprecated-copy")

# PYTORCH_INCLUDE_DIRS comes from the top-level CMakeLists (standalone mode);
# ATen_XPU_INCLUDE_DIRS exposes this project's own headers.
target_include_directories(torch_xpu_ops PUBLIC ${PYTORCH_INCLUDE_DIRS})
target_include_directories(torch_xpu_ops PUBLIC ${ATen_XPU_INCLUDE_DIRS})
10 changes: 10 additions & 0 deletions src/aten/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# ATen XPU sources
#
# NOTE: file(GLOB) only re-evaluates at configure time, so newly added source
# files are silently missed until a manual re-run of CMake. CONFIGURE_DEPENDS
# (CMake >= 3.12) makes the generated build system re-check the glob on every
# build and re-configure automatically when the file set changes.
file(GLOB xpu_cpp CONFIGURE_DEPENDS "*.cpp")
file(GLOB xpu_sycl CONFIGURE_DEPENDS "sycl/*.cpp")

list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})

# Export the accumulated lists to the parent directory (src/CMakeLists.txt).
set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE)
set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
97 changes: 97 additions & 0 deletions src/aten/EmptyTensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Context.h>
#include <ATen/EmptyTensor.h>
#include <c10/core/DeviceGuard.h>

#include <aten/EmptyTensor.h>

namespace at::detail {

// Allocate an uninitialized tensor tagged with the XPU dispatch key.
//
// NOTE(review): memory is obtained from at::getCPUAllocator() even though the
// dispatch key set is XPU — presumably a bootstrap placeholder until a real
// XPU device allocator exists; confirm before relying on device placement.
// The device guard is likewise commented out (see XXX below).
TensorBase empty_xpu(
IntArrayRef size,
ScalarType dtype,
c10::optional<Device> device_opt,
c10::optional<c10::MemoryFormat> memory_format_opt) {
const auto device = device_or_default(device_opt);
TORCH_INTERNAL_ASSERT(device.is_xpu());
// XXX
// const c10::DeviceGuard device_guard(device);
auto* allocator = at::getCPUAllocator();
constexpr c10::DispatchKeySet xpu_dks(c10::DispatchKey::XPU);
return at::detail::empty_generic(
size, allocator, xpu_dks, dtype, memory_format_opt);
}

// Optional-argument overload: validates layout/pin-memory requests, resolves
// the dtype default, and forwards to the concrete empty_xpu overload.
TensorBase empty_xpu(
    IntArrayRef size,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt,
    c10::optional<c10::MemoryFormat> memory_format_opt) {
  // Pinned memory is a CPU-only concept; reject an explicit request for it.
  const bool wants_pinned = pin_memory_opt.has_value() && *pin_memory_opt;
  TORCH_CHECK(!wants_pinned, "Only dense CPU tensors can be pinned");
  // Only strided layout is supported here.
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      layout_or_default(layout_opt) == Layout::Strided);

  return at::detail::empty_xpu(
      size, dtype_or_default(dtype_opt), device_opt, memory_format_opt);
}

// Convenience overload taking a bundled TensorOptions: unpacks each field
// into a named local and forwards to the optional-based overload.
TensorBase empty_xpu(IntArrayRef size, const TensorOptions& options) {
  const auto dtype = optTypeMetaToScalarType(options.dtype_opt());
  const auto layout = options.layout_opt();
  const auto device = options.device_opt();
  const auto pinned = options.pinned_memory_opt();
  const auto mem_format = options.memory_format_opt();
  return at::detail::empty_xpu(size, dtype, layout, device, pinned, mem_format);
}

// Allocate an uninitialized strided tensor tagged with the XPU dispatch key.
//
// NOTE(review): as in empty_xpu above, memory comes from at::getCPUAllocator()
// despite the XPU dispatch key — presumably a bootstrap placeholder until an
// XPU device allocator lands; confirm before relying on device placement.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
ScalarType dtype,
c10::optional<Device> device_opt) {
const auto device = device_or_default(device_opt);
TORCH_INTERNAL_ASSERT(device.is_xpu());
// XXX
// const c10::DeviceGuard device_guard(device);
auto* allocator = at::getCPUAllocator();
constexpr c10::DispatchKeySet xpu_dks(c10::DispatchKey::XPU);
return at::detail::empty_strided_generic(
size, stride, allocator, xpu_dks, dtype);
}

// Optional-argument overload: validates layout/pin-memory requests, resolves
// the dtype default, and forwards to the concrete empty_strided_xpu overload.
TensorBase empty_strided_xpu(
    IntArrayRef size,
    IntArrayRef stride,
    c10::optional<ScalarType> dtype_opt,
    c10::optional<Layout> layout_opt,
    c10::optional<Device> device_opt,
    c10::optional<bool> pin_memory_opt) {
  // Pinned memory is a CPU-only concept; reject an explicit request for it.
  const bool wants_pinned = pin_memory_opt.has_value() && *pin_memory_opt;
  TORCH_CHECK(!wants_pinned, "Only dense CPU tensors can be pinned");
  // Only strided layout is supported here.
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      layout_or_default(layout_opt) == Layout::Strided);

  return at::detail::empty_strided_xpu(
      size, stride, dtype_or_default(dtype_opt), device_opt);
}

// Convenience overload taking a bundled TensorOptions: unpacks each field
// into a named local and forwards to the optional-based overload.
TensorBase empty_strided_xpu(
    IntArrayRef size,
    IntArrayRef stride,
    const TensorOptions& options) {
  const auto dtype = optTypeMetaToScalarType(options.dtype_opt());
  const auto layout = options.layout_opt();
  const auto device = options.device_opt();
  const auto pinned = options.pinned_memory_opt();
  return at::detail::empty_strided_xpu(
      size, stride, dtype, layout, device, pinned);
}

} // namespace at::detail
42 changes: 42 additions & 0 deletions src/aten/EmptyTensor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once
#include <ATen/core/TensorBase.h>

namespace at::detail {

// Allocation entry points for the XPU backend; declarations mirror the
// overload sets defined in EmptyTensor.cpp.
// XXX: add TORCH_XPU_API

// Allocate an uninitialized XPU tensor of the given size and dtype.
TensorBase empty_xpu(
IntArrayRef size,
ScalarType dtype,
c10::optional<Device> device_opt,
c10::optional<c10::MemoryFormat> memory_format_opt);

// Optional-argument variant; pin_memory_opt must be unset or false.
TensorBase empty_xpu(
IntArrayRef size,
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt);

// Convenience variant taking a bundled TensorOptions.
TensorBase empty_xpu(IntArrayRef size, const TensorOptions& options);

// Allocate an uninitialized XPU tensor with explicit strides.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
ScalarType dtype,
c10::optional<Device> device_opt);

// Optional-argument variant; pin_memory_opt must be unset or false.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt);

// Convenience variant taking a bundled TensorOptions.
TensorBase empty_strided_xpu(
IntArrayRef size,
IntArrayRef stride,
const TensorOptions& options);

} // namespace at::detail
36 changes: 36 additions & 0 deletions src/aten/TensorFactories.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <torch/library.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty_native.h>
#include <ATen/ops/empty_strided_native.h>
#endif

#include <aten/EmptyTensor.h>

namespace at::native {

// `aten::empty.memory_format` kernel for the XPU backend.
//
// See Note [Enabling Deterministic Operations]: when deterministic algorithms
// together with deterministic fill of uninitialized memory are requested, the
// XPU backend cannot honor the fill yet, so fail loudly BEFORE allocating
// (the original allocated first, then threw — wasted work) rather than
// returning uninitialized memory. Also terminates the TORCH_CHECK statement
// with a semicolon, which the original omitted.
Tensor empty_xpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
  TORCH_CHECK(
      !(C10_UNLIKELY(
          at::globalContext().deterministicAlgorithms() &&
          at::globalContext().deterministicFillUninitializedMemory())),
      "XPU backend doesn't support deterministic implementation for empty ...");
  return at::detail::empty_xpu(
      size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
}

// `aten::empty_strided` kernel for the XPU backend.
//
// See Note [Enabling Deterministic Operations]: when deterministic algorithms
// together with deterministic fill of uninitialized memory are requested, the
// XPU backend cannot honor the fill yet, so fail loudly BEFORE allocating
// (the original allocated first, then threw — wasted work) rather than
// returning uninitialized memory. Also terminates the TORCH_CHECK statement
// with a semicolon, which the original omitted.
Tensor empty_strided_xpu(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
  TORCH_CHECK(
      !(C10_UNLIKELY(
          at::globalContext().deterministicAlgorithms() &&
          at::globalContext().deterministicFillUninitializedMemory())),
      "XPU backend doesn't support deterministic implementation for empty_strided ...");
  return at::detail::empty_strided_xpu(
      size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
}

// Register the XPU implementations of the allocation ops with the dispatcher.
TORCH_LIBRARY_IMPL(aten, XPU, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::empty.memory_format"), TORCH_FN(at::native::empty_xpu));
m.impl(TORCH_SELECTIVE_NAME("aten::empty_strided"), TORCH_FN(at::native::empty_strided_xpu));
}

} // namespace at::native