diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 90ea13281ba..47394cbdba0 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -37,6 +37,18 @@ if [[ -z "${MODE:-}" ]]; then
   exit 1
 fi
 
+if [[ "${MODE}" =~ xnnpack.* ]]; then
+  XNNPACK=ON
+else
+  XNNPACK=OFF
+fi
+
+if [[ "${MODE}" =~ .*custom.* ]]; then
+  CUSTOM=ON
+else
+  CUSTOM=OFF
+fi
+
 if [[ -z "${BUCK:-}" ]]; then
   BUCK=buck2
 fi
@@ -47,25 +59,20 @@ fi
 
 which "${PYTHON_EXECUTABLE}"
 
-
 cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
-  if [[ "${MODE}" == "xnnpack" ]]; then
-    XNNPACK=ON
-  else
-    XNNPACK=OFF
-  fi
   retry cmake -DBUCK2="$BUCK" \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=Debug \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
-  cmake --build cmake-out -j9 --target install --config Release
+  cmake --build cmake-out -j9 --target install --config Debug
 }
 
 cmake_build_llama_runner() {
@@ -73,12 +80,15 @@ cmake_build_llama_runner() {
   dir="examples/models/llama2"
   retry cmake -DBUCK2="$BUCK" \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
+    -DEXECUTORCH_BUILD_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
-  cmake --build cmake-out/${dir} -j9 --config Release
+  cmake --build cmake-out/${dir} -j9 --config Debug
 }
 
 
@@ -117,9 +127,10 @@ fi
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
 EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
-if [[ "${MODE}" == "xnnpack" ]]; then
+if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
 fi
+# Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
 
 # Create tokenizer.bin.
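
A minimal sketch (standalone bash, not part of the patch) of how the regex-based MODE handling added above resolves the two build flags; CI only passes "portable" and "xnnpack+kv+custom", the other mode strings in the loop are illustrative:

for MODE in portable xnnpack xnnpack+custom xnnpack+kv+custom; do
  # Same tests as in test_llama.sh: anything starting with "xnnpack" enables the
  # XNNPACK backend, anything containing "custom" enables the custom ops build.
  if [[ "${MODE}" =~ xnnpack.* ]]; then XNNPACK=ON; else XNNPACK=OFF; fi
  if [[ "${MODE}" =~ .*custom.* ]]; then CUSTOM=ON; else CUSTOM=OFF; fi
  echo "MODE=${MODE} -> XNNPACK=${XNNPACK}, CUSTOM=${CUSTOM}"
done
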
ON + "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) # # cpuinfo: build cpuinfo library. Disable on unsupported platforms @@ -184,6 +185,10 @@ cmake_dependent_option(EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) +if(EXECUTORCH_BUILD_CUSTOM) + set(EXECUTORCH_BUILD_OPTIMIZED ON) +endif() + if(EXECUTORCH_BUILD_CPUINFO) # --- cpuinfo set(CPUINFO_SOURCE_DIR "backends/xnnpack/third-party/cpuinfo") @@ -508,24 +513,38 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) endif() + # find pytorch lib, to allow pybind to take at::Tensor as input/output + find_package(Torch CONFIG REQUIRED) + find_library(TORCH_PYTHON_LIBRARY torch_python + PATHS "${TORCH_INSTALL_PREFIX}/lib") + + set(_dep_libs + ${TORCH_PYTHON_LIBRARY} + bundled_program + etdump + executorch + extension_data_loader + portable_ops_lib + util + torch) + if(EXECUTORCH_BUILD_COREML) - set(PYBIND_LINK_COREML "coremldelegate") + list(APPEND _dep_libs coremldelegate) endif() if(EXECUTORCH_BUILD_MPS) - set(PYBIND_LINK_MPS "mpsdelegate") + list(APPEND _dep_libs mpsdelegate) endif() if(EXECUTORCH_BUILD_XNNPACK) - # need to explicitly specify XNNPACK here - # otherwise uses XNNPACK symbols from libtorch_cpu - set(PYBIND_LINK_XNNPACK xnnpack_backend XNNPACK) + # need to explicitly specify XNNPACK here otherwise uses XNNPACK symbols + # from libtorch_cpu + list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - # find pytorch lib, to allow pybind to take at::Tensor as input/output - find_package(Torch CONFIG REQUIRED) - find_library(TORCH_PYTHON_LIBRARY torch_python - PATHS "${TORCH_INSTALL_PREFIX}/lib") + if(EXECUTORCH_BUILD_CUSTOM) + list(APPEND _dep_libs custom_ops_lib) + endif() # compile options for pybind @@ -548,19 +567,7 @@ if(EXECUTORCH_BUILD_PYBIND) PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=portable_lib) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) - target_link_libraries( - portable_lib - PUBLIC ${TORCH_PYTHON_LIBRARY} - bundled_program - etdump - executorch - extension_data_loader - portable_ops_lib - util - torch - ${PYBIND_LINK_COREML} - ${PYBIND_LINK_MPS} - ${PYBIND_LINK_XNNPACK}) + target_link_libraries(portable_lib PUBLIC ${_dep_libs}) install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings) diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 8bdba698645..bd094b12a01 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_CUSTOM=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index ea4096074ee..13c6576a998 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -18,8 +18,23 @@ cmake_minimum_required(VERSION 3.19) project(llama_runner) +# Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF) +include(CMakeDependentOption) +# +# pthreadpool: build pthreadpool library. 
diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt
index ea4096074ee..13c6576a998 100644
--- a/examples/models/llama2/CMakeLists.txt
+++ b/examples/models/llama2/CMakeLists.txt
@@ -18,8 +18,23 @@ cmake_minimum_required(VERSION 3.19)
 project(llama_runner)
 
+# Duplicating options as root CMakeLists.txt
 option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)
 
+include(CMakeDependentOption)
+#
+# pthreadpool: build pthreadpool library. Disable on unsupported platforms
+#
+cmake_dependent_option(
+  EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
+  "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
+#
+# cpuinfo: build cpuinfo library. Disable on unsupported platforms
+#
+cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON
+                       "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
+
+
 if(NOT PYTHON_EXECUTABLE)
   set(PYTHON_EXECUTABLE python3)
 endif()
@@ -49,22 +64,16 @@ set(_common_compile_options -Wno-deprecated-declarations -fPIC)
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 
-# For some reason android build is not able to find where gflags is
-# and hence cannot find corresponding .cmake file
+# For some reason android build is not able to find where gflags is and hence
+# cannot find corresponding .cmake file
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
 find_package(gflags REQUIRED)
 
 #
 # llama_main: test binary to run llama, with tokenizer and sampler integrated
 #
-add_executable(llama_main main.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp)
-if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
-  target_link_options(llama_main PRIVATE "LINKER:--gc-sections")
-endif()
 
-# find `executorch` libraries
-# Same as for gflags
+# find `executorch` libraries Same as for gflags
 set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
 find_package(executorch CONFIG REQUIRED)
 if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
@@ -72,32 +81,67 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
 endif()
 
 # custom ops library
-add_subdirectory(custom_ops)
+if(EXECUTORCH_BUILD_CUSTOM)
+  add_subdirectory(custom_ops)
+endif()
 
 # llama_runner library
 add_subdirectory(runner)
 
-target_include_directories(llama_main PUBLIC
-${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include)
-target_include_directories(llama_main PUBLIC
-${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include)
-
 set(link_libraries)
+set(_srcs main.cpp)
 
 if(EXECUTORCH_BUILD_OPTIMIZED)
-  list(APPEND link_libraries optimized_native_cpu_ops_lib optimized_kernels
-    portable_kernels cpublas eigen_blas)
+  list(
+    APPEND
+    link_libraries
+    optimized_native_cpu_ops_lib
+    optimized_kernels
+    portable_kernels
+    cpublas
+    eigen_blas)
   target_link_options_shared_lib(optimized_native_cpu_ops_lib)
 else()
   list(APPEND link_libraries portable_ops_lib portable_kernels)
   target_link_options_shared_lib(portable_ops_lib)
 endif()
 
-target_link_libraries(llama_main PUBLIC gflags llama_runner custom_ops_lib)
+if(EXECUTORCH_BUILD_CUSTOM)
+  target_link_options_shared_lib(custom_ops_lib)
+  list(APPEND link_libraries custom_ops_lib)
+endif()
+
+# Extra compile option and include dir for pthreadpool
+if(EXECUTORCH_BUILD_PTHREADPOOL)
+  list(APPEND _common_compile_options -DET_USE_THREADPOOL)
+  list(APPEND link_libraries pthreadpool)
+  list(
+    APPEND
+    _srcs
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/threadpool.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/threadpool_guard.cpp
+  )
+  list(APPEND _common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include)
+endif()
+
+# Extra sources for cpuinfo
+if(EXECUTORCH_BUILD_CPUINFO)
+  list(APPEND link_libraries cpuinfo)
+  list(
+    APPEND
+    _srcs
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp
+  )
+  list(
+    APPEND
+    _common_include_directories
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include
+  )
+endif()
 
-# XNNPACK pthreadpool cpuinfo
+# XNNPACK
 if(TARGET xnnpack_backend)
-  set(xnnpack_backend_libs xnnpack_backend XNNPACK pthreadpool cpuinfo)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK)
   list(APPEND link_libraries ${xnnpack_backend_libs})
   target_link_options_shared_lib(xnnpack_backend)
 endif()
@@ -114,15 +158,19 @@ if(TARGET qnn_executorch_backend)
   target_link_options_shared_lib(qnn_executorch_backend)
 endif()
 
-# This one is needed for cpuinfo where it uses android
-# specific log lib
+# This one is needed for cpuinfo where it uses android specific log lib
 if(ANDROID)
   list(APPEND link_libraries log)
 endif()
 
-target_compile_options(llama_main PUBLIC ${_common_compile_options}
-                       -DET_USE_THREADPOOL)
-target_link_libraries(llama_main PUBLIC ${link_libraries})
+add_executable(llama_main ${_srcs})
+if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
+  target_link_options(llama_main PRIVATE "LINKER:--gc-sections")
+endif()
+
+target_include_directories(llama_main PUBLIC ${_common_include_directories})
+target_link_libraries(llama_main PUBLIC gflags llama_runner ${link_libraries})
+target_compile_options(llama_main PUBLIC ${_common_compile_options})
 
 if(APPLE)
   target_link_options_shared_lib(executorch)
diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt
index 8e9190eb4c1..81a80dab9c5 100644
--- a/examples/models/llama2/runner/CMakeLists.txt
+++ b/examples/models/llama2/runner/CMakeLists.txt
@@ -47,8 +47,7 @@ else()
   add_library(llama_runner SHARED ${_llama_runner__srcs})
 endif()
 
-set(llama_runner_deps executorch extension_module extension_data_loader
-    custom_ops)
+set(llama_runner_deps executorch extension_module extension_data_loader)
 
 target_link_libraries(
   llama_runner PUBLIC ${llama_runner_deps})
diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h
index 976549af8db..92d19c04843 100644
--- a/extension/aten_util/make_aten_functor_from_et_functor.h
+++ b/extension/aten_util/make_aten_functor_from_et_functor.h
@@ -149,8 +149,7 @@ struct type_convert<
     }
     c10::ScalarType scalar_type =
         static_cast<c10::ScalarType>(val.scalar_type());
-    converted =
-        at::from_blob(val.mutable_data_ptr(), val.numel(), sizes, scalar_type);
+    converted = at::from_blob(val.mutable_data_ptr(), sizes, scalar_type);
   }
   ATensor call() {
     return converted;
diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl
index b396cb78325..6e325830292 100644
--- a/extension/aten_util/targets.bzl
+++ b/extension/aten_util/targets.bzl
@@ -27,6 +27,7 @@ def define_common_targets():
         ],
         exported_deps = [
             "//executorch/extension/kernel_util:kernel_util",
+            "//executorch/extension/runner_util:managed_tensor",
             "//executorch/runtime/core:core",
             "//executorch/runtime/core:evalue",
             "//executorch/runtime/core/exec_aten:lib",
diff --git a/extension/kernel_util/meta_programming.h b/extension/kernel_util/meta_programming.h
index 46262b843ea..c412e907ea0 100644
--- a/extension/kernel_util/meta_programming.h
+++ b/extension/kernel_util/meta_programming.h
@@ -49,7 +49,7 @@ struct is_compile_time_function_pointer<
     CompileTimeFunctionPointer<FuncType_, func_ptr_>> : std::true_type {};
 
 #define EXECUTORCH_FN_TYPE(func)                                      \
-  CompileTimeFunctionPointer<                                         \
+  ::torch::executor::CompileTimeFunctionPointer<                      \
       std::remove_pointer_t<std::remove_reference_t<decltype(func)>>, \
       func>
 #define EXECUTORCH_FN(func) EXECUTORCH_FN_TYPE(func)()
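
For reference, a minimal sketch (plain shell, not part of the patch; assumes it is run from the repository root with a working toolchain, and omits the BUCK2 and PYTHON_EXECUTABLE flags) of building with the new custom-ops option, mirroring cmake_install_executorch_libraries and cmake_build_llama_runner in .ci/scripts/test_llama.sh:

# Core libraries; EXECUTORCH_BUILD_CUSTOM=ON also forces EXECUTORCH_BUILD_OPTIMIZED=ON
# per the root CMakeLists.txt change above.
cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
      -DCMAKE_BUILD_TYPE=Debug \
      -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
      -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
      -DEXECUTORCH_BUILD_CUSTOM=ON \
      -DEXECUTORCH_BUILD_OPTIMIZED=ON \
      -DEXECUTORCH_BUILD_XNNPACK=ON \
      -Bcmake-out .
cmake --build cmake-out -j9 --target install --config Debug

# llama runner, passing the same options that examples/models/llama2/CMakeLists.txt now duplicates.
cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
      -DCMAKE_BUILD_TYPE=Debug \
      -DEXECUTORCH_BUILD_CUSTOM=ON \
      -DEXECUTORCH_BUILD_OPTIMIZED=ON \
      -DEXECUTORCH_BUILD_XNNPACK=ON \
      -Bcmake-out/examples/models/llama2 \
      examples/models/llama2
cmake --build cmake-out/examples/models/llama2 -j9 --config Debug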