diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 90ea13281ba..47394cbdba0 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -37,6 +37,18 @@ if [[ -z "${MODE:-}" ]]; then
   exit 1
 fi
 
+if [[ "${MODE}" =~ xnnpack.* ]]; then
+  XNNPACK=ON
+else
+  XNNPACK=OFF
+fi
+
+if [[ "${MODE}" =~ .*custom.* ]]; then
+  CUSTOM=ON
+else
+  CUSTOM=OFF
+fi
+
 if [[ -z "${BUCK:-}" ]]; then
   BUCK=buck2
 fi
@@ -47,25 +59,20 @@ fi
 
 which "${PYTHON_EXECUTABLE}"
 
-
 cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
-  if [[ "${MODE}" == "xnnpack" ]]; then
-    XNNPACK=ON
-  else
-    XNNPACK=OFF
-  fi
   retry cmake -DBUCK2="$BUCK" \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=Debug \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
-  cmake --build cmake-out -j9 --target install --config Release
+  cmake --build cmake-out -j9 --target install --config Debug
 }
 
 cmake_build_llama_runner() {
@@ -73,12 +80,15 @@ cmake_build_llama_runner() {
   dir="examples/models/llama2"
   retry cmake -DBUCK2="$BUCK" \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
+    -DEXECUTORCH_BUILD_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
-  cmake --build cmake-out/${dir} -j9 --config Release
+  cmake --build cmake-out/${dir} -j9 --config Debug
 }
 
 
@@ -117,9 +127,10 @@ fi
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
 EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
-if [[ "${MODE}" == "xnnpack" ]]; then
+if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
 fi
+# Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
 
 # Create tokenizer.bin.
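
A minimal sketch (standalone bash, not part of the patch) of how the regex-based MODE handling added above resolves the two build flags; CI only passes "portable" and "xnnpack+kv+custom", the other mode strings in the loop are illustrative:

for MODE in portable xnnpack xnnpack+custom xnnpack+kv+custom; do
  # Same tests as in test_llama.sh: anything starting with "xnnpack" enables the
  # XNNPACK backend, anything containing "custom" enables the custom ops build.
  if [[ "${MODE}" =~ xnnpack.* ]]; then XNNPACK=ON; else XNNPACK=OFF; fi
  if [[ "${MODE}" =~ .*custom.* ]]; then CUSTOM=ON; else CUSTOM=OFF; fi
  echo "MODE=${MODE} -> XNNPACK=${XNNPACK}, CUSTOM=${CUSTOM}"
done
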
ON + "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) # # cpuinfo: build cpuinfo library. Disable on unsupported platforms @@ -184,6 +185,10 @@ cmake_dependent_option(EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF) +if(EXECUTORCH_BUILD_CUSTOM) + set(EXECUTORCH_BUILD_OPTIMIZED ON) +endif() + if(EXECUTORCH_BUILD_CPUINFO) # --- cpuinfo set(CPUINFO_SOURCE_DIR "backends/xnnpack/third-party/cpuinfo") @@ -508,24 +513,38 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) endif() + # find pytorch lib, to allow pybind to take at::Tensor as input/output + find_package(Torch CONFIG REQUIRED) + find_library(TORCH_PYTHON_LIBRARY torch_python + PATHS "${TORCH_INSTALL_PREFIX}/lib") + + set(_dep_libs + ${TORCH_PYTHON_LIBRARY} + bundled_program + etdump + executorch + extension_data_loader + portable_ops_lib + util + torch) + if(EXECUTORCH_BUILD_COREML) - set(PYBIND_LINK_COREML "coremldelegate") + list(APPEND _dep_libs coremldelegate) endif() if(EXECUTORCH_BUILD_MPS) - set(PYBIND_LINK_MPS "mpsdelegate") + list(APPEND _dep_libs mpsdelegate) endif() if(EXECUTORCH_BUILD_XNNPACK) - # need to explicitly specify XNNPACK here - # otherwise uses XNNPACK symbols from libtorch_cpu - set(PYBIND_LINK_XNNPACK xnnpack_backend XNNPACK) + # need to explicitly specify XNNPACK here otherwise uses XNNPACK symbols + # from libtorch_cpu + list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - # find pytorch lib, to allow pybind to take at::Tensor as input/output - find_package(Torch CONFIG REQUIRED) - find_library(TORCH_PYTHON_LIBRARY torch_python - PATHS "${TORCH_INSTALL_PREFIX}/lib") + if(EXECUTORCH_BUILD_CUSTOM) + list(APPEND _dep_libs custom_ops_lib) + endif() # compile options for pybind @@ -548,19 +567,7 @@ if(EXECUTORCH_BUILD_PYBIND) PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=portable_lib) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) - target_link_libraries( - portable_lib - PUBLIC ${TORCH_PYTHON_LIBRARY} - bundled_program - etdump - executorch - extension_data_loader - portable_ops_lib - util - torch - ${PYBIND_LINK_COREML} - ${PYBIND_LINK_MPS} - ${PYBIND_LINK_XNNPACK}) + target_link_libraries(portable_lib PUBLIC ${_dep_libs}) install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings) diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 8bdba698645..bd094b12a01 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_CUSTOM=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt index ea4096074ee..13c6576a998 100644 --- a/examples/models/llama2/CMakeLists.txt +++ b/examples/models/llama2/CMakeLists.txt @@ -18,8 +18,23 @@ cmake_minimum_required(VERSION 3.19) project(llama_runner) +# Duplicating options as root CMakeLists.txt option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF) +include(CMakeDependentOption) +# +# pthreadpool: build pthreadpool library. 
diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt
index ea4096074ee..13c6576a998 100644
--- a/examples/models/llama2/CMakeLists.txt
+++ b/examples/models/llama2/CMakeLists.txt
@@ -18,8 +18,23 @@ cmake_minimum_required(VERSION 3.19)
 project(llama_runner)
 
+# Duplicating options as root CMakeLists.txt
 option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)
 
+include(CMakeDependentOption)
+#
+# pthreadpool: build pthreadpool library. Disable on unsupported platforms
+#
+cmake_dependent_option(
+  EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
+  "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
+#
+# cpuinfo: build cpuinfo library. Disable on unsupported platforms
+#
+cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON
+                       "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
+
+
 if(NOT PYTHON_EXECUTABLE)
   set(PYTHON_EXECUTABLE python3)
 endif()
@@ -49,22 +64,16 @@ set(_common_compile_options -Wno-deprecated-declarations -fPIC)
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 
-# For some reason android build is not able to find where gflags is
-# and hence cannot find corresponding .cmake file
+# For some reason android build is not able to find where gflags is and hence
+# cannot find corresponding .cmake file
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
 find_package(gflags REQUIRED)
 
 #
 # llama_main: test binary to run llama, with tokenizer and sampler integrated
 #
-add_executable(llama_main main.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp)
-if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
-  target_link_options(llama_main PRIVATE "LINKER:--gc-sections")
-endif()
 
-# find `executorch` libraries
-# Same as for gflags
+# find `executorch` libraries Same as for gflags
 set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
 find_package(executorch CONFIG REQUIRED)
 if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
@@ -72,32 +81,67 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
 endif()
 
 # custom ops library
-add_subdirectory(custom_ops)
+if(EXECUTORCH_BUILD_CUSTOM)
+  add_subdirectory(custom_ops)
+endif()
 
 # llama_runner library
 add_subdirectory(runner)
 
-target_include_directories(llama_main PUBLIC
-${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include)
-target_include_directories(llama_main PUBLIC
-${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include)
-
 set(link_libraries)
+set(_srcs main.cpp)
 
 if(EXECUTORCH_BUILD_OPTIMIZED)
-  list(APPEND link_libraries optimized_native_cpu_ops_lib optimized_kernels
-    portable_kernels cpublas eigen_blas)
+  list(
+    APPEND
+    link_libraries
+    optimized_native_cpu_ops_lib
+    optimized_kernels
+    portable_kernels
+    cpublas
+    eigen_blas)
   target_link_options_shared_lib(optimized_native_cpu_ops_lib)
 else()
   list(APPEND link_libraries portable_ops_lib portable_kernels)
   target_link_options_shared_lib(portable_ops_lib)
 endif()
 
-target_link_libraries(llama_main PUBLIC gflags llama_runner custom_ops_lib)
+if(EXECUTORCH_BUILD_CUSTOM)
+  target_link_options_shared_lib(custom_ops_lib)
+  list(APPEND link_libraries custom_ops_lib)
+endif()
+
+# Extra compile option and include dir for pthreadpool
+if(EXECUTORCH_BUILD_PTHREADPOOL)
+  list(APPEND _common_compile_options -DET_USE_THREADPOOL)
+  list(APPEND link_libraries pthreadpool)
+  list(
+    APPEND
+    _srcs
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/threadpool.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/threadpool_guard.cpp
+  )
+  list(APPEND _common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include)
+endif()
+
+# Extra sources for cpuinfo
+if(EXECUTORCH_BUILD_CPUINFO)
+  list(APPEND link_libraries cpuinfo)
+  list(
+    APPEND
+    _srcs
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp
+  )
+  list(
+    APPEND
+    _common_include_directories
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include
+  )
+endif()
 
-# XNNPACK pthreadpool cpuinfo
+# XNNPACK
 if(TARGET xnnpack_backend)
-  set(xnnpack_backend_libs xnnpack_backend XNNPACK pthreadpool cpuinfo)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK)
   list(APPEND link_libraries ${xnnpack_backend_libs})
   target_link_options_shared_lib(xnnpack_backend)
 endif()
@@ -114,15 +158,19 @@ if(TARGET qnn_executorch_backend)
   target_link_options_shared_lib(qnn_executorch_backend)
 endif()
 
-# This one is needed for cpuinfo where it uses android
-# specific log lib
+# This one is needed for cpuinfo where it uses android specific log lib
 if(ANDROID)
   list(APPEND link_libraries log)
 endif()
 
-target_compile_options(llama_main PUBLIC ${_common_compile_options}
-                       -DET_USE_THREADPOOL)
-target_link_libraries(llama_main PUBLIC ${link_libraries})
+add_executable(llama_main ${_srcs})
+if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
+  target_link_options(llama_main PRIVATE "LINKER:--gc-sections")
+endif()
+
+target_include_directories(llama_main PUBLIC ${_common_include_directories})
+target_link_libraries(llama_main PUBLIC gflags llama_runner ${link_libraries})
+target_compile_options(llama_main PUBLIC ${_common_compile_options})
 
 if(APPLE)
   target_link_options_shared_lib(executorch)
diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama2/runner/CMakeLists.txt
index 8e9190eb4c1..81a80dab9c5 100644
--- a/examples/models/llama2/runner/CMakeLists.txt
+++ b/examples/models/llama2/runner/CMakeLists.txt
@@ -47,8 +47,7 @@ else()
   add_library(llama_runner SHARED ${_llama_runner__srcs})
 endif()
 
-set(llama_runner_deps executorch extension_module extension_data_loader
-    custom_ops)
+set(llama_runner_deps executorch extension_module extension_data_loader)
 
 target_link_libraries(
   llama_runner PUBLIC ${llama_runner_deps})
diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h
index 976549af8db..92d19c04843 100644
--- a/extension/aten_util/make_aten_functor_from_et_functor.h
+++ b/extension/aten_util/make_aten_functor_from_et_functor.h
@@ -149,8 +149,7 @@ struct type_convert<
     }
     c10::ScalarType scalar_type =
         static_cast<c10::ScalarType>(val.scalar_type());
-    converted =
-        at::from_blob(val.mutable_data_ptr(), val.numel(), sizes, scalar_type);
+    converted = at::from_blob(val.mutable_data_ptr(), sizes, scalar_type);
   }
   ATensor call() {
     return converted;
diff --git a/extension/aten_util/targets.bzl b/extension/aten_util/targets.bzl
index b396cb78325..6e325830292 100644
--- a/extension/aten_util/targets.bzl
+++ b/extension/aten_util/targets.bzl
@@ -27,6 +27,7 @@ def define_common_targets():
         ],
         exported_deps = [
             "//executorch/extension/kernel_util:kernel_util",
+            "//executorch/extension/runner_util:managed_tensor",
             "//executorch/runtime/core:core",
             "//executorch/runtime/core:evalue",
             "//executorch/runtime/core/exec_aten:lib",
diff --git a/extension/kernel_util/meta_programming.h b/extension/kernel_util/meta_programming.h
index 46262b843ea..c412e907ea0 100644
--- a/extension/kernel_util/meta_programming.h
+++ b/extension/kernel_util/meta_programming.h
@@ -49,7 +49,7 @@ struct is_compile_time_function_pointer<
     CompileTimeFunctionPointer<FuncType_, func_ptr_>> : std::true_type {};
 
 #define EXECUTORCH_FN_TYPE(func)                                      \
-  CompileTimeFunctionPointer<                                         \
+  ::torch::executor::CompileTimeFunctionPointer<                      \
       std::remove_pointer_t<std::remove_reference_t<decltype(func)>>, \
       func>
 #define EXECUTORCH_FN(func) EXECUTORCH_FN_TYPE(func)()
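
For reference, a minimal sketch (plain shell, not part of the patch; assumes it is run from the repository root with a working toolchain, and omits the BUCK2 and PYTHON_EXECUTABLE flags) of building with the new custom-ops option, mirroring cmake_install_executorch_libraries and cmake_build_llama_runner in .ci/scripts/test_llama.sh:

# Core libraries; EXECUTORCH_BUILD_CUSTOM=ON also forces EXECUTORCH_BUILD_OPTIMIZED=ON
# per the root CMakeLists.txt change above.
cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
      -DCMAKE_BUILD_TYPE=Debug \
      -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
      -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
      -DEXECUTORCH_BUILD_CUSTOM=ON \
      -DEXECUTORCH_BUILD_OPTIMIZED=ON \
      -DEXECUTORCH_BUILD_XNNPACK=ON \
      -Bcmake-out .
cmake --build cmake-out -j9 --target install --config Debug

# llama runner, passing the same options that examples/models/llama2/CMakeLists.txt now duplicates.
cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
      -DCMAKE_BUILD_TYPE=Debug \
      -DEXECUTORCH_BUILD_CUSTOM=ON \
      -DEXECUTORCH_BUILD_OPTIMIZED=ON \
      -DEXECUTORCH_BUILD_XNNPACK=ON \
      -Bcmake-out/examples/models/llama2 \
      examples/models/llama2
cmake --build cmake-out/examples/models/llama2 -j9 --config Debug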