From afc1cae08b424e95d35e7ee917198e4ce79fcb49 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Mon, 12 May 2025 18:50:08 +0800 Subject: [PATCH 01/10] feat: virtual device memory Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- .clang-tidy | 3 + .../tensorrt_llm/runtime/virtualMemory.h | 541 ++++++ cpp/tensorrt_llm/common/cudaDriverWrapper.h | 17 + .../nanobind/runtime/bindings.cpp | 29 + cpp/tensorrt_llm/pybind/runtime/bindings.cpp | 29 + cpp/tensorrt_llm/runtime/CMakeLists.txt | 3 +- cpp/tensorrt_llm/runtime/bufferManager.cpp | 16 + cpp/tensorrt_llm/runtime/tllmBuffers.h | 80 +- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 416 +++++ cpp/tensorrt_llm/thop/CMakeLists.txt | 1 + .../thop/virtualMemoryAllocator.cpp | 60 + cpp/tests/unit_tests/runtime/CMakeLists.txt | 1 + .../unit_tests/runtime/virtualMemoryTest.cpp | 1572 +++++++++++++++++ tensorrt_llm/_torch/virtual_memory.py | 89 + tests/unittest/_torch/test_virtual_memory.py | 254 +++ 15 files changed, 3086 insertions(+), 25 deletions(-) create mode 100644 cpp/include/tensorrt_llm/runtime/virtualMemory.h create mode 100644 cpp/tensorrt_llm/runtime/virtualMemory.cpp create mode 100644 cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp create mode 100644 cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp create mode 100644 tensorrt_llm/_torch/virtual_memory.py create mode 100644 tests/unittest/_torch/test_virtual_memory.py diff --git a/.clang-tidy b/.clang-tidy index 8aac1d0b25e..33505432a60 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,5 +1,6 @@ Checks: '*, -altera-id-dependent-backward-branch, + -altera-struct-pack-align, -altera-unroll-loops, -boost-use-ranges, -cppcoreguidelines-avoid-do-while, @@ -9,8 +10,10 @@ Checks: '*, -fuchsia-default-arguments-calls, -fuchsia-default-arguments-declarations, -fuchsia-overloaded-operator, + -fuchsia-virtual-inheritance, -hicpp-vararg, -llvm-else-after-return, -llvmlibc-*, + -misc-include-cleaner, -misc-non-private-member-variables-in-classes, -modernize-use-trailing-return-type' diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h new file mode 100644 index 00000000000..39959ed988d --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/runtime/cudaEvent.h" +#include "tensorrt_llm/runtime/iBuffer.h" +#include "tensorrt_llm/runtime/memoryCounters.h" + +#include +#include +#include +#include +#include + +class VirtualMemoryManagerTest; + +namespace tensorrt_llm::runtime +{ + +/** + * CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation, + * providing the ability to release and rematerialize the allocation. 
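+ *
+ * A minimal lifecycle sketch (creator and configurators stand in for any concrete
+ * implementations, e.g. the LocalCreator and UnicastConfigurator declared below):
+ * @code
+ *   CUDAVirtualMemoryChunk vm{std::move(creator), std::move(configurators)};
+ *   vm.materialize();  // creates the handle, then runs each configurator's setup()
+ *   // ... use the memory ...
+ *   vm.release();      // runs teardown() in reverse order, then releases the handle
+ * @endcode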
+ */ +class CUDAVirtualMemoryChunk +{ +public: + /** + * CUDAVirtualMemoryChunk::Creator is the interface to obtain a CUmemGenericAllocationHandle, + * either by creating one locally, or importing one from remote. + */ + struct Creator + { + Creator() = default; + virtual ~Creator() = default; + Creator(Creator const&) = default; + Creator& operator=(Creator const&) = default; + Creator(Creator&&) = default; + Creator& operator=(Creator&&) = default; + + // Note: create() shall not leak resources when throwing exceptions. + // release() will only, and will always be called if create() success. + virtual CUmemGenericAllocationHandle create() = 0; + virtual void release(CUmemGenericAllocationHandle) = 0; + }; + + using CreatorPtr = std::unique_ptr; + + /** + * CUDAVirtualMemoryChunk::Configurator is the interface to configure a CUmemGenericAllocationHandle: + * - Map into virtual address + * - Bind to multicast object + * - Backup and restore memory content + */ + struct Configurator + { + Configurator() = default; + virtual ~Configurator() = default; + Configurator(Configurator const&) = default; + Configurator& operator=(Configurator const&) = default; + Configurator(Configurator&&) = default; + Configurator& operator=(Configurator&&) = default; + + // Note: setup() shall not leak resources when throwing exceptions. + // teardown() will only, and will always be called if setup() success. + virtual void setup(CUmemGenericAllocationHandle) = 0; + virtual void teardown(CUmemGenericAllocationHandle) = 0; + }; + + using ConfiguratorPtr = std::unique_ptr; + using Configurators = std::vector; + + enum Status + { + INVALID, // This is a default constructed invalid CUDAVirtualMemoryChunk. + RELEASED, // The memory represented by this CUDAVirtualMemoryChunk is not allocated. + MATERIALIZED, // The memory represented by this CUDAVirtualMemoryChunk is allocated. + ERRORED, // Error happened during materialize() or release(). + // This CUDAVirtualMemoryChunk cannot be used anymore. + }; + + [[nodiscard]] Status status() const noexcept + { + if (mCreator == nullptr) + { + return INVALID; + } + + if (mState == 0 && mHandle == 0) + { + return RELEASED; + } + + if (mState == mConfigurators.size() && mHandle != 0) + { + return MATERIALIZED; + } + + return ERRORED; + } + + /** + * Materialize this CUDAVirtualMemoryChunk. + * Shall be called only when status() == RELEASED. + * + * Calls creator.create(), and then configurator.setup() for each configurator in order. + * + * Stop at the first thrown exception and propagates it. + */ + void materialize(); + + /** + * Release this CUDAVirtualMemoryChunk. + * Shall be called only when status() == MATERIALIZED, or materialize() throws. + * Will be called automatically by destructor if necessary. + * + * Calls configurator.teardown() for each configurator that setup() succeed in materialize() in reversed order, + * and then creator.release(). + * + * Never stops early upon exception. The last thrown exception will be propagated, and others logged. 
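+     *
+     * A sketch of the intended pattern around a failing materialize(); the destructor
+     * would otherwise perform this rollback automatically:
+     * @code
+     *   try { vm.materialize(); }
+     *   catch (...) { vm.release(); throw; }  // tear down the partially configured state
+     * @endcode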
+ */ + void release(); + + CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk const&) = delete; + CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk const&) = delete; + + CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk&& other) noexcept + { + mCreator = std::move(other.mCreator); + mConfigurators = std::move(other.mConfigurators); + mHandle = other.mHandle; + mState = other.mState; + new (&other) CUDAVirtualMemoryChunk; // Put other into default constructed state + } + + CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk&& other) + { + this->~CUDAVirtualMemoryChunk(); // May throw if current virtual memory need release + new (this) CUDAVirtualMemoryChunk(std::move(other)); + return *this; + } + + CUDAVirtualMemoryChunk() noexcept = default; + + CUDAVirtualMemoryChunk(CreatorPtr&& creator, Configurators&& configurators) + : mCreator(std::move(creator)) + , mConfigurators(std::move(configurators)) + { + } + + virtual ~CUDAVirtualMemoryChunk() + { + // Calling release() is necessary if materialize() succeed or threw an exception. + // If release() is already called by the user, whether succeed or threw an exception, + // we shouldn't call release() again. + if (mHandle != 0 && mState != INVALID_STATE) + { + release(); + } + } + + /** + * Test if this CUDAVirtualMemoryChunk is managing a memory block. + */ + explicit operator bool() const noexcept + { + return mCreator != nullptr; + } + +private: + constexpr static size_t INVALID_STATE = static_cast(-1); + size_t mState = 0; + CUmemGenericAllocationHandle mHandle{}; + std::unique_ptr mCreator; + std::vector> mConfigurators; +}; + +/** + * LocalCreator creates memory allocation locally through cuMemCreate. + */ +template +struct LocalCreator : CUDAVirtualMemoryChunk::Creator +{ + LocalCreator(CUmemAllocationProp const& prop, size_t size) + : mProp(prop) + , mSize(size) + { + } + + CUmemGenericAllocationHandle create() override + { + CUmemGenericAllocationHandle handle{}; + TLLM_CU_CHECK(cuMemCreate(&handle, mSize, &mProp, 0)); + if constexpr (count) + { + MemoryCounters::getInstance().allocate( + mProp.location.type == CU_MEM_LOCATION_TYPE_DEVICE ? MemoryType::kGPU : MemoryType::kPINNED, mSize); + } + return handle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + TLLM_CU_CHECK_FREE_RESOURCE(cuMemRelease(handle)); + if constexpr (count) + { + MemoryCounters::getInstance().deallocate( + mProp.location.type == CU_MEM_LOCATION_TYPE_DEVICE ? MemoryType::kGPU : MemoryType::kPINNED, mSize); + } + } + + CUmemAllocationProp mProp{}; + size_t mSize{}; +}; + +/** + * UnicastConfigurator maps the allocation handle into the specified unicast address range. + */ +struct UnicastConfigurator : CUDAVirtualMemoryChunk::Configurator +{ + UnicastConfigurator(CUdeviceptr address, size_t size, CUmemAccessDesc const& desc) + : mAddress(address) + , mSize(size) + , mDesc(desc) + { + } + + void setup(CUmemGenericAllocationHandle handle) override + { + TLLM_CU_CHECK(cuMemMap(mAddress, mSize, 0, handle, 0)); + TLLM_CU_CHECK(cuMemSetAccess(mAddress, mSize, &mDesc, 1)); + } + + void teardown(CUmemGenericAllocationHandle) override + { + TLLM_CU_CHECK_FREE_RESOURCE(cuMemUnmap(mAddress, mSize)); + } + + CUdeviceptr mAddress; + size_t mSize; + CUmemAccessDesc mDesc; +}; + +/** + * MulticastConfigurator binds the allocation handle to the given multicast object and offset. 
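+ *
+ * A setup sketch; mcHandle, device and size are placeholders, and the multicast object
+ * itself is assumed to come from an earlier cuMulticastCreate call outside this class:
+ * @code
+ *   auto mc = std::make_unique<MulticastConfigurator>();
+ *   mc->mMulticast = mcHandle;
+ *   mc->mBindOffset = 0;
+ *   mc->mDevice = device;
+ *   mc->mSize = size;
+ * @endcode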
+ */ +struct MulticastConfigurator : CUDAVirtualMemoryChunk::Configurator +{ + void setup(CUmemGenericAllocationHandle handle) override + { + TLLM_CU_CHECK(cuMulticastBindMem(mMulticast, 0, handle, mBindOffset, mSize, 0)); + } + + void teardown(CUmemGenericAllocationHandle) override + { + TLLM_CU_CHECK_FREE_RESOURCE(cuMulticastUnbind(mMulticast, mDevice, 0, mSize)); + } + + CUmemGenericAllocationHandle mMulticast; + size_t mBindOffset; + CUdevice mDevice; + size_t mSize; +}; + +/** + * MemsetConfigurator fills the memory with given value. + */ +struct MemsetConfigurator : CUDAVirtualMemoryChunk::Configurator +{ + MemsetConfigurator(CUdeviceptr address, size_t size, uint8_t value, CUstream stream) + : mAddress(address) + , mSize(size) + , mStream(stream) + , mValue(value) + { + } + + void setup(CUmemGenericAllocationHandle) override + { + if (mFirstTime) + { + mFirstTime = false; + } + else + { + TLLM_CU_CHECK(cuMemsetD8Async(mAddress, mValue, mSize, mStream)); + } + } + + void teardown(CUmemGenericAllocationHandle) noexcept override {} + + CUdeviceptr mAddress; + size_t mSize; + CUstream mStream{}; + uint8_t mValue; + bool mFirstTime = true; +}; + +/** + * OffloadConfigurator offload the content of the allocation to the backup storage when teardown, + * and restore the content on the following setup. + */ +struct OffloadConfigurator : CUDAVirtualMemoryChunk::Configurator +{ + OffloadConfigurator(CUdeviceptr address, size_t size, MemoryType backType, CUstream stream, bool ondemand = false) + : mAddress(address) + , mSize(size) + , mBackType(backType) + , mStream(stream) + , mOndemand(ondemand) + { + } + + void setup(CUmemGenericAllocationHandle) override; + void teardown(CUmemGenericAllocationHandle) override; + + CUdeviceptr mAddress; + size_t mSize; + MemoryType mBackType; + CUstream mStream; + bool mOndemand; + + IBuffer::UniquePtr mBackedStorage; +}; + +class CudaVirtualMemoryManager +{ +public: + /** + * Add memory to be managed by this manager. + * @param handle Unique handle provided to reference this memory in `remove`. + * @param mark Mark the memory, so this memory can be targeted in `releaseWithMark` and `materializeWithMark`. + * @param memory The CUDAVirtualMemory object. + * + * The memory and internal state will remain valid if any exception is thrown. + */ + void add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk&& memory); + + /** + * Creates and adds memory to be managed by this manager. The created memory is automatically materialized. + * @param handle Unique handle provided to reference this memory in `remove`. + * @param mark Mark the memory, so this memory can be targeted in `releaseWithMark` and + * `materializeWithMark`. + * @param creator The creator for the memory. + * @param configurators The configurators for the memory. + * + * The internal state will remain valid if any exception is thrown. + */ + void add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk::CreatorPtr&& creator, + CUDAVirtualMemoryChunk::Configurators&& configurators); + + template + void add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk::CreatorPtr&& creator, + Configurators&&... configurators) + { + add(handle, mark, std::move(creator), {std::forward(configurators)...}); + } + + /** + * Remove the memory from the manager. + * @param handle The handle provided to `add`. + * @return The CUDAVirtualMemory object. If the handle is unknown, an empty CUDAVirtualMemory will be returned. 
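+     *
+     * Usage sketch; after remove() the caller owns the chunk:
+     * @code
+     *   auto vm = manager.remove(handle);
+     *   if (vm && vm.status() == CUDAVirtualMemoryChunk::MATERIALIZED)
+     *   {
+     *       vm.release();  // otherwise the chunk's destructor performs the release
+     *   }
+     * @endcode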
+ */ + CUDAVirtualMemoryChunk remove(uintptr_t handle) noexcept; + + /** + * Call release for CUDAVirtualMemoryChunk objects with a given mark. + * @param mark the mark to select target memories. + * @return Number of objects selected. + * + * This function will always call `CUDAVirtualMemoryChunk::release` on all selected objects. + * The last exception thrown by `CUDAVirtualMemoryChunk::release` will be rethrown, and others will be logged. + * + * If any CUDAVirtualMemoryChunk threw an exception during `release`, it will be removed from the manager. + * Call `retrieveBadHandles` to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception. + */ + size_t releaseWithMark(std::string const& mark); + + /** + * Call materialize for CUDAVirtualMemoryChunk objects with a given mark. + * @param mark the mark to select target memories. + * @return Number of objects selected. + * + * This function will stop at the first `CUDAVirtualMemoryChunk::materialize` that throws exception, + * and attempt to roll back previous successful `materialize` by calling `release`. + * The exception thrown by `CUDAVirtualMemoryChunk::materialize` will be rethrown, + * and any exception thrown by `release` will be logged. + * + * If any CUDAVirtualMemoryChunk threw an exception during `materialize` or `release`, it will be removed from the + * manager. Successfully roll backed CUDAVirtualMemoryChunk will not be removed. + * Call `retrieveBadHandles` to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception. + */ + size_t materializeWithMark(std::string const& mark); + + /** + * Retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception and reset the list. + * The returned list may not include all removed CUDAVirtualMemoryChunk handles if OOM happened. + * This method is only for diagnostic purpose, and should not be called concurrently with other methods. + * @return The handle list. + */ + std::vector retrieveBadHandles() noexcept; + +private: + CUDAVirtualMemoryChunk unsafeRemove(uintptr_t handle) noexcept; + void addBadHandle(uintptr_t handle) noexcept; + + struct Entry; + // Unordered map invalidates iterator upon rehash, so we can only use the ordered map. + using PointerMemoryMap = std::map; + using MarkEntryMap = std::multimap; + + struct Entry + { + CUDAVirtualMemoryChunk mMemory; + MarkEntryMap::iterator mEntryIt; + }; + + std::mutex mMutex; + PointerMemoryMap mMemories; + MarkEntryMap mEntries; + std::vector mBadHandles; + + friend VirtualMemoryManagerTest; +}; + +class CudaVirtualAddressManager +{ +public: + static CudaVirtualAddressManager& instance(); + + CUdeviceptr Reserve(size_t size, size_t alignment = 0, CUdeviceptr addr = {}) const; + void Free(CUdeviceptr ptr, size_t size) const; + +private: + explicit CudaVirtualAddressManager(size_t pageSize) + : mPageSize(pageSize) + { + } + + size_t mPageSize; +}; + +class CudaVirtualMemoryAllocator +{ + using CudaStreamPtr = std::shared_ptr; + using Pointer = void*; + +public: + enum RestoreMode + { + NONE, // The memory is not backed. Upon rematerialize, memory has uninitialized content. + MEMSET, // The memory is memset to zero upon rematerialize. + CPU, // The memory is backed by normal CPU memory. The content is restored upon rematerialize. + PINNED // The memory is backed by pinned CPU memory. The content is restored upon rematerialize. 
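+        // Note: NONE is cheapest across release/materialize cycles; CPU and PINNED trade
+        // host memory and copy time on the backing stream for content preservation.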
+ }; + + class Configuration + { + CudaVirtualMemoryManager& mManager; + std::string mMark; + CudaStreamPtr mBackStream; + std::size_t mPageSize; + RestoreMode mMode; + bool mBackground{}; + + friend class CudaVirtualMemoryAllocator; + + public: + /** + * CudaVirtualMemoryAllocator::Configuration + * @param manager Manager used to track and manage virtual memories + * @param mark The mark for allocated memories + * @param mode Backed storage mode + * @param backStream The CUDA stream used for restoring memory content + * Note: Virtual Address Allocation is not async. The stream is not used in allocation. + */ + Configuration(CudaVirtualMemoryManager& manager, std::string mark, RestoreMode mode, CudaStreamPtr backStream) + : mManager(manager) + , mMark(std::move(mark)) + , mBackStream(std::move(backStream)) + , mPageSize(getpagesize()) + , mMode(mode) + { + } + + // Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user. + static Configuration backgroundConfiguration; + + private: + Configuration(CudaVirtualMemoryManager& manager, std::string mark, RestoreMode mode, CudaStreamPtr backStream, + bool background) + : Configuration(manager, std::move(mark), mode, std::move(backStream)) + { + mBackground = background; + } + }; + + explicit CudaVirtualMemoryAllocator(std::shared_ptr config) + : mConfig(std::move(config)) + { + } + + // Tells if this is the background allocator. + explicit operator bool() const noexcept + { + return !mConfig->mBackground; + } + + void allocate(Pointer* ptr, std::size_t n, int device) const; + void deallocate(Pointer ptr, std::size_t n) const; + +private: + std::shared_ptr mConfig; +}; + +} // namespace tensorrt_llm::runtime + +namespace tensorrt_llm::runtime +{ +CudaVirtualMemoryManager& getVirtualMemoryManager(); +CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator(); +void pushVirtualMemoryAllocator( + std::string const& mark, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream); +void popVirtualMemoryAllocator(); + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h index 51ec498d92c..396071bb314 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h @@ -155,6 +155,16 @@ void checkDriver( } } +template +void checkDriverExitSafe(T result, char const* const func, char const* const file, int const line) +{ + if (result) + { + throw TllmException( + file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %d.", func, result).c_str()); + } +} + } // namespace tensorrt_llm::common /* @@ -167,4 +177,11 @@ void checkDriver( (stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__); \ } while (0) +// Avoid using CUDADriverWrapper when freeing resource, during which the global instance may already be freed. 
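+// checkDriverExitSafe formats the raw CUresult value instead of querying the driver for
+// an error string, so it stays usable while the process is shutting down.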
+#define TLLM_CU_CHECK_FREE_RESOURCE(stat) \ + do \ + { \ + tensorrt_llm::common::checkDriverExitSafe((stat), #stat, __FILE__, __LINE__); \ + } while (0) + #endif // CUDA_DRIVER_WRAPPER_H diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp index f3be85bbbf2..ce0b4ae2efa 100644 --- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp @@ -40,6 +40,7 @@ #include "tensorrt_llm/runtime/speculativeDecodingMode.h" #include "tensorrt_llm/runtime/tllmRuntime.h" #include "tensorrt_llm/runtime/torchView.h" +#include "tensorrt_llm/runtime/virtualMemory.h" #include #include @@ -117,6 +118,10 @@ void initBindings(nb::module_& m) .def_rw("scaling_vec_pointer", &tr::LoraCache::TaskLayerModuleConfig::scalingVecPointer) .def(nb::self == nb::self); + nb::class_(m, "CudaVirtualMemoryManager") + .def("release_with_mark", &tr::CudaVirtualMemoryManager::releaseWithMark, nb::arg("mark")) + .def("materialize_with_mark", &tr::CudaVirtualMemoryManager::materializeWithMark, nb::arg("mark")); + nb::class_(m, "BufferManager") .def(nb::init(), nb::arg("stream"), nb::arg("trim_pool") = false) .def_prop_ro("stream", &tr::BufferManager::getStream); @@ -332,6 +337,30 @@ void initBindings(nb::module_& m) [](int32_t tp_size) { return tensorrt_llm::kernels::max_workspace_size_lowprecision(tp_size); }, "Calculate the maximum workspace size needed for low precision all-reduce operations"); + nb::enum_(m, "CudaVirtualMemoryAllocatorRestoreMode") + .value("NONE", tr::CudaVirtualMemoryAllocator::RestoreMode::NONE) + .value("CPU", tr::CudaVirtualMemoryAllocator::RestoreMode::CPU) + .value("PINNED", tr::CudaVirtualMemoryAllocator::RestoreMode::PINNED) + .value("MEMSET", tr::CudaVirtualMemoryAllocator::RestoreMode::MEMSET); + + m.def("get_virtual_memory_manager", &tr::getVirtualMemoryManager, "Get the virtual memory manager", + nb::rv_policy::reference); + + m.def( + "push_virtual_memory_allocator", + [](std::string const& mark, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) + { + tr::pushVirtualMemoryAllocator(mark, mode, + std::make_shared( + reinterpret_cast(stream), tensorrt_llm::common::getDevice(), false)); + }, + "Create a new virtual address allocator. Push it into the allocator stack to be used for subsequent " + "allocations"); + + m.def("pop_virtual_memory_allocator", &tr::popVirtualMemoryAllocator, + "Pop the current virtual address allocator from the allocator stack. 
Restore the active allocator for " + "subsequent allocations to be the previous one"); + nb::class_(m, "McastGPUBuffer") .def(nb::init()) .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer) diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp index c4adb08567b..b26ac7c2b08 100644 --- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp @@ -39,6 +39,7 @@ #include "tensorrt_llm/runtime/speculativeDecodingMode.h" #include "tensorrt_llm/runtime/tllmRuntime.h" #include "tensorrt_llm/runtime/torchView.h" +#include "tensorrt_llm/runtime/virtualMemory.h" #include #include @@ -214,6 +215,10 @@ void initBindings(pybind11::module_& m) .def_readwrite("scaling_vec_pointer", &tr::LoraCache::TaskLayerModuleConfig::scalingVecPointer) .def(py::self == py::self); + py::class_(m, "CudaVirtualMemoryManager") + .def("release_with_mark", &tr::CudaVirtualMemoryManager::releaseWithMark, py::arg("mark")) + .def("materialize_with_mark", &tr::CudaVirtualMemoryManager::materializeWithMark, py::arg("mark")); + py::classh(m, "BufferManager") .def(py::init(), py::arg("stream"), py::arg("trim_pool") = false) .def_property_readonly("stream", &tr::BufferManager::getStream); @@ -425,6 +430,30 @@ void initBindings(pybind11::module_& m) [](int32_t tp_size) { return tensorrt_llm::kernels::max_workspace_size_lowprecision(tp_size); }, "Calculate the maximum workspace size needed for low precision all-reduce operations"); + py::enum_(m, "CudaVirtualMemoryAllocatorRestoreMode") + .value("NONE", tr::CudaVirtualMemoryAllocator::RestoreMode::NONE) + .value("CPU", tr::CudaVirtualMemoryAllocator::RestoreMode::CPU) + .value("PINNED", tr::CudaVirtualMemoryAllocator::RestoreMode::PINNED) + .value("MEMSET", tr::CudaVirtualMemoryAllocator::RestoreMode::MEMSET); + + m.def("get_virtual_memory_manager", &tr::getVirtualMemoryManager, "Get the virtual memory manager", + py::return_value_policy::reference); + + m.def( + "push_virtual_memory_allocator", + [](std::string const& mark, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) + { + tr::pushVirtualMemoryAllocator(mark, mode, + std::make_shared( + reinterpret_cast(stream), tensorrt_llm::common::getDevice(), false)); + }, + "Create a new virtual address allocator. Push it into the allocator stack to be used for subsequent " + "allocations"); + + m.def("pop_virtual_memory_allocator", &tr::popVirtualMemoryAllocator, + "Pop the current virtual address allocator from the allocator stack. 
Restore the active allocator for " + "subsequent allocations to be the previous one"); + py::class_(m, "McastGPUBuffer") .def(py::init()) .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer) diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index 79b8ea7d784..3ed61445d60 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -55,7 +55,8 @@ set(SRCS tllmStreamReaders.cpp tllmLogger.cpp workerPool.cpp - worldConfig.cpp) + worldConfig.cpp + virtualMemory.cpp) include_directories(${API_INCLUDE_DIR}/tensorrt_llm/runtime) diff --git a/cpp/tensorrt_llm/runtime/bufferManager.cpp b/cpp/tensorrt_llm/runtime/bufferManager.cpp index 13e904b9e1e..40e8ce58e5d 100644 --- a/cpp/tensorrt_llm/runtime/bufferManager.cpp +++ b/cpp/tensorrt_llm/runtime/bufferManager.cpp @@ -39,6 +39,10 @@ BufferManager::BufferManager(CudaStreamPtr stream, bool trimPool) BufferManager::IBufferPtr BufferManager::gpu(std::size_t size, nvinfer1::DataType type) const { + if (auto& vmAllocator = getVirtualMemoryAllocator()) + { + return std::make_unique(size, type, vmAllocator); + } if (static_cast(mPool)) { return std::make_unique(size, type, CudaAllocatorAsync{mStream, mPool}); @@ -49,6 +53,10 @@ BufferManager::IBufferPtr BufferManager::gpu(std::size_t size, nvinfer1::DataTyp BufferManager::ITensorPtr BufferManager::gpu(nvinfer1::Dims dims, nvinfer1::DataType type) const { + if (auto& vmAllocator = getVirtualMemoryAllocator()) + { + return std::make_unique(dims, type, vmAllocator); + } if (static_cast(mPool)) { return std::make_unique(dims, type, CudaAllocatorAsync{mStream, mPool}); @@ -59,11 +67,19 @@ BufferManager::ITensorPtr BufferManager::gpu(nvinfer1::Dims dims, nvinfer1::Data BufferManager::IBufferPtr BufferManager::gpuSync(std::size_t size, nvinfer1::DataType type) { + if (auto& vmAllocator = getVirtualMemoryAllocator()) + { + return std::make_unique(size, type, vmAllocator); + } return std::make_unique(size, type, CudaAllocator{}); } BufferManager::ITensorPtr BufferManager::gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type) { + if (auto& vmAllocator = getVirtualMemoryAllocator()) + { + return std::make_unique(dims, type, vmAllocator); + } return std::make_unique(dims, type, CudaAllocator{}); } diff --git a/cpp/tensorrt_llm/runtime/tllmBuffers.h b/cpp/tensorrt_llm/runtime/tllmBuffers.h index 38263bb5aa8..faed36537e5 100644 --- a/cpp/tensorrt_llm/runtime/tllmBuffers.h +++ b/cpp/tensorrt_llm/runtime/tllmBuffers.h @@ -25,6 +25,7 @@ #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/ipcNvlsMemory.h" #include "tensorrt_llm/runtime/memoryCounters.h" +#include "tensorrt_llm/runtime/virtualMemory.h" #include #include @@ -500,6 +501,36 @@ class PoolAllocator : public BaseAllocator, TAllocator using PinnedPoolAllocator = PoolAllocator; +class CudaVirtualMemoryAllocatorAdaptor + : public BaseAllocator, + CudaVirtualMemoryAllocator +{ + // Update to MemoryCounters is done in Creator to more precisely reflect the memory usage. 
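+    // LocalCreator only counts physical memory while the chunk is materialized, so a
+    // released-but-still-reserved buffer does not show up as allocated.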
+ using Base = BaseAllocator; + friend Base; + +public: + // No explicit, to allow implicit conversion from CudaVirtualMemoryAllocator + CudaVirtualMemoryAllocatorAdaptor(CudaVirtualMemoryAllocator const& allocator) + : CudaVirtualMemoryAllocator(allocator) + { + } + + using Base::allocate; + using Base::deallocate; + +protected: + void allocateImpl(PointerType* ptr, std::size_t n) const + { + this->CudaVirtualMemoryAllocator::allocate(ptr, n, tensorrt_llm::common::getDevice()); + } + + void deallocateImpl(PointerType ptr, std::size_t n) const + { + this->CudaVirtualMemoryAllocator::deallocate(ptr, n); + } +}; + // Adopted from https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/common/buffers.h //! @@ -508,17 +539,10 @@ using PinnedPoolAllocator = PoolAllocator; //! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation, //! deallocation, querying of buffers on both the device and the host. //! It can handle data of arbitrary types because it stores byte buffers. -//! The template parameters AllocFunc and FreeFunc are used for the -//! allocation and deallocation of the buffer. -//! AllocFunc must be a functor that takes in (void** ptr, size_t size) -//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored. -//! size is the amount of memory in bytes to allocate. -//! The boolean indicates whether or not the memory allocation was successful. -//! FreeFunc must be a functor that takes in (void* ptr) and returns void. -//! ptr is the allocated buffer address. It must work with nullptr input. +//! The template parameter TAllocator must inherit from BaseAllocator. //! template -class GenericBuffer : virtual public IBuffer +class GenericBuffer : virtual public IBuffer, TAllocator // Inherit from TAllocator for EBO { public: using AllocatorType = TAllocator; @@ -527,20 +551,27 @@ class GenericBuffer : virtual public IBuffer //! \brief Construct an empty buffer. //! explicit GenericBuffer(nvinfer1::DataType type, TAllocator allocator = {}) // NOLINT(*-pro-type-member-init) - : GenericBuffer{0, type, std::move(allocator)} {}; + : GenericBuffer{0, type, std::move(allocator)} + { + } //! //! \brief Construct a buffer with the specified allocation size in number of elements. //! explicit GenericBuffer( // NOLINT(*-pro-type-member-init) std::size_t size, nvinfer1::DataType type, TAllocator allocator = {}) - : GenericBuffer{size, size, type, std::move(allocator)} {}; + : GenericBuffer{size, size, type, std::move(allocator)} + { + } + + GenericBuffer(GenericBuffer const& other) = delete; + GenericBuffer& operator=(GenericBuffer const& buf) = delete; GenericBuffer(GenericBuffer&& buf) noexcept - : mSize{buf.mSize} + : TAllocator(static_cast(buf)) + , mSize{buf.mSize} , mCapacity{buf.mCapacity} , mType{buf.mType} - , mAllocator{std::move(buf.mAllocator)} , mBuffer{buf.mBuffer} { buf.mSize = 0; @@ -552,11 +583,11 @@ class GenericBuffer : virtual public IBuffer { if (this != &buf) { - mAllocator.deallocate(mBuffer, toBytes(mCapacity)); + this->TAllocator::deallocate(mBuffer, toBytes(mCapacity)); mSize = buf.mSize; mCapacity = buf.mCapacity; mType = buf.mType; - mAllocator = std::move(buf.mAllocator); + *static_cast(this) = static_cast(buf); mBuffer = buf.mBuffer; // Reset buf. buf.mSize = 0; @@ -615,7 +646,7 @@ class GenericBuffer : virtual public IBuffer //! [[nodiscard]] MemoryType getMemoryType() const override { - return mAllocator.getMemoryType(); + return this->TAllocator::getMemoryType(); } //! 
@@ -625,8 +656,8 @@ class GenericBuffer : virtual public IBuffer { if (mCapacity < newSize) { - mAllocator.deallocate(mBuffer, toBytes(mCapacity)); - mBuffer = mAllocator.allocate(toBytes(newSize)); + this->TAllocator::deallocate(mBuffer, toBytes(mCapacity)); + mBuffer = this->TAllocator::allocate(toBytes(newSize)); mCapacity = newSize; } mSize = newSize; @@ -637,7 +668,7 @@ class GenericBuffer : virtual public IBuffer //! void release() override { - mAllocator.deallocate(mBuffer, toBytes(mCapacity)); + this->TAllocator::deallocate(mBuffer, toBytes(mCapacity)); mSize = 0; mCapacity = 0; mBuffer = nullptr; @@ -647,7 +678,7 @@ class GenericBuffer : virtual public IBuffer { try { - mAllocator.deallocate(mBuffer, toBytes(mCapacity)); + this->TAllocator::deallocate(mBuffer, toBytes(mCapacity)); } catch (std::exception const& e) { @@ -657,11 +688,11 @@ class GenericBuffer : virtual public IBuffer protected: explicit GenericBuffer(std::size_t size, std::size_t capacity, nvinfer1::DataType type, TAllocator allocator = {}) - : mSize{size} + : TAllocator{std::move(allocator)} + , mSize{size} , mCapacity{capacity} , mType{type} - , mAllocator{std::move(allocator)} - , mBuffer{capacity > 0 ? mAllocator.allocate(toBytes(capacity)) : nullptr} + , mBuffer{capacity > 0 ? this->TAllocator::allocate(toBytes(capacity)) : nullptr} { TLLM_CHECK(size <= capacity); TLLM_CHECK(capacity == 0 || size > 0); @@ -670,7 +701,6 @@ class GenericBuffer : virtual public IBuffer private: std::size_t mSize{0}, mCapacity{0}; nvinfer1::DataType mType; - TAllocator mAllocator; void* mBuffer; }; @@ -834,6 +864,7 @@ using HostBuffer = GenericBuffer; using PinnedBuffer = GenericBuffer; using PinnedPoolBuffer = GenericBuffer; using UVMBuffer = GenericBuffer; +using VirtualAddressDeviceBuffer = GenericBuffer; template std::make_unsigned_t nonNegative(T value) @@ -1069,5 +1100,6 @@ using HostTensor = GenericTensor; using PinnedTensor = GenericTensor; using PinnedPoolTensor = GenericTensor; using UVMTensor = GenericTensor; +using VirtualAddressDeviceTensor = GenericTensor; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp new file mode 100644 index 00000000000..945a65f3afa --- /dev/null +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/runtime/virtualMemory.h" +#include "bufferManager.h" + +#include + +namespace tensorrt_llm::runtime +{ + +namespace +{ + +template +struct ScopeGuard +{ + bool const& ok; + T t; + + ~ScopeGuard() noexcept(noexcept(t())) + { + if (!ok) + { + t(); + } + } +}; + +template +ScopeGuard(bool const&, T) -> ScopeGuard; + +} // namespace + +void CUDAVirtualMemoryChunk::materialize() +{ + TLLM_CHECK_WITH_INFO(status() == RELEASED, "virtual memory not in RELEASED status, is: %d", status()); + mHandle = mCreator->create(); + + // Track the number of configurators ran, so release can correctly teardown. + for (auto const& conf : mConfigurators) + { + conf->setup(mHandle); // May throw + ++mState; + } +} + +template +static bool safe_invoke_helper(std::exception_ptr& ep, char const* msg, Callable&& f, Args&&... args) noexcept +{ + try + { + std::invoke(std::forward(f), std::forward(args)...); + return true; + } + catch (...) + { + if (ep) + { + try + { + std::rethrow_exception(ep); + } + catch (std::exception& e) + { + TLLM_LOG_ERROR(msg, e.what()); + } + } + ep = std::current_exception(); + return false; + } +} + +void CUDAVirtualMemoryChunk::release() +{ + TLLM_CHECK_WITH_INFO(status() == MATERIALIZED || (status() == ERRORED && mState != INVALID_STATE), + "virtual memory is in status %d which cannot be released", status()); + size_t const count = mConfigurators.size(); + size_t const start = count - mState; + + // Revert materialize(). Only configurators that ran setup() successfully + // will have their teardown() been called. + // Never early returns on exceptions. The last exception will be rethrown, and + // previous ones will be logged. + std::exception_ptr ePtr{}; + auto const* msg = "Multiple exceptions thrown during release. The previous exception is: %s"; + for (size_t i = start; i < count; ++i) + { + safe_invoke_helper(ePtr, msg, &Configurator::teardown, mConfigurators[count - i - 1].get(), mHandle); + } + safe_invoke_helper(ePtr, msg, &Creator::release, mCreator.get(), mHandle); + mHandle = {}; + mState = 0; + + if (ePtr != nullptr) + { + mState = INVALID_STATE; + std::rethrow_exception(ePtr); + } +} + +void OffloadConfigurator::setup(CUmemGenericAllocationHandle) +{ + if (mBackedStorage != nullptr) + { + if (mOndemand) + { + TLLM_CU_CHECK(cuMemcpyHtoD_v2(mAddress, mBackedStorage->data(), mSize)); + mBackedStorage.reset(); + } + else + { + TLLM_CU_CHECK(cuMemcpyHtoDAsync_v2(mAddress, mBackedStorage->data(), mSize, mStream)); + } + } +} + +void OffloadConfigurator::teardown(CUmemGenericAllocationHandle) +{ + if (mBackedStorage == nullptr) + { + switch (mBackType) + { + case MemoryType::kCPU: mBackedStorage = BufferManager::cpu(mSize, nvinfer1::DataType::kINT8); break; + case MemoryType::kPINNED: mBackedStorage = BufferManager::pinned(mSize, nvinfer1::DataType::kINT8); break; + default: TLLM_THROW("Unknown memory type: %d", static_cast(mBackType)); + } + } + // We have to synchronize here, or the memory may be unmapped before the copy operation. 
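+    // (cuMemcpyDtoH is synchronous with respect to the host, unlike cuMemcpyDtoHAsync,
+    // so the copy has completed before any subsequent unmap can run.)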
+ TLLM_CU_CHECK(cuMemcpyDtoH_v2(mBackedStorage->data(), mAddress, mSize)); +} + +void CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk&& memory) +{ + bool success = false; + + TLLM_CHECK_WITH_INFO( + memory.status() == CUDAVirtualMemoryChunk::RELEASED || memory.status() == CUDAVirtualMemoryChunk::MATERIALIZED, + "CudaVirtualMemoryManager: bad virtual memory status"); + + std::unique_lock lock(mMutex); + auto [memIt, created] = mMemories.try_emplace(handle, Entry{}); + TLLM_CHECK_WITH_INFO( + created, "CudaVirtualMemoryManager: handle 0x%016zx already being used by another memory", handle); + ScopeGuard eraseMemIt{success, [&, memIt_ = memIt] { mMemories.erase(memIt_); }}; + + auto const entryIt = mEntries.emplace(std::move(mark), memIt); + entryIt->second->second.mEntryIt = entryIt; + + memIt->second.mMemory = std::move(memory); + success = true; +} + +void CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk::CreatorPtr&& creator, + CUDAVirtualMemoryChunk::Configurators&& configurators) +{ + std::unique_lock lock(mMutex); + bool success = false; + + auto [memIt, created] = mMemories.try_emplace(handle, + Entry{ + {std::move(creator), std::move(configurators)}, + }); + TLLM_CHECK_WITH_INFO( + created, "CudaVirtualMemoryManager: handle 0x%016zx already being used by another memory", handle); + ScopeGuard eraseMemIt{success, [&, memIt_ = memIt] { mMemories.erase(memIt_); }}; + + auto const entryIt = mEntries.emplace(std::move(mark), memIt); + memIt->second.mEntryIt = entryIt; + ScopeGuard eraseMarkIt{success, [&] { mEntries.erase(entryIt); }}; + + try + { + // Hopefully we don't need to hold the mutex guarding mMemories and mEntries anymore. + lock.unlock(); + memIt->second.mMemory.materialize(); + success = true; + } + catch (...) + { + // ...unless materialize() throws and we need to rollback. + lock.lock(); + throw; + } +} + +CUDAVirtualMemoryChunk CudaVirtualMemoryManager::remove(uintptr_t handle) noexcept +{ + std::unique_lock lock(mMutex); + + return unsafeRemove(handle); +} + +CUDAVirtualMemoryChunk CudaVirtualMemoryManager::unsafeRemove(uintptr_t handle) noexcept +{ + auto const nodeHandle = mMemories.extract(handle); + if (!nodeHandle) + { + return {}; + } + mEntries.erase(nodeHandle.mapped().mEntryIt); + + return std::move(nodeHandle.mapped().mMemory); +} + +void CudaVirtualMemoryManager::addBadHandle(uintptr_t handle) noexcept +{ + try + { + mBadHandles.push_back(handle); + } + catch (...) + { + } +} + +std::vector CudaVirtualMemoryManager::retrieveBadHandles() noexcept +{ + return std::move(mBadHandles); +} + +size_t CudaVirtualMemoryManager::releaseWithMark(std::string const& mark) +{ + std::unique_lock lock(mMutex); + + std::exception_ptr ePtr{}; + auto [begin, end] = mEntries.equal_range(mark); + size_t count = 0; + for (auto it = begin; it != end;) + { + auto const handle = it->second->first; + auto& memory = it->second->second.mMemory; + ++it; // element referenced by `it` will be invalidated by unsafeRemove(handle) + if (memory.status() == CUDAVirtualMemoryChunk::MATERIALIZED) + { + if (!safe_invoke_helper(ePtr, + "Multiple exceptions thrown during releaseWithMark. 
The previous exception is: %s", + &CUDAVirtualMemoryChunk::release, &memory)) + { + addBadHandle(handle); + unsafeRemove(handle); + } + ++count; + } + } + + if (ePtr != nullptr) + { + std::rethrow_exception(ePtr); + } + + return count; +} + +size_t CudaVirtualMemoryManager::materializeWithMark(std::string const& mark) +{ + std::unique_lock lock(mMutex); + + auto [begin, end] = mEntries.equal_range(mark); + size_t count = 0; + + auto it = begin; + + try + { + for (; it != end; ++it) + { + auto& memory = it->second->second.mMemory; + if (memory.status() == CUDAVirtualMemoryChunk::RELEASED) + { + memory.materialize(); + ++count; + } + } + } + catch (...) + { + for (auto itRollback = begin; itRollback != it;) + { + auto const handle = itRollback->second->first; + auto& memory = itRollback->second->second.mMemory; + ++itRollback; + try + { + memory.release(); + } + catch (std::exception& e) + { + addBadHandle(handle); + unsafeRemove(handle); + TLLM_LOG_ERROR("Additional exception thrown during rollback of materializeWithMark: %s", e.what()); + } + } + + addBadHandle(it->second->first); + unsafeRemove(it->second->first); + + throw; + } + return count; +} + +static_assert(sizeof(void*) == sizeof(CUdeviceptr)); + +static CUdeviceptr deviceptr_cast(void* ptr) +{ + CUdeviceptr ret{}; + std::memcpy(&ret, &ptr, sizeof(CUdeviceptr)); + return ret; +} + +static void* deviceptr_cast(CUdeviceptr ptr) +{ + void* ret{}; + std::memcpy(&ret, &ptr, sizeof(CUdeviceptr)); + return ret; +} + +void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const +{ + CUdeviceptr address{}; + std::size_t const pageAlignedSize = (n + mConfig->mPageSize - 1) & ~(mConfig->mPageSize - 1); + TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0)); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::make_unique(address, n, + CUmemAccessDesc{{ + CU_MEM_LOCATION_TYPE_DEVICE, + device, + }, + CU_MEM_ACCESS_FLAGS_PROT_READWRITE})); + + switch (mConfig->mMode) + { + case NONE: break; + case MEMSET: + configurators.push_back(std::make_unique(address, n, 0, mConfig->mBackStream->get())); + break; + case CPU: + configurators.push_back( + std::make_unique(address, n, MemoryType::kCPU, mConfig->mBackStream->get())); + break; + case PINNED: + configurators.push_back( + std::make_unique(address, n, MemoryType::kPINNED, mConfig->mBackStream->get())); + break; + } + + mConfig->mManager.add(address, mConfig->mMark, + std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE, + { + CU_MEM_LOCATION_TYPE_DEVICE, + device, + }}, + n), + std::move(configurators)); + + *ptr = deviceptr_cast(address); +} + +void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const +{ + auto const address = deviceptr_cast(ptr); + mConfig->mManager.remove(address); + + std::size_t const pageAlignedSize = (n + mConfig->mPageSize - 1) & ~(mConfig->mPageSize - 1); + TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize)); +} + +} // namespace tensorrt_llm::runtime + +namespace tensorrt_llm::runtime +{ + +CudaVirtualMemoryManager& getVirtualMemoryManager() +{ + static CudaVirtualMemoryManager manager; + return manager; +} + +using AllocConf = CudaVirtualMemoryAllocator::Configuration; + +AllocConf AllocConf::backgroundConfiguration{getVirtualMemoryManager(), "", NONE, nullptr, true}; +static std::forward_list vmAllocators{ + CudaVirtualMemoryAllocator{{std::shared_ptr{}, &AllocConf::backgroundConfiguration}}}; + 
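+// The allocator stack: the front entry is the active allocator. The bottom entry wraps
+// the background configuration, whose allocator converts to false so BufferManager can
+// fall back to the regular CUDA allocators when nothing has been pushed.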
+CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator() +{ + return vmAllocators.front(); +} + +void pushVirtualMemoryAllocator( + std::string const& mark, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream) +{ + vmAllocators.emplace_front(std::make_shared(getVirtualMemoryManager(), mark, mode, backStream)); +} + +void popVirtualMemoryAllocator() +{ + vmAllocators.pop_front(); +} + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 8e41e2a2886..a9d0d4009f9 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -85,6 +85,7 @@ add_library( selectiveScanOp.cpp userbuffersFinalizeOp.cpp userbuffersTensor.cpp + virtualMemoryAllocator.cpp weightOnlyQuantGemm.cpp weightOnlyQuantOp.cpp mtpOp.cpp diff --git a/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp b/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp new file mode 100644 index 00000000000..9904d985021 --- /dev/null +++ b/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/runtime/virtualMemory.h" +#include +#include + +extern "C" +{ + + void* tensorrt_llm_virtual_memory_alloc(ssize_t size, int device, cudaStream_t) noexcept + { + void* ptr{}; + try + { + tensorrt_llm::runtime::getVirtualMemoryAllocator().allocate(&ptr, size, device); + } + catch (std::exception const& e) + { + TLLM_LOG_EXCEPTION(e); + ptr = {}; + } + catch (...) + { + TLLM_LOG_ERROR("Unknown exception thrown allocating virtual memory"); + ptr = {}; + } + + return ptr; + } + + void tensorrt_llm_virtual_memory_free(void* ptr, ssize_t size, cudaStream_t) noexcept + { + try + { + tensorrt_llm::runtime::getVirtualMemoryAllocator().deallocate(ptr, size); + } + catch (std::exception const& e) + { + TLLM_LOG_EXCEPTION(e); + } + catch (...) + { + TLLM_LOG_ERROR("Unknown exception thrown deallocating virtual memory"); + } + } +} diff --git a/cpp/tests/unit_tests/runtime/CMakeLists.txt b/cpp/tests/unit_tests/runtime/CMakeLists.txt index 25ec2ab1b5e..77db1d9fd0e 100644 --- a/cpp/tests/unit_tests/runtime/CMakeLists.txt +++ b/cpp/tests/unit_tests/runtime/CMakeLists.txt @@ -28,6 +28,7 @@ add_gtest(tllmRuntimeTest tllmRuntimeTest.cpp) add_gtest(transposeKVKernelTest transposeKVKernelTest.cpp) add_gtest(userBufferTest userBufferTest.cpp) add_gtest(utilsTest utilsTest.cpp) +add_gtest(virtualMemoryTest virtualMemoryTest.cpp) add_gtest(workerPoolTest workerPoolTest.cpp) add_gtest(worldConfigTest worldConfigTest.cpp) diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp new file mode 100644 index 00000000000..eacbfe6ad3e --- /dev/null +++ b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp @@ -0,0 +1,1572 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/tllmBuffers.h" +#include "tensorrt_llm/runtime/virtualMemory.h" + +#include +#include +#include +#include + +using namespace tensorrt_llm::runtime; +namespace tc = tensorrt_llm::common; + +struct DummyException : std::runtime_error +{ + DummyException() + : runtime_error("dummy exception") + { + } +}; + +class VirtualMemoryTestBase : public ::testing::Test +{ +protected: + void SetUp() override + { + if (tc::getDeviceCount() == 0) + { + GTEST_SKIP() << "This test suite cannot run on systems with no devices."; + } + + TLLM_CU_CHECK(cuInit(0)); + + CUdevice dev; + TLLM_CU_CHECK(cuDeviceGet(&dev, 0)); + + CUcontext ctx; + TLLM_CU_CHECK(cuDevicePrimaryCtxRetain(&ctx, dev)); + TLLM_CU_CHECK(cuCtxSetCurrent(ctx)); + + // Initialize NVML + nvmlReturn_t nvmlResult = nvmlInit(); + TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", nvmlErrorString(nvmlResult)); + + if (!memoryInfoAvailable()) + { + TLLM_LOG_WARNING("Per process memory information unavailable."); + } + + TLLM_CUDA_CHECK(cudaDeviceSynchronize()); + } + + void TearDown() override + { + TLLM_CUDA_CHECK(cudaDeviceSynchronize()); + } + + static bool memoryInfoAvailable() + { + static bool available = [] + { + auto blob = BufferManager::gpuSync(4096); + auto usage = getCurrentProcessMemoryInfo(); + return usage != 0; + }(); + + return available; + } + + static size_t getCurrentProcessMemoryInfo() + { + // Get current process ID + uint32_t currentPid = static_cast(getpid()); + + // Get device handle for GPU 0 + nvmlDevice_t device; + auto nvmlResult = nvmlDeviceGetHandleByIndex(0, &device); + TLLM_CHECK_WITH_INFO( + nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvmlErrorString(nvmlResult)); + + // Get running processes + unsigned int processCount = 1; + std::vector processes(processCount); + nvmlResult = NVML_ERROR_INSUFFICIENT_SIZE; + while (nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE) + { + nvmlResult = nvmlDeviceGetComputeRunningProcesses_v3(device, &processCount, processes.data()); + TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS || nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE, + "Failed to get process count: %s", nvmlErrorString(nvmlResult)); + processes.resize(processCount); + } + + // Find current process + for (auto const& process : processes) + { + if (process.pid == currentPid) + { + return process.usedGpuMemory; + } + } + + return 0; + } +}; + +class VirtualMemoryTest : public VirtualMemoryTestBase +{ +}; + +// Test CUDAVirtualMemoryChunk materialize and release memory correctly +TEST_F(VirtualMemoryTest, TestBasic) +{ + CUdeviceptr address{}; + std::size_t constexpr size = 256 * 1024 * 1024; + TLLM_CU_CHECK(cuMemAddressReserve(&address, size, 0, {}, 0)); + + CUDAVirtualMemoryChunk::CreatorPtr creator + = std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, 
CU_MEM_HANDLE_TYPE_NONE, + { + CU_MEM_LOCATION_TYPE_DEVICE, + 0, + }}, + size); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::make_unique(address, size, + CUmemAccessDesc{{ + CU_MEM_LOCATION_TYPE_DEVICE, + 0, + }, + CU_MEM_ACCESS_FLAGS_PROT_READWRITE})); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + ASSERT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + + auto memoryBegin = getCurrentProcessMemoryInfo(); + vm.materialize(); + ASSERT_EQ(vm.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + auto memoryMaterialized = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin + size, memoryMaterialized) << "materialize does not allocate memory"; + } + + auto result = cuMemsetD8_v2(address, 255, size); + ASSERT_EQ(result, CUDA_SUCCESS) << "Accessing memory returned failure (first materialize)"; + TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); + + vm.release(); + ASSERT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + auto memoryReleased = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin, memoryReleased) << "release does not release memory"; + } + + vm.materialize(); + ASSERT_EQ(vm.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + result = cuMemsetD8_v2(address, 255, size); + ASSERT_EQ(result, CUDA_SUCCESS) << "Accessing memory returned failure (second materialize)"; + TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); + + vm.release(); + ASSERT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + memoryReleased = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin, memoryReleased) << "release does not release memory"; + } +} + +// Test BackedConfigurator refills memory correctly for both CPU and PINNED memory types +class VirtualMemoryOffloadConfigurator : public VirtualMemoryTest, public ::testing::WithParamInterface +{ +}; + +TEST_P(VirtualMemoryOffloadConfigurator, Test) +{ + MemoryType backType = GetParam(); + CUdeviceptr address{}; + std::size_t constexpr size = 4 * 1024 * 1024; + TLLM_CU_CHECK(cuMemAddressReserve(&address, size, 0, {}, 0)); + + CudaStream stream; + + CUDAVirtualMemoryChunk::CreatorPtr creator + = std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE, + { + CU_MEM_LOCATION_TYPE_DEVICE, + 0, + }}, + size); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::make_unique(address, size, + CUmemAccessDesc{{ + CU_MEM_LOCATION_TYPE_DEVICE, + 0, + }, + CU_MEM_ACCESS_FLAGS_PROT_READWRITE})); + configurators.push_back(std::make_unique(address, size, backType, stream.get(), false)); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + std::vector data(size / sizeof(uint64_t), 0); + std::generate(data.begin(), data.end(), [engine = std::mt19937_64(address)]() mutable { return engine(); }); + + vm.materialize(); + + auto pointer = reinterpret_cast(address); + auto result = cudaMemcpyAsync(pointer, data.data(), size, cudaMemcpyHostToDevice, stream.get()); + ASSERT_EQ(result, CUDA_SUCCESS) << "Copying memory returned failure"; + + vm.release(); + + vm.materialize(); + + std::fill(data.begin(), data.end(), 0); + result = cudaMemcpyAsync(data.data(), pointer, size, cudaMemcpyDeviceToHost, stream.get()); + stream.synchronize(); + ASSERT_EQ(result, CUDA_SUCCESS) << "Copying memory returned failure"; + + auto engine = std::mt19937_64(static_cast(address)); + for (size_t i = 0; i < data.size(); ++i) + { + 
ASSERT_EQ(data[i], engine()) << "Mismatched at index " << i; + } +} + +INSTANTIATE_TEST_SUITE_P( + Backends, VirtualMemoryOffloadConfigurator, ::testing::Values(MemoryType::kCPU, MemoryType::kPINNED)); + +// Test CUDAVirtualMemoryChunk calls creator and configurators in correct order +TEST_F(VirtualMemoryTest, TestOrder) +{ + // Order tracking - local counter to track call sequence + int callOrder = 0; + + // OrderTrackingCreator that records when its methods are called + class OrderTrackingCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + int& mCallOrder; + int createOrder = -1; + int releaseOrder = -1; + CUmemGenericAllocationHandle createdHandle = 0; + + OrderTrackingCreator(int& callOrder) + : mCallOrder(callOrder) + { + } + + CUmemGenericAllocationHandle create() override + { + createOrder = ++mCallOrder; + createdHandle = 0xbaadf00dbaadf00d; + return createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + releaseOrder = ++mCallOrder; + ASSERT_EQ(handle, createdHandle); + } + }; + + // OrderTrackingConfigurator that records when its methods are called + class OrderTrackingConfigurator : public CUDAVirtualMemoryChunk::Configurator + { + public: + int& mCallOrder; + std::string name; + int setupOrder = -1; + int teardownOrder = -1; + + OrderTrackingConfigurator(int& callOrder, std::string n) + : mCallOrder(callOrder) + , name(std::move(n)) + { + } + + void setup(CUmemGenericAllocationHandle handle) override + { + setupOrder = ++mCallOrder; + } + + void teardown(CUmemGenericAllocationHandle handle) override + { + teardownOrder = ++mCallOrder; + } + }; + + // Create creator and configurators + auto creator = std::make_unique(callOrder); + auto* creatorPtr = creator.get(); + + auto config1 = std::make_unique(callOrder, "config1"); + auto config2 = std::make_unique(callOrder, "config2"); + auto config3 = std::make_unique(callOrder, "config3"); + auto* config1Ptr = config1.get(); + auto* config2Ptr = config2.get(); + auto* config3Ptr = config3.get(); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + configurators.push_back(std::move(config2)); + configurators.push_back(std::move(config3)); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + // Test materialize() order: creator.create() first, then configurators.setup() in order + vm.materialize(); + + // Verify materialize order + EXPECT_EQ(creatorPtr->createOrder, 1); // creator.create() should be called first + EXPECT_EQ(config1Ptr->setupOrder, 2); // config1.setup() should be called second + EXPECT_EQ(config2Ptr->setupOrder, 3); // config2.setup() should be called third + EXPECT_EQ(config3Ptr->setupOrder, 4); // config3.setup() should be called fourth + + // Verify release() hasn't been called yet + EXPECT_EQ(creatorPtr->releaseOrder, -1); + EXPECT_EQ(config1Ptr->teardownOrder, -1); + EXPECT_EQ(config2Ptr->teardownOrder, -1); + EXPECT_EQ(config3Ptr->teardownOrder, -1); + + // Test release() order: configurators.teardown() in reverse order, then creator.release() + vm.release(); + + // Verify release order + EXPECT_EQ(config3Ptr->teardownOrder, 5); // config3.teardown() should be called first (reverse order) + EXPECT_EQ(config2Ptr->teardownOrder, 6); // config2.teardown() should be called second + EXPECT_EQ(config1Ptr->teardownOrder, 7); // config1.teardown() should be called third + EXPECT_EQ(creatorPtr->releaseOrder, 8); // creator.release() should be called last +} + +// Test CUDAVirtualMemoryChunk behaves 
+
+// Test CUDAVirtualMemoryChunk behaves correctly when exceptions are thrown
+TEST_F(VirtualMemoryTest, TestException)
+{
+    // Dummy Creator that can be configured to throw on create() or release()
+    class DummyCreator : public CUDAVirtualMemoryChunk::Creator
+    {
+    public:
+        bool throwOnCreate = false;
+        bool throwOnRelease = false;
+        bool createCalled = false;
+        bool releaseCalled = false;
+        CUmemGenericAllocationHandle createdHandle = 0;
+
+        CUmemGenericAllocationHandle create() override
+        {
+            createCalled = true;
+            if (throwOnCreate)
+            {
+                throw DummyException();
+            }
+            createdHandle = 0xbaadf00dbaadf00d;
+            return createdHandle;
+        }
+
+        void release(CUmemGenericAllocationHandle handle) override
+        {
+            releaseCalled = true;
+            ASSERT_EQ(handle, createdHandle);
+            if (throwOnRelease)
+            {
+                throw DummyException();
+            }
+        }
+    };
+
+    // Dummy Configurator that can be configured to throw on setup() or teardown()
+    class DummyConfigurator : public CUDAVirtualMemoryChunk::Configurator
+    {
+    public:
+        bool throwOnSetup = false;
+        bool throwOnTeardown = false;
+        bool setupCalled = false;
+        bool teardownCalled = false;
+        std::string name;
+
+        DummyConfigurator(std::string n)
+            : name(std::move(n))
+        {
+        }
+
+        void setup(CUmemGenericAllocationHandle) override
+        {
+            setupCalled = true;
+            if (throwOnSetup)
+            {
+                throw DummyException();
+            }
+        }
+
+        void teardown(CUmemGenericAllocationHandle handle) override
+        {
+            teardownCalled = true;
+            if (throwOnTeardown)
+            {
+                throw DummyException();
+            }
+        }
+    };
+
+    // Test 1: Exception in creator->create()
+    {
+        auto creator = std::make_unique<DummyCreator>();
+        creator->throwOnCreate = true;
+        auto* creatorPtr = creator.get();
+
+        auto config1 = std::make_unique<DummyConfigurator>("config1");
+        auto config2 = std::make_unique<DummyConfigurator>("config2");
+        auto* config1Ptr = config1.get();
+        auto* config2Ptr = config2.get();
+
+        CUDAVirtualMemoryChunk::Configurators configurators;
+        configurators.push_back(std::move(config1));
+        configurators.push_back(std::move(config2));
+
+        CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators));
+
+        // materialize() should throw due to creator->create() exception
+        EXPECT_THROW(vm.materialize(), DummyException);
+
+        // Verify creator->create() was called but no configurators were set up
+        EXPECT_TRUE(creatorPtr->createCalled);
+        EXPECT_FALSE(config1Ptr->setupCalled);
+        EXPECT_FALSE(config2Ptr->setupCalled);
+
+        // Internal state is still valid.
+        // If the failure from creator is temporary, materialize() can be reattempted.
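+
+        // A temporary-failure retry loop is therefore possible (illustrative
+        // sketch; freeUpOtherMemory() is a hypothetical application callback):
+        //
+        //   while (vm.status() == CUDAVirtualMemoryChunk::RELEASED)
+        //   {
+        //       try { vm.materialize(); }
+        //       catch (std::exception const&) { freeUpOtherMemory(); }
+        //   }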
+ EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + } + + // Test 2: Exception in first configurator setup() + { + auto creator = std::make_unique(); + auto* creatorPtr = creator.get(); + + auto config1 = std::make_unique("config1"); + auto config2 = std::make_unique("config2"); + config1->throwOnSetup = true; + auto* config1Ptr = config1.get(); + auto* config2Ptr = config2.get(); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + configurators.push_back(std::move(config2)); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + // materialize() should throw due to first configurator exception + EXPECT_THROW(vm.materialize(), DummyException); + + // Verify creator->create() was called and first configurator setup() was called + EXPECT_TRUE(creatorPtr->createCalled); + EXPECT_TRUE(config1Ptr->setupCalled); + EXPECT_FALSE(config2Ptr->setupCalled); + + // Status should be ERRORED + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::ERRORED); + + // release() should still work and only teardown what was set up + vm.release(); + EXPECT_TRUE(creatorPtr->releaseCalled); + EXPECT_FALSE(config1Ptr->teardownCalled); // Failed setup, so no teardown + EXPECT_FALSE(config2Ptr->teardownCalled); // Never setup + } + + // Test 3: Exception in second configurator setup() + { + auto creator = std::make_unique(); + auto* creatorPtr = creator.get(); + + auto config1 = std::make_unique("config1"); + auto config2 = std::make_unique("config2"); + config2->throwOnSetup = true; + auto* config1Ptr = config1.get(); + auto* config2Ptr = config2.get(); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + configurators.push_back(std::move(config2)); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + // materialize() should throw due to second configurator exception + EXPECT_THROW(vm.materialize(), DummyException); + + // Verify both creator and first configurator were called + EXPECT_TRUE(creatorPtr->createCalled); + EXPECT_TRUE(config1Ptr->setupCalled); + EXPECT_TRUE(config2Ptr->setupCalled); + + // Status should be ERRORED + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::ERRORED); + + // release() should teardown the first configurator (successful setup) but not the second + vm.release(); + EXPECT_TRUE(creatorPtr->releaseCalled); + EXPECT_TRUE(config1Ptr->teardownCalled); // Successful setup, so teardown called + EXPECT_FALSE(config2Ptr->teardownCalled); // Failed setup, so no teardown + } + + // Test 4: Exception in configurator teardown() during release() + { + auto creator = std::make_unique(); + auto* creatorPtr = creator.get(); + + auto config1 = std::make_unique("config1"); + auto config2 = std::make_unique("config2"); + config2->throwOnTeardown = true; + auto* config1Ptr = config1.get(); + auto* config2Ptr = config2.get(); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + configurators.push_back(std::move(config2)); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + // materialize() should succeed + vm.materialize(); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + // release() should throw due to teardown exception but still complete cleanup + EXPECT_THROW(vm.release(), DummyException); + + // Verify all teardown methods were called despite exception + EXPECT_TRUE(config1Ptr->teardownCalled); + EXPECT_TRUE(config2Ptr->teardownCalled); + 
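+        // release() never stops early: even though config2's teardown threw,
+        // the creator's release below is still invoked.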
EXPECT_TRUE(creatorPtr->releaseCalled); + + // Status should be ERRORED due to exception + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::ERRORED); + } + + // Test 5: Exception in creator->release() + { + auto creator = std::make_unique(); + creator->throwOnRelease = true; + auto* creatorPtr = creator.get(); + + auto config1 = std::make_unique("config1"); + auto* config1Ptr = config1.get(); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + + CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + // materialize() should succeed + vm.materialize(); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + // release() should throw due to creator exception but still complete configurator cleanup + EXPECT_THROW(vm.release(), DummyException); + + // Verify configurator teardown was called despite creator exception + EXPECT_TRUE(config1Ptr->teardownCalled); + EXPECT_TRUE(creatorPtr->releaseCalled); + + // Status should be ERRORED due to exception + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::ERRORED); + } +} + +// Test various class facilities +TEST_F(VirtualMemoryTest, TestFacilities) +{ + // Test default constructed CUDAVirtualMemoryChunk + { + CUDAVirtualMemoryChunk defaultVm; + + // Should be invalid + EXPECT_FALSE(defaultVm); + EXPECT_EQ(defaultVm.status(), CUDAVirtualMemoryChunk::INVALID); + } + + CUdeviceptr address{}; + std::size_t constexpr size = 64 * 1024 * 1024; + TLLM_CU_CHECK(cuMemAddressReserve(&address, size, 0, {}, 0)); + // Test move semantic + { + + // Create original CUDAVirtualMemoryChunk + CUDAVirtualMemoryChunk::CreatorPtr creator + = std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, + CU_MEM_HANDLE_TYPE_NONE, {CU_MEM_LOCATION_TYPE_DEVICE, 0}}, + size); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::make_unique( + address, size, CUmemAccessDesc{{CU_MEM_LOCATION_TYPE_DEVICE, 0}, CU_MEM_ACCESS_FLAGS_PROT_READWRITE})); + + CUDAVirtualMemoryChunk original(std::move(creator), std::move(configurators)); + original.materialize(); + EXPECT_EQ(original.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + // Test move constructor + CUDAVirtualMemoryChunk moved{std::move(original)}; + EXPECT_FALSE(original); // Original should be invalid after move + EXPECT_TRUE(moved); // Moved-to object should be valid + EXPECT_EQ(moved.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + // Test move assignment + CUDAVirtualMemoryChunk assigned; + EXPECT_FALSE(assigned); // Default constructed, should be invalid + + assigned = std::move(moved); + EXPECT_FALSE(moved); // moved should be invalid after move + EXPECT_TRUE(assigned); // assigned should be valid + EXPECT_EQ(assigned.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + // Clean up + assigned.release(); + } +} + +// Test destructor +TEST_F(VirtualMemoryTest, TestDestructor) +{ + + // Dummy Creator for testing destructor behavior + class DummyCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + bool& createCalledRef; + bool& releaseCalledRef; + CUmemGenericAllocationHandle createdHandle = 0; + + DummyCreator(bool& createRef, bool& releaseRef) + : createCalledRef(createRef) + , releaseCalledRef(releaseRef) + { + } + + CUmemGenericAllocationHandle create() override + { + createCalledRef = true; + createdHandle = 0xbaadf00dbaadf00d; + return createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + releaseCalledRef = true; + ASSERT_EQ(handle, 
createdHandle); + } + }; + + // Dummy Configurator for testing destructor behavior + class DummyConfigurator : public CUDAVirtualMemoryChunk::Configurator + { + public: + bool& setupCalledRef; + bool& teardownCalledRef; + std::string name; + + DummyConfigurator(std::string n, bool& setupRef, bool& teardownRef) + : setupCalledRef(setupRef) + , teardownCalledRef(teardownRef) + , name(std::move(n)) + { + } + + void setup(CUmemGenericAllocationHandle) override + { + setupCalledRef = true; + } + + void teardown(CUmemGenericAllocationHandle) override + { + teardownCalledRef = true; + } + }; + + // Test destructor calls release automatically for materialized memory + { + bool createCalled = false; + bool releaseCalled = false; + bool setupCalled = false; + bool teardownCalled = false; + + auto creator = std::make_unique(createCalled, releaseCalled); + auto config1 = std::make_unique("config1", setupCalled, teardownCalled); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + + alignas(CUDAVirtualMemoryChunk) std::byte storage[sizeof(CUDAVirtualMemoryChunk)]; + CUDAVirtualMemoryChunk* vm = new (storage) CUDAVirtualMemoryChunk(std::move(creator), std::move(configurators)); + + vm->materialize(); + + // Verify materialize was called + EXPECT_TRUE(createCalled); + EXPECT_TRUE(setupCalled); + EXPECT_FALSE(releaseCalled); + EXPECT_FALSE(teardownCalled); + EXPECT_EQ(vm->status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + vm->~CUDAVirtualMemoryChunk(); + + // Verify destructor called release + EXPECT_TRUE(releaseCalled); + EXPECT_TRUE(teardownCalled); + } + + // Test destructor doesn't double-release for manually released memory + { + // Local variables to track calls (persist after object destruction) + bool createCalled = false; + bool releaseCalled = false; + bool setupCalled = false; + bool teardownCalled = false; + + auto creator = std::make_unique(createCalled, releaseCalled); + auto config1 = std::make_unique("config1", setupCalled, teardownCalled); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + + alignas(CUDAVirtualMemoryChunk) std::byte storage[sizeof(CUDAVirtualMemoryChunk)]; + auto* vm = new (storage) CUDAVirtualMemoryChunk(std::move(creator), std::move(configurators)); + + vm->materialize(); + vm->release(); // Manual release + + // Verify manual release was called + EXPECT_TRUE(releaseCalled); + EXPECT_TRUE(teardownCalled); + EXPECT_EQ(vm->status(), CUDAVirtualMemoryChunk::RELEASED); + + // Reset flags to verify destructor doesn't call release again + releaseCalled = false; + teardownCalled = false; + + vm->~CUDAVirtualMemoryChunk(); + + // Verify destructor did NOT call release again (no double-release) + EXPECT_FALSE(releaseCalled); + EXPECT_FALSE(teardownCalled); + } + + // Test destructor behavior with ERRORED state + { + // Local variables to track calls (persist after object destruction) + bool createCalled = false; + bool releaseCalled = false; + bool config1SetupCalled = false; + bool config1TeardownCalled = false; + bool throwingSetupCalled = false; + bool throwingTeardownCalled = false; + + class ThrowingConfigurator : public CUDAVirtualMemoryChunk::Configurator + { + public: + bool& setupCalledRef; + bool& teardownCalledRef; + + ThrowingConfigurator(bool& setupRef, bool& teardownRef) + : setupCalledRef(setupRef) + , teardownCalledRef(teardownRef) + { + } + + void setup(CUmemGenericAllocationHandle) override + { + setupCalledRef = true; + throw DummyException(); + } + + 
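+            // teardown() is never reached for this configurator: setup() below
+            // always throws, and only configurators whose setup() succeeded are
+            // torn down (asserted at the end of this test).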
void teardown(CUmemGenericAllocationHandle) override + { + teardownCalledRef = true; + } + }; + + auto creator = std::make_unique(createCalled, releaseCalled); + auto config1 = std::make_unique("config1", config1SetupCalled, config1TeardownCalled); + auto throwingConfig = std::make_unique(throwingSetupCalled, throwingTeardownCalled); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(config1)); + configurators.push_back(std::move(throwingConfig)); + + alignas(CUDAVirtualMemoryChunk) std::byte storage[sizeof(CUDAVirtualMemoryChunk)]; + auto* vm = new (storage) CUDAVirtualMemoryChunk(std::move(creator), std::move(configurators)); + + // Materialize should throw and leave VM in ERRORED state + EXPECT_THROW(vm->materialize(), DummyException); + EXPECT_EQ(vm->status(), CUDAVirtualMemoryChunk::ERRORED); + + // Verify partial setup occurred + EXPECT_TRUE(createCalled); + EXPECT_TRUE(config1SetupCalled); + EXPECT_TRUE(throwingSetupCalled); + EXPECT_FALSE(releaseCalled); + + vm->~CUDAVirtualMemoryChunk(); + + // Verify destructor called release to clean up the errored state + EXPECT_TRUE(releaseCalled); + EXPECT_TRUE(config1TeardownCalled); + // throwingConfig's teardown should NOT be called since setup failed + EXPECT_FALSE(throwingTeardownCalled); + } +} + +// Test edge cases and error scenarios +TEST_F(VirtualMemoryTest, TestEdgeCases) +{ + // Dummy Creator for testing edge cases + class DummyCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + CUmemGenericAllocationHandle createdHandle = 0xbaadf00dbaadf00d; + + CUmemGenericAllocationHandle create() override + { + return createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + ASSERT_EQ(handle, createdHandle); + } + }; + + // Test multiple materialize calls (should throw) + { + auto creator = std::make_unique(); + CUDAVirtualMemoryChunk vm(std::move(creator), {}); + + vm.materialize(); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + // Second materialize should throw + EXPECT_THROW(vm.materialize(), tc::TllmException); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::MATERIALIZED); + + vm.release(); + } + + // Test multiple release calls (should throw) + { + auto creator = std::make_unique(); + CUDAVirtualMemoryChunk vm(std::move(creator), {}); + + vm.materialize(); + vm.release(); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + + // Second release should throw + EXPECT_THROW(vm.release(), tc::TllmException); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + } + + // Test release on RELEASED state (should throw) + { + auto creator = std::make_unique(); + CUDAVirtualMemoryChunk vm(std::move(creator), {}); + + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + EXPECT_THROW(vm.release(), tc::TllmException); // Should throw on RELEASED state + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + } + + // Test materialize on ERRORED state after exception recovery + { + // Create a VM that will go into ERRORED state + class ThrowingConfigurator : public CUDAVirtualMemoryChunk::Configurator + { + public: + bool shouldThrow = true; + + void setup(CUmemGenericAllocationHandle) override + { + if (shouldThrow) + { + throw DummyException(); + } + } + + void teardown(CUmemGenericAllocationHandle) override {} + }; + + auto creator = std::make_unique(); + auto throwingConfig = std::make_unique(); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::move(throwingConfig)); + + 
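+        // Construction alone does not materialize: vm starts in the RELEASED
+        // state, and only the explicit materialize() below runs the throwing
+        // setup().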
CUDAVirtualMemoryChunk vm(std::move(creator), std::move(configurators)); + + // First materialize should throw and leave VM in ERRORED state + EXPECT_THROW(vm.materialize(), DummyException); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::ERRORED); + + // Should be able to release from ERRORED state + vm.release(); + EXPECT_EQ(vm.status(), CUDAVirtualMemoryChunk::RELEASED); + } +} + +class VirtualMemoryManagerTest : public VirtualMemoryTestBase // NOLINT(cppcoreguidelines-pro-type-member-init) +{ + using Base = VirtualMemoryTestBase; + +protected: + auto& entries() + { + return mVMManager->mEntries; + } + + auto& memories() + { + return mVMManager->mMemories; + } + + auto& badHandles() + { + return mVMManager->mBadHandles; + } + + void SetUp() override + { + this->Base::SetUp(); + mVMManager = std::make_unique(); + } + + void TearDown() override + { + this->Base::TearDown(); + ASSERT_TRUE(!mVMManager || entries().size() == 0) << "Leftover memory in manager"; + } + + std::unique_ptr mVMManager = nullptr; +}; + +TEST_F(VirtualMemoryManagerTest, TestBasic) +{ + CUdeviceptr address{}; + std::size_t constexpr size = 256 * 1024 * 1024; + TLLM_CU_CHECK(cuMemAddressReserve(&address, size, 0, {}, 0)); + + uintptr_t handle = static_cast(address); + std::string mark = "test_mark"; + + CUDAVirtualMemoryChunk::CreatorPtr creator + = std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE, + { + CU_MEM_LOCATION_TYPE_DEVICE, + 0, + }}, + size); + + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::make_unique(address, size, + CUmemAccessDesc{{ + CU_MEM_LOCATION_TYPE_DEVICE, + 0, + }, + CU_MEM_ACCESS_FLAGS_PROT_READWRITE})); + + auto memoryBegin = getCurrentProcessMemoryInfo(); + + // Add to manager - this automatically materializes + mVMManager->add(handle, mark, std::move(creator), std::move(configurators)); + + auto memoryMaterialized = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin + size, memoryMaterialized) << "add/materialize does not allocate memory"; + } + + // Test memory access after materialization + auto result = cuMemsetD8_v2(address, 255, size); + ASSERT_EQ(result, CUDA_SUCCESS) << "Accessing memory returned failure (first materialize)"; + TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); + + // Release memory through manager + auto releaseCount = mVMManager->releaseWithMark(mark); + ASSERT_EQ(releaseCount, 1) << "Expected to release 1 memory object"; + + auto memoryReleased = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin, memoryReleased) << "releaseWithMark does not release memory"; + } + + // Materialize again through manager + auto materializeCount = mVMManager->materializeWithMark(mark); + ASSERT_EQ(materializeCount, 1) << "Expected to materialize 1 memory object"; + + auto memoryRematerialized = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin + size, memoryRematerialized) << "materializeWithMark does not allocate memory"; + } + + // Test memory access after rematerialization + result = cuMemsetD8_v2(address, 255, size); + ASSERT_EQ(result, CUDA_SUCCESS) << "Accessing memory returned failure (second materialize)"; + TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); + + // Clean up - remove from manager + { + auto removedMemory = mVMManager->remove(handle); + ASSERT_TRUE(removedMemory) << "Expected to successfully remove memory from manager"; + } + + auto memoryAfterRemove = getCurrentProcessMemoryInfo(); 
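+    // Per-process memory readings may be unavailable on some platforms, hence
+    // the memoryInfoAvailable() gate around all accounting assertions.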
+ if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin, memoryAfterRemove) << "remove does not release memory"; + } + + auto unknownMemory = mVMManager->remove(0); + ASSERT_FALSE(unknownMemory) << "Expect invalid memory for unknown handle"; +} + +TEST_F(VirtualMemoryManagerTest, TestMarks) +{ + // Dummy Creator for testing mark functionality + class DummyCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + bool createCalled = false; + bool releaseCalled = false; + CUmemGenericAllocationHandle createdHandle = 0xbaadf00dbaadf00d; + + CUmemGenericAllocationHandle create() override + { + createCalled = true; + return createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + releaseCalled = true; + ASSERT_EQ(handle, createdHandle); + } + }; + + // Create creators for different virtual memories + auto creator1 = std::make_unique(); + auto creator2 = std::make_unique(); + auto creator3 = std::make_unique(); + auto creator4 = std::make_unique(); + + // Keep pointers to track state + auto* creator1Ptr = creator1.get(); + auto* creator2Ptr = creator2.get(); + auto* creator3Ptr = creator3.get(); + auto* creator4Ptr = creator4.get(); + + mVMManager->add(0x1000, "mark_A", std::move(creator1), {}); + mVMManager->add(0x2000, "mark_B", std::move(creator2), {}); + mVMManager->add(0x3000, "mark_A", std::move(creator3), {}); + mVMManager->add(0x4000, "mark_C", std::move(creator4), {}); + + // All should be materialized initially (since add() materializes automatically) + EXPECT_TRUE(creator1Ptr->createCalled); + EXPECT_TRUE(creator2Ptr->createCalled); + EXPECT_TRUE(creator3Ptr->createCalled); + EXPECT_TRUE(creator4Ptr->createCalled); + + // Reset create flags to test materializeWithMark later + creator1Ptr->createCalled = false; + creator2Ptr->createCalled = false; + creator3Ptr->createCalled = false; + creator4Ptr->createCalled = false; + + // Test releaseWithMark - should release only memories with "mark_A" + auto releaseCount = mVMManager->releaseWithMark("mark_A"); + EXPECT_EQ(releaseCount, 2); // Should release 2 memories with mark_A + + // Verify only mark_A memories were released + EXPECT_TRUE(creator1Ptr->releaseCalled); // mark_A + EXPECT_FALSE(creator2Ptr->releaseCalled); // mark_B + EXPECT_TRUE(creator3Ptr->releaseCalled); // mark_A + EXPECT_FALSE(creator4Ptr->releaseCalled); // mark_C + + // Test materializeWithMark - should materialize only memories with "mark_A" + auto materializeCount = mVMManager->materializeWithMark("mark_A"); + EXPECT_EQ(materializeCount, 2); // Should materialize 2 memories with mark_A + + // Verify only mark_A memories were materialized + EXPECT_TRUE(creator1Ptr->createCalled); // mark_A + EXPECT_FALSE(creator2Ptr->createCalled); // mark_B + EXPECT_TRUE(creator3Ptr->createCalled); // mark_A + EXPECT_FALSE(creator4Ptr->createCalled); // mark_C + + // Reset flags and test releasing with a different mark + creator2Ptr->releaseCalled = false; + releaseCount = mVMManager->releaseWithMark("mark_B"); + EXPECT_EQ(releaseCount, 1); // Should release 1 memory with mark_B + EXPECT_TRUE(creator2Ptr->releaseCalled); // mark_B should now be released + + // Test with non-existent mark + releaseCount = mVMManager->releaseWithMark("nonexistent_mark"); + EXPECT_EQ(releaseCount, 0); // Should release 0 memories + + materializeCount = mVMManager->materializeWithMark("nonexistent_mark"); + EXPECT_EQ(materializeCount, 0); // Should materialize 0 memories + + // Clean up - remove all memories + mVMManager->remove(0x1000); + mVMManager->remove(0x2000); 
+ mVMManager->remove(0x3000); + mVMManager->remove(0x4000); +} + +TEST_F(VirtualMemoryManagerTest, TestAddException) +{ + // Dummy Creator that succeeds + class DummyCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + CUmemGenericAllocationHandle createdHandle = 0xbaadf00dbaadf00d; + + CUmemGenericAllocationHandle create() override + { + return createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + ASSERT_EQ(handle, createdHandle); + } + }; + + // Dummy Configurator that throws during setup + class ThrowingConfigurator : public CUDAVirtualMemoryChunk::Configurator + { + public: + void setup(CUmemGenericAllocationHandle) override + { + throw DummyException(); + } + + void teardown(CUmemGenericAllocationHandle) override + { + ASSERT_TRUE(false) << "Unreachable"; + } + }; + + uintptr_t handle = 0x12345678; + std::string mark = "test_mark"; + + // Verify initial state is clean + EXPECT_TRUE(memories().empty()); + EXPECT_TRUE(entries().empty()); + EXPECT_TRUE(badHandles().empty()); + + auto creator = std::make_unique(); + CUDAVirtualMemoryChunk::Configurators configurators; + configurators.push_back(std::make_unique()); + + // add() should throw because materialize() will fail due to ThrowingConfigurator + EXPECT_THROW(mVMManager->add(handle, mark, std::move(creator), std::move(configurators)), DummyException); + + // Verify that the manager state is clean after the exception + // The ScopeGuards in add() should have cleaned up properly + EXPECT_TRUE(memories().empty()) << "mMemories should be empty after failed add()"; + EXPECT_TRUE(entries().empty()) << "mEntries should be empty after failed add()"; + EXPECT_TRUE(badHandles().empty()) << "mBadHandles should be empty after failed add()"; + + // Test that we can successfully add a memory with the same handle after the failure + auto successCreator = std::make_unique(); + CUDAVirtualMemoryChunk::Configurators successConfigurators; // Empty configurators should work + + // This should succeed without throwing + EXPECT_NO_THROW(mVMManager->add(handle, mark, std::move(successCreator), std::move(successConfigurators))); + + // Verify that the manager now has the entry + EXPECT_EQ(memories().size(), 1); + EXPECT_EQ(entries().size(), 1); + EXPECT_TRUE(badHandles().empty()); + + // Clean up + auto removedMemory = mVMManager->remove(handle); + EXPECT_TRUE(removedMemory); +} + +TEST_F(VirtualMemoryManagerTest, TestMaterializeException) +{ + // State structure to track create/release order and can throw on a specific call + struct CreatorState + { + int& createCounter; // Reference to shared counter + int throwOnCreateIdx = 0; // 1-based index to throw on create + int myCreateIdx = INT_MAX; + bool createCalled = false; + bool releaseCalled = false; + CUmemGenericAllocationHandle createdHandle = 0xbaadf00dbaadf00d; + + CreatorState(int& sharedCounter) + : createCounter(sharedCounter) + { + } + }; + + // Dummy Creator that uses external state + class TestMatEx_DummyCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + CreatorState& state; + + TestMatEx_DummyCreator(CreatorState& state) + : state(state) + { + } + + CUmemGenericAllocationHandle create() override + { + state.createCalled = true; + state.myCreateIdx = ++state.createCounter; + if (state.throwOnCreateIdx > 0 && state.myCreateIdx == state.throwOnCreateIdx) + { + throw DummyException(); + } + return state.createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + state.releaseCalled = true; + ASSERT_EQ(handle, 
state.createdHandle); + } + }; + + // Create shared counter + int sharedCreateCounter = 0; + + // Create state objects for each creator + CreatorState state1(sharedCreateCounter); + CreatorState state2(sharedCreateCounter); + CreatorState state3(sharedCreateCounter); + + // We want the second memory (by create order) to throw + state1.throwOnCreateIdx = 2; + state2.throwOnCreateIdx = 2; + state3.throwOnCreateIdx = 2; + + // Create creators and configurators + auto creator1 = std::make_unique(state1); + auto creator2 = std::make_unique(state2); + auto creator3 = std::make_unique(state3); + + // Add memories to manager in RELEASED state (don't auto-materialize by constructing manually) + CUDAVirtualMemoryChunk vm1(std::move(creator1), {}); + CUDAVirtualMemoryChunk vm2(std::move(creator2), {}); + CUDAVirtualMemoryChunk vm3(std::move(creator3), {}); + + mVMManager->add(0x1000, "test_mark", std::move(vm1)); + mVMManager->add(0x2000, "test_mark", std::move(vm2)); + mVMManager->add(0x3000, "test_mark", std::move(vm3)); + + // Verify initial state is clean + EXPECT_TRUE(badHandles().empty()); + + // materializeWithMark should stop at the first exception (second memory by create order) + // and attempt to rollback the first memory that succeeded + EXPECT_THROW(mVMManager->materializeWithMark("test_mark"), DummyException); + + // Find which creators were called and in what order + std::vector> creators + = {{0x1000, &state1}, {0x2000, &state2}, {0x3000, &state3}}; + // Sort by myCreateIdx (nonzero means create was called) + std::sort(creators.begin(), creators.end(), + [](auto const& a, auto const& b) { return a.second->myCreateIdx < b.second->myCreateIdx; }); + + // The first memory (by create order) should have been materialized then released during rollback + auto* first = creators[0].second; + EXPECT_TRUE(first->createCalled); + EXPECT_TRUE(first->releaseCalled); // Rolled back + // The second memory should have thrown during setup, so creator was called but setup failed + auto* second = creators[1].second; + EXPECT_TRUE(second->createCalled); + EXPECT_FALSE(second->releaseCalled); + // The third memory should not have been touched (myCreateIdx == 0) + auto* third = creators[2].second; + EXPECT_FALSE(third->createCalled); + EXPECT_FALSE(third->releaseCalled); + + // The handle of the memory that threw should be the second one's handle + uintptr_t thrownHandle = creators[1].first; + + // Verify bad handles tracking - memories that threw exceptions should be removed + auto badHandles = mVMManager->retrieveBadHandles(); + EXPECT_EQ(badHandles.size(), 1); + EXPECT_EQ(badHandles[0], thrownHandle); + + // Verify the memory that threw was removed from the manager + auto removedMem = mVMManager->remove(thrownHandle); + EXPECT_FALSE(removedMem); // Should have been removed due to exception + + // The other two memories should still be in manager + for (int i = 0; i < 3; ++i) + { + if (creators[i].first != thrownHandle) + { + auto removed = mVMManager->remove(creators[i].first); + EXPECT_TRUE(removed); + } + } +} + +TEST_F(VirtualMemoryManagerTest, TestReleaseException) +{ + // State structure to track create/release calls + struct CreatorState + { + bool createCalled = false; + bool releaseCalled = false; + int& releaseCounter; + int throwOnReleaseCount; + CUmemGenericAllocationHandle createdHandle = 0xbaadf00dbaadf00d; + + CreatorState(int& counter, int throwCount) + : releaseCounter(counter) + , throwOnReleaseCount(throwCount) + { + } + }; + + // State structure to track setup/teardown calls + struct 
ConfiguratorState + { + bool setupCalled = false; + bool teardownCalled = false; + int& teardownCounter; + int throwOnTeardownCount; + + ConfiguratorState(int& counter, int throwCount) + : teardownCounter(counter) + , throwOnTeardownCount(throwCount) + { + } + }; + + // Dummy Creator that succeeds + class DummyCreator : public CUDAVirtualMemoryChunk::Creator + { + public: + CreatorState& state; + + DummyCreator(CreatorState& state) + : state(state) + { + } + + CUmemGenericAllocationHandle create() override + { + state.createCalled = true; + return state.createdHandle; + } + + void release(CUmemGenericAllocationHandle handle) override + { + state.releaseCalled = true; + ASSERT_EQ(handle, state.createdHandle); + if (++state.releaseCounter == state.throwOnReleaseCount) + { + throw DummyException(); + } + } + }; + + // Dummy Configurator that succeeds + class DummyConfigurator : public CUDAVirtualMemoryChunk::Configurator + { + public: + ConfiguratorState& state; + + DummyConfigurator(ConfiguratorState& state) + : state(state) + { + } + + void setup(CUmemGenericAllocationHandle) override + { + state.setupCalled = true; + } + + void teardown(CUmemGenericAllocationHandle) override + { + state.teardownCalled = true; + if (++state.teardownCounter == state.throwOnTeardownCount) + { + throw DummyException(); + } + } + }; + + // Create counters for tracking release/teardown calls + int releaseCounter = 0; + int teardownCounter = 0; + + // Create state objects for each creator and configurator + CreatorState state1(releaseCounter, 2); // Throw on 2nd release + CreatorState state2(releaseCounter, 2); // Throw on 2nd release + CreatorState state3(releaseCounter, 2); // Throw on 2nd release + CreatorState state4(releaseCounter, 2); // Throw on 2nd release + + ConfiguratorState configState1(teardownCounter, 3); // Throw on 3rd teardown + ConfiguratorState configState2(teardownCounter, 3); // Throw on 3rd teardown + ConfiguratorState configState3(teardownCounter, 3); // Throw on 3rd teardown + ConfiguratorState configState4(teardownCounter, 3); // Throw on 3rd teardown + + // Create creators and configurators + auto creator1 = std::make_unique(state1); + auto creator2 = std::make_unique(state2); + auto creator3 = std::make_unique(state3); + auto creator4 = std::make_unique(state4); + + auto config1 = std::make_unique(configState1); + auto config2 = std::make_unique(configState2); + auto config3 = std::make_unique(configState3); + auto config4 = std::make_unique(configState4); + + CUDAVirtualMemoryChunk::Configurators configurators1; + configurators1.push_back(std::move(config1)); + + CUDAVirtualMemoryChunk::Configurators configurators2; + configurators2.push_back(std::move(config2)); + + CUDAVirtualMemoryChunk::Configurators configurators3; + configurators3.push_back(std::move(config3)); + + CUDAVirtualMemoryChunk::Configurators configurators4; + configurators4.push_back(std::move(config4)); + + mVMManager->add(0x1000, "test_mark", std::move(creator1), std::move(configurators1)); + mVMManager->add(0x2000, "test_mark", std::move(creator2), std::move(configurators2)); + mVMManager->add(0x3000, "test_mark", std::move(creator3), std::move(configurators3)); + mVMManager->add(0x4000, "other_mark", std::move(creator4), std::move(configurators4)); + + // Verify initial state + EXPECT_TRUE(badHandles().empty()); + + // releaseWithMark should call release on all memories with "test_mark" + // and continue despite exceptions + EXPECT_THROW(mVMManager->releaseWithMark("test_mark"), DummyException); + + // Verify 
behavior:
+    // - All memories with "test_mark" should have had release() attempted
+    EXPECT_TRUE(state1.releaseCalled);
+    EXPECT_TRUE(configState1.teardownCalled);
+
+    EXPECT_TRUE(state2.releaseCalled);
+    EXPECT_TRUE(configState2.teardownCalled);
+
+    EXPECT_TRUE(state3.releaseCalled);
+    EXPECT_TRUE(configState3.teardownCalled);
+
+    // - Memory with different mark should not be affected
+    EXPECT_FALSE(state4.releaseCalled);
+    EXPECT_FALSE(configState4.teardownCalled);
+
+    // Verify bad handles tracking - memories that threw exceptions should be removed
+    auto badHandles = mVMManager->retrieveBadHandles();
+    EXPECT_EQ(badHandles.size(), 2);
+    EXPECT_NE(std::find(badHandles.begin(), badHandles.end(), 0x2000), badHandles.end());
+    EXPECT_NE(std::find(badHandles.begin(), badHandles.end(), 0x3000), badHandles.end());
+
+    // Verify the memories were removed from the manager
+    auto removedMem1 = mVMManager->remove(0x1000);
+    auto removedMem2 = mVMManager->remove(0x2000);
+    auto removedMem3 = mVMManager->remove(0x3000);
+    auto removedMem4 = mVMManager->remove(0x4000);
+
+    EXPECT_TRUE(removedMem1);  // Released without exception, so still in manager
+    EXPECT_FALSE(removedMem2); // Should have been removed due to exception
+    EXPECT_FALSE(removedMem3); // Should have been removed due to exception
+    EXPECT_TRUE(removedMem4);  // Should still be in manager (different mark, not affected)
+}
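+
+// Sketch of the intended offload/restore round trip at the manager level
+// (illustrative only; "weights" is an arbitrary mark and error handling is
+// elided):
+//
+//   auto released = manager.releaseWithMark("weights");     // unmap physical pages
+//   // ... run whatever needed the freed device memory ...
+//   auto restored = manager.materializeWithMark("weights"); // map pages back in
+//   TLLM_CHECK(released == restored);
+//   for (auto badHandle : manager.retrieveBadHandles())
+//   {
+//       // chunks that threw were evicted from the manager; rebuild or report them
+//   }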
"Expected to materialize 1 memory object"; + + auto memoryAfterMaterialize = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin + size, memoryAfterMaterialize) << "Materialize should allocate memory"; + } + + // Test memory access again after rematerialization + result = cuMemsetD8_v2(devicePtr, 0xCD, size); + ASSERT_EQ(result, CUDA_SUCCESS) << "Memory access should succeed after rematerialization"; + TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); + + // Clean up by destroying the buffer (this should automatically clean up the virtual memory) + buffer.reset(); + + auto memoryAfterCleanup = getCurrentProcessMemoryInfo(); + if (memoryInfoAvailable()) + { + ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory"; + } +} diff --git a/tensorrt_llm/_torch/virtual_memory.py b/tensorrt_llm/_torch/virtual_memory.py new file mode 100644 index 00000000000..debd376c8d8 --- /dev/null +++ b/tensorrt_llm/_torch/virtual_memory.py @@ -0,0 +1,89 @@ +import functools +from contextlib import contextmanager +from typing import Generator + +import torch + +from tensorrt_llm.bindings.internal.runtime import \ + CudaVirtualMemoryAllocatorRestoreMode as RestoreMode +from tensorrt_llm.bindings.internal.runtime import ( + get_virtual_memory_manager, pop_virtual_memory_allocator, + push_virtual_memory_allocator) + +__all__ = [ + "RestoreMode", "maybe_scope", "scope", "release_with_mark", + "materialize_with_mark" +] + + +@functools.cache +def _get_torch_pluggable_virtual_memory_allocator(): + th_common = next(path for path in torch.classes.loaded_libraries + if 'th_common' in path) + virtual_memory_allocator = torch.cuda.CUDAPluggableAllocator( + th_common, 'tensorrt_llm_virtual_memory_alloc', + 'tensorrt_llm_virtual_memory_free') + return virtual_memory_allocator.allocator() + + +@contextmanager +def _virtual_address_helper(mark: str, mode: RestoreMode): + stream = torch.cuda.current_stream() + push_virtual_memory_allocator(mark, mode, stream.cuda_stream) + try: + yield + finally: + pop_virtual_memory_allocator() + + +def _scope( + mark: str, + mode: RestoreMode = RestoreMode.NONE +) -> Generator[torch.cuda.MemPool, None, None]: + """A context manager that routes allocations to virtual memory allocator + using given mark and backed mode. 
+
+
+@contextmanager
+def maybe_scope(
+    enable: bool,
+    mark: str,
+    mode: RestoreMode = RestoreMode.NONE
+) -> Generator[torch.cuda.MemPool | None, None, None]:
+    if enable:
+        yield from _scope(mark, mode)
+    else:
+        yield
+
+
+def release_with_mark(*marks: str) -> int:
+    """Release virtual memory allocated with the given marks
+
+    :param marks: The marks of the scopes the virtual memory was allocated in
+    :return: Number of memory blobs released
+    """
+    manager = get_virtual_memory_manager()
+    released_blobs = sum(manager.release_with_mark(mark) for mark in marks)
+    return released_blobs
+
+
+def materialize_with_mark(*marks: str) -> int:
+    """Materialize virtual memory allocated with the given marks
+
+    :param marks: The marks of the scopes the virtual memory was allocated in
+    :return: Number of memory blobs materialized
+    """
+    manager = get_virtual_memory_manager()
+    materialized_blobs = sum(
+        manager.materialize_with_mark(mark) for mark in marks)
+    return materialized_blobs
diff --git a/tests/unittest/_torch/test_virtual_memory.py b/tests/unittest/_torch/test_virtual_memory.py
new file mode 100644
index 00000000000..0d13a1b84ee
--- /dev/null
+++ b/tests/unittest/_torch/test_virtual_memory.py
@@ -0,0 +1,254 @@
+import gc
+import os
+import warnings
+
+import pynvml
+import pytest
+import torch
+
+import tensorrt_llm
+from tensorrt_llm._torch import virtual_memory
+from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
+from tensorrt_llm.bindings.executor import KvCacheConfig
+from tensorrt_llm.bindings.internal.batch_manager import CacheType
+from tensorrt_llm.mapping import Mapping
+
+
+@pytest.fixture(scope="function", autouse=True)
+def cuda_sync_fixture():
+    """
+    Synchronizes CUDA to catch device errors.
+    """
+
+    torch.cuda.synchronize()
+    yield
+    torch.cuda.synchronize()
+
+
+@pytest.fixture(scope="module")
+def memory_info_available():
+    """
+    Checks whether NVML can report per-process memory information.
+    """
+
+    # Allocate a small tensor to test memory tracking
+    tensor = torch.zeros(4096, dtype=torch.int32, device='cuda')
+    torch.cuda.synchronize()
+
+    # Try to get memory usage
+    usage = get_current_process_memory_info()
+
+    # Clean up
+    del tensor
+    torch.cuda.synchronize()
+    torch.cuda.empty_cache()
+
+    if usage == 0:
+        warnings.warn("Per process memory information unavailable.")
+        return False
+
+    return True
+
+
+@pytest.fixture(scope="module", autouse=True)
+def nvml_init():
+    pynvml.nvmlInit()
+
+
+def get_current_process_memory_info() -> int:
+    """
+    Returns GPU memory usage for the current process in bytes.
+ """ + # Get current process ID + current_pid = os.getpid() + # Get device handle for GPU 0 + device_handle = pynvml.nvmlDeviceGetHandleByIndex(0) + + # Get running processes + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle) + + # Find current process + for process in processes: + if process.pid == current_pid: + return process.usedGpuMemory + + return 0 + + +@pytest.fixture(scope="function", autouse=True) +def clean_cache(): + gc.collect() + torch.cuda.empty_cache() + yield + gc.collect() + torch.cuda.empty_cache() + + +def test_basic(memory_info_available): + memory_usage_begin = get_current_process_memory_info() + + alloc_size = 256 * 1024 * 1024 + mark = "test_mark" + + with virtual_memory.scope(mark) as pool: + tensor = torch.full([alloc_size], 42, dtype=torch.int8, device='cuda') + memory_usage_materialized = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin + alloc_size == memory_usage_materialized + + assert tensor[0].item() == 42 + + torch.cuda.synchronize() + virtual_memory.release_with_mark(mark) + + memory_usage_released = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin == memory_usage_released + + torch.cuda.synchronize() + virtual_memory.materialize_with_mark(mark) + + memory_usage_rematerialized = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin + alloc_size == memory_usage_rematerialized + + torch.fill_(tensor, 24) + assert tensor[0].item() == 24 + + del tensor + del pool + + memory_usage_end = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin == memory_usage_end + + +def test_restore(): + alloc_size = 1024 * 1024 + mark = "test_mark" + + with virtual_memory.scope(mark, virtual_memory.RestoreMode.PINNED) as pool: + tensor = torch.full([alloc_size], 42, dtype=torch.int8, device='cuda') + + assert tensor[0].item() == 42 + + torch.cuda.synchronize() + virtual_memory.release_with_mark(mark) + + torch.cuda.synchronize() + + virtual_memory.materialize_with_mark(mark) + torch.cuda.synchronize() + + assert tensor[0].item() == 42 + + del tensor + del pool + + +def test_kv_cache_manager(memory_info_available): + kv_cache_params = { + "kv_cache_config": KvCacheConfig(max_tokens=1024), + "kv_cache_type": CacheType.SELF, + "num_layers": 8, + "num_kv_heads": 256, + "head_dim": 64, + "tokens_per_block": 64, + "max_seq_len": 1024, + "max_batch_size": 1, + "mapping": Mapping(world_size=1, tp_size=1, rank=0), + "dtype": tensorrt_llm.bindings.DataType.FP8, + } + + mgr = KVCacheManager(**kv_cache_params) + mgr.shutdown() + del mgr + + memory_usage_begin = get_current_process_memory_info() + + mark = "test_mark" + cache_size = torch.empty( + [ + 2, # KV + 8, # Layers + 256, # Heads + 1024, # Tokens + 64, # Head dim + ], + dtype=torch.float8_e4m3fn, + device='meta') + + alloc_size = cache_size.nelement() + + with virtual_memory.scope(mark) as pool: + mgr = KVCacheManager(**kv_cache_params) + memory_usage_materialized = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin + alloc_size == memory_usage_materialized + + torch.cuda.synchronize() + virtual_memory.release_with_mark(mark) + + memory_usage_released = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin == memory_usage_released + + torch.cuda.synchronize() + virtual_memory.materialize_with_mark(mark) + + memory_usage_rematerialized = get_current_process_memory_info() + if 
memory_info_available: + assert memory_usage_begin + alloc_size == memory_usage_rematerialized + + mgr.shutdown() + del mgr + del pool + + memory_usage_end = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_begin == memory_usage_end + + +def test_cuda_graph(memory_info_available): + + def work(input: torch.Tensor) -> torch.Tensor: + intermediate = input + input + output = input + intermediate + return output + + g = torch.cuda.CUDAGraph() + mark = "cuda_graph" + + with virtual_memory.scope(mark) as pool: + static_input = torch.ones(1024, dtype=torch.float32, device='cuda') + static_output = torch.zeros(1024, dtype=torch.float32, device='cuda') + + with torch.cuda.graph(g): + static_output.copy_(work(static_input)) + + torch.fill_(static_input, 1.0) + g.replay() + + torch.cuda.synchronize() + assert static_output[0].item() == 3.0 + + memory_usage_before = get_current_process_memory_info() + + torch.cuda.synchronize() + virtual_memory.release_with_mark(mark) + + memory_usage_released = get_current_process_memory_info() + if memory_info_available: + assert memory_usage_released < memory_usage_before + + torch.cuda.synchronize() + virtual_memory.materialize_with_mark(mark) + + torch.fill_(static_input, 1.0) + torch.fill_(static_output, 0.0) + g.replay() + + torch.cuda.synchronize() + assert static_output[0].item() == 3.0 + + del static_input, static_output, g, pool From a7962ab17ca4dbb754710509ffe509643e4eda14 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Thu, 24 Jul 2025 16:20:40 +0800 Subject: [PATCH 02/10] fixup! feat: virtual device memory Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- .../tensorrt_llm/runtime/virtualMemory.h | 31 ++++++++++++------- cpp/tensorrt_llm/common/cudaDriverWrapper.h | 2 +- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 16 +++++++--- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h index 39959ed988d..40dbc0a9a3c 100644 --- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h +++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h @@ -55,8 +55,10 @@ class CUDAVirtualMemoryChunk // Note: create() shall not leak resources when throwing exceptions. // release() will only, and will always be called if create() success. + // release() will be called with destructing=true when the CUDAVirtualMemoryChunk + // is being destructed. virtual CUmemGenericAllocationHandle create() = 0; - virtual void release(CUmemGenericAllocationHandle) = 0; + virtual void release(CUmemGenericAllocationHandle handle, bool destructing) = 0; }; using CreatorPtr = std::unique_ptr; @@ -78,8 +80,10 @@ class CUDAVirtualMemoryChunk // Note: setup() shall not leak resources when throwing exceptions. // teardown() will only, and will always be called if setup() success. - virtual void setup(CUmemGenericAllocationHandle) = 0; - virtual void teardown(CUmemGenericAllocationHandle) = 0; + // teardown() will be called with destructing=true when the CUDAVirtualMemoryChunk + // is being destructed. + virtual void setup(CUmemGenericAllocationHandle handle) = 0; + virtual void teardown(CUmemGenericAllocationHandle handle, bool destructing) = 0; }; using ConfiguratorPtr = std::unique_ptr; @@ -134,7 +138,10 @@ class CUDAVirtualMemoryChunk * * Never stops early upon exception. The last thrown exception will be propagated, and others logged. 
*/ - void release(); + void release() + { + _release(false); + } CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk const&) = delete; CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk const&) = delete; @@ -170,7 +177,7 @@ class CUDAVirtualMemoryChunk // we shouldn't call release() again. if (mHandle != 0 && mState != INVALID_STATE) { - release(); + _release(true); } } @@ -183,6 +190,8 @@ class CUDAVirtualMemoryChunk } private: + void _release(bool destructing); + constexpr static size_t INVALID_STATE = static_cast(-1); size_t mState = 0; CUmemGenericAllocationHandle mHandle{}; @@ -214,7 +223,7 @@ struct LocalCreator : CUDAVirtualMemoryChunk::Creator return handle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { TLLM_CU_CHECK_FREE_RESOURCE(cuMemRelease(handle)); if constexpr (count) @@ -246,7 +255,7 @@ struct UnicastConfigurator : CUDAVirtualMemoryChunk::Configurator TLLM_CU_CHECK(cuMemSetAccess(mAddress, mSize, &mDesc, 1)); } - void teardown(CUmemGenericAllocationHandle) override + void teardown(CUmemGenericAllocationHandle, bool) override { TLLM_CU_CHECK_FREE_RESOURCE(cuMemUnmap(mAddress, mSize)); } @@ -266,7 +275,7 @@ struct MulticastConfigurator : CUDAVirtualMemoryChunk::Configurator TLLM_CU_CHECK(cuMulticastBindMem(mMulticast, 0, handle, mBindOffset, mSize, 0)); } - void teardown(CUmemGenericAllocationHandle) override + void teardown(CUmemGenericAllocationHandle, bool) override { TLLM_CU_CHECK_FREE_RESOURCE(cuMulticastUnbind(mMulticast, mDevice, 0, mSize)); } @@ -302,7 +311,7 @@ struct MemsetConfigurator : CUDAVirtualMemoryChunk::Configurator } } - void teardown(CUmemGenericAllocationHandle) noexcept override {} + void teardown(CUmemGenericAllocationHandle, bool) noexcept override {} CUdeviceptr mAddress; size_t mSize; @@ -326,8 +335,8 @@ struct OffloadConfigurator : CUDAVirtualMemoryChunk::Configurator { } - void setup(CUmemGenericAllocationHandle) override; - void teardown(CUmemGenericAllocationHandle) override; + void setup(CUmemGenericAllocationHandle handle) override; + void teardown(CUmemGenericAllocationHandle handle, bool destructing) override; CUdeviceptr mAddress; size_t mSize; diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h index 396071bb314..cc3328993c9 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h @@ -158,7 +158,7 @@ void checkDriver( template void checkDriverExitSafe(T result, char const* const func, char const* const file, int const line) { - if (result) + if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { throw TllmException( file, line, fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %d.", func, result).c_str()); diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index 945a65f3afa..16aba51549f 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -84,7 +84,7 @@ static bool safe_invoke_helper(std::exception_ptr& ep, char const* msg, Callable } } -void CUDAVirtualMemoryChunk::release() +void CUDAVirtualMemoryChunk::_release(bool destructing) { TLLM_CHECK_WITH_INFO(status() == MATERIALIZED || (status() == ERRORED && mState != INVALID_STATE), "virtual memory is in status %d which cannot be released", status()); @@ -99,9 +99,10 @@ void CUDAVirtualMemoryChunk::release() auto const* msg = "Multiple exceptions thrown during release. 
The previous exception is: %s"; for (size_t i = start; i < count; ++i) { - safe_invoke_helper(ePtr, msg, &Configurator::teardown, mConfigurators[count - i - 1].get(), mHandle); + safe_invoke_helper( + ePtr, msg, &Configurator::teardown, mConfigurators[count - i - 1].get(), mHandle, destructing); } - safe_invoke_helper(ePtr, msg, &Creator::release, mCreator.get(), mHandle); + safe_invoke_helper(ePtr, msg, &Creator::release, mCreator.get(), mHandle, destructing); mHandle = {}; mState = 0; @@ -128,8 +129,13 @@ void OffloadConfigurator::setup(CUmemGenericAllocationHandle) } } -void OffloadConfigurator::teardown(CUmemGenericAllocationHandle) +void OffloadConfigurator::teardown(CUmemGenericAllocationHandle, bool destructing) { + if (destructing) + { + return; + } + if (mBackedStorage == nullptr) { switch (mBackType) @@ -140,7 +146,7 @@ void OffloadConfigurator::teardown(CUmemGenericAllocationHandle) } } // We have to synchronize here, or the memory may be unmapped before the copy operation. - TLLM_CU_CHECK(cuMemcpyDtoH_v2(mBackedStorage->data(), mAddress, mSize)); + TLLM_CU_CHECK_FREE_RESOURCE(cuMemcpyDtoH_v2(mBackedStorage->data(), mAddress, mSize)); } void CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk&& memory) From e6ca4c453335e22f8731857aed04757c5228e741 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Mon, 28 Jul 2025 12:23:38 +0800 Subject: [PATCH 03/10] review updates Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- .../tensorrt_llm/runtime/virtualMemory.h | 63 +++----- .../nanobind/runtime/bindings.cpp | 8 +- cpp/tensorrt_llm/pybind/runtime/bindings.cpp | 8 +- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 28 ++-- .../unit_tests/runtime/virtualMemoryTest.cpp | 148 +++++++++--------- tensorrt_llm/_torch/virtual_memory.py | 37 +++-- tests/unittest/_torch/test_virtual_memory.py | 32 ++-- 7 files changed, 153 insertions(+), 171 deletions(-) diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h index 40dbc0a9a3c..0f171dbc8bf 100644 --- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h +++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h @@ -353,31 +353,31 @@ class CudaVirtualMemoryManager /** * Add memory to be managed by this manager. * @param handle Unique handle provided to reference this memory in `remove`. - * @param mark Mark the memory, so this memory can be targeted in `releaseWithMark` and `materializeWithMark`. + * @param tag Tag the memory, so this memory can be targeted in `releaseWithTag` and `materializeWithTag`. * @param memory The CUDAVirtualMemory object. * * The memory and internal state will remain valid if any exception is thrown. */ - void add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk&& memory); + void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk&& memory); /** * Creates and adds memory to be managed by this manager. The created memory is automatically materialized. * @param handle Unique handle provided to reference this memory in `remove`. - * @param mark Mark the memory, so this memory can be targeted in `releaseWithMark` and - * `materializeWithMark`. + * @param tag Tag the memory, so this memory can be targeted in `releaseWithTag` and + * `materializeWithTag`. * @param creator The creator for the memory. * @param configurators The configurators for the memory. * * The internal state will remain valid if any exception is thrown. 
 */
-    void add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
+    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
         CUDAVirtualMemoryChunk::Configurators&& configurators);
 
     template <typename... Configurators>
-    void add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
+    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
         Configurators&&... configurators)
     {
-        add(handle, mark, std::move(creator), {std::forward<Configurators>(configurators)...});
+        add(handle, tag, std::move(creator), {std::forward<Configurators>(configurators)...});
     }
 
     /**
@@ -388,8 +388,8 @@ class CudaVirtualMemoryManager
     CUDAVirtualMemoryChunk remove(uintptr_t handle) noexcept;
 
     /**
-     * Call release for CUDAVirtualMemoryChunk objects with a given mark.
-     * @param mark the mark to select target memories.
+     * Call release for CUDAVirtualMemoryChunk objects with a given tag.
+     * @param tag the tag to select target memories.
      * @return Number of objects selected.
      *
     * This function will always call `CUDAVirtualMemoryChunk::release` on all selected objects.
@@ -398,11 +398,11 @@ class CudaVirtualMemoryManager
     * If any CUDAVirtualMemoryChunk threw an exception during `release`, it will be removed from the manager.
     * Call `retrieveBadHandles` to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
     */
-    size_t releaseWithMark(std::string const& mark);
+    size_t releaseWithTag(std::string const& tag);
 
     /**
-     * Call materialize for CUDAVirtualMemoryChunk objects with a given mark.
-     * @param mark the mark to select target memories.
+     * Call materialize for CUDAVirtualMemoryChunk objects with a given tag.
+     * @param tag the tag to select target memories.
      * @return Number of objects selected.
      *
     * This function will stop at the first `CUDAVirtualMemoryChunk::materialize` that throws exception,
@@ -414,7 +414,7 @@ class CudaVirtualMemoryManager
     * manager. Successfully rolled back CUDAVirtualMemoryChunk will not be removed.
     * Call `retrieveBadHandles` to retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception.
     */
-    size_t materializeWithMark(std::string const& mark);
+    size_t materializeWithTag(std::string const& tag);
 
     /**
     * Retrieve handles of all CUDAVirtualMemoryChunk that got removed due to exception and reset the list.
@@ -431,39 +431,22 @@ class CudaVirtualMemoryManager
     struct Entry;
 
     // Unordered map invalidates iterator upon rehash, so we can only use the ordered map.
using PointerMemoryMap = std::map; - using MarkEntryMap = std::multimap; + using TagEntryMap = std::multimap; struct Entry { CUDAVirtualMemoryChunk mMemory; - MarkEntryMap::iterator mEntryIt; + TagEntryMap::iterator mEntryIt; }; std::mutex mMutex; PointerMemoryMap mMemories; - MarkEntryMap mEntries; + TagEntryMap mEntries; std::vector mBadHandles; friend VirtualMemoryManagerTest; }; -class CudaVirtualAddressManager -{ -public: - static CudaVirtualAddressManager& instance(); - - CUdeviceptr Reserve(size_t size, size_t alignment = 0, CUdeviceptr addr = {}) const; - void Free(CUdeviceptr ptr, size_t size) const; - -private: - explicit CudaVirtualAddressManager(size_t pageSize) - : mPageSize(pageSize) - { - } - - size_t mPageSize; -}; - class CudaVirtualMemoryAllocator { using CudaStreamPtr = std::shared_ptr; @@ -481,7 +464,7 @@ class CudaVirtualMemoryAllocator class Configuration { CudaVirtualMemoryManager& mManager; - std::string mMark; + std::string mTag; CudaStreamPtr mBackStream; std::size_t mPageSize; RestoreMode mMode; @@ -493,14 +476,14 @@ class CudaVirtualMemoryAllocator /** * CudaVirtualMemoryAllocator::Configuration * @param manager Manager used to track and manage virtual memories - * @param mark The mark for allocated memories + * @param tag The tag for allocated memories * @param mode Backed storage mode * @param backStream The CUDA stream used for restoring memory content * Note: Virtual Address Allocation is not async. The stream is not used in allocation. */ - Configuration(CudaVirtualMemoryManager& manager, std::string mark, RestoreMode mode, CudaStreamPtr backStream) + Configuration(CudaVirtualMemoryManager& manager, std::string tag, RestoreMode mode, CudaStreamPtr backStream) : mManager(manager) - , mMark(std::move(mark)) + , mTag(std::move(tag)) , mBackStream(std::move(backStream)) , mPageSize(getpagesize()) , mMode(mode) @@ -511,9 +494,9 @@ class CudaVirtualMemoryAllocator static Configuration backgroundConfiguration; private: - Configuration(CudaVirtualMemoryManager& manager, std::string mark, RestoreMode mode, CudaStreamPtr backStream, + Configuration(CudaVirtualMemoryManager& manager, std::string tag, RestoreMode mode, CudaStreamPtr backStream, bool background) - : Configuration(manager, std::move(mark), mode, std::move(backStream)) + : Configuration(manager, std::move(tag), mode, std::move(backStream)) { mBackground = background; } @@ -544,7 +527,7 @@ namespace tensorrt_llm::runtime CudaVirtualMemoryManager& getVirtualMemoryManager(); CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator(); void pushVirtualMemoryAllocator( - std::string const& mark, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream); + std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream); void popVirtualMemoryAllocator(); } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp index ce0b4ae2efa..6e1247076b4 100644 --- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp @@ -119,8 +119,8 @@ void initBindings(nb::module_& m) .def(nb::self == nb::self); nb::class_(m, "CudaVirtualMemoryManager") - .def("release_with_mark", &tr::CudaVirtualMemoryManager::releaseWithMark, nb::arg("mark")) - .def("materialize_with_mark", &tr::CudaVirtualMemoryManager::materializeWithMark, nb::arg("mark")); + .def("release_with_tag", &tr::CudaVirtualMemoryManager::releaseWithTag, nb::arg("tag")) + 
.def("materialize_with_tag", &tr::CudaVirtualMemoryManager::materializeWithTag, nb::arg("tag")); nb::class_(m, "BufferManager") .def(nb::init(), nb::arg("stream"), nb::arg("trim_pool") = false) @@ -348,9 +348,9 @@ void initBindings(nb::module_& m) m.def( "push_virtual_memory_allocator", - [](std::string const& mark, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) + [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) { - tr::pushVirtualMemoryAllocator(mark, mode, + tr::pushVirtualMemoryAllocator(tag, mode, std::make_shared( reinterpret_cast(stream), tensorrt_llm::common::getDevice(), false)); }, diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp index b26ac7c2b08..df279c124f6 100644 --- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp @@ -216,8 +216,8 @@ void initBindings(pybind11::module_& m) .def(py::self == py::self); py::class_(m, "CudaVirtualMemoryManager") - .def("release_with_mark", &tr::CudaVirtualMemoryManager::releaseWithMark, py::arg("mark")) - .def("materialize_with_mark", &tr::CudaVirtualMemoryManager::materializeWithMark, py::arg("mark")); + .def("release_with_tag", &tr::CudaVirtualMemoryManager::releaseWithTag, py::arg("tag")) + .def("materialize_with_tag", &tr::CudaVirtualMemoryManager::materializeWithTag, py::arg("tag")); py::classh(m, "BufferManager") .def(py::init(), py::arg("stream"), py::arg("trim_pool") = false) @@ -441,9 +441,9 @@ void initBindings(pybind11::module_& m) m.def( "push_virtual_memory_allocator", - [](std::string const& mark, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) + [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) { - tr::pushVirtualMemoryAllocator(mark, mode, + tr::pushVirtualMemoryAllocator(tag, mode, std::make_shared( reinterpret_cast(stream), tensorrt_llm::common::getDevice(), false)); }, diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index 16aba51549f..17cc6d9348d 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -149,7 +149,7 @@ void OffloadConfigurator::teardown(CUmemGenericAllocationHandle, bool destructin TLLM_CU_CHECK_FREE_RESOURCE(cuMemcpyDtoH_v2(mBackedStorage->data(), mAddress, mSize)); } -void CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk&& memory) +void CudaVirtualMemoryManager::add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk&& memory) { bool success = false; @@ -163,14 +163,14 @@ void CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtu created, "CudaVirtualMemoryManager: handle 0x%016zx already being used by another memory", handle); ScopeGuard eraseMemIt{success, [&, memIt_ = memIt] { mMemories.erase(memIt_); }}; - auto const entryIt = mEntries.emplace(std::move(mark), memIt); + auto const entryIt = mEntries.emplace(std::move(tag), memIt); entryIt->second->second.mEntryIt = entryIt; memIt->second.mMemory = std::move(memory); success = true; } -void CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtualMemoryChunk::CreatorPtr&& creator, +void CudaVirtualMemoryManager::add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator, CUDAVirtualMemoryChunk::Configurators&& configurators) { std::unique_lock lock(mMutex); @@ -184,9 +184,9 @@ void 
CudaVirtualMemoryManager::add(uintptr_t handle, std::string mark, CUDAVirtu created, "CudaVirtualMemoryManager: handle 0x%016zx already being used by another memory", handle); ScopeGuard eraseMemIt{success, [&, memIt_ = memIt] { mMemories.erase(memIt_); }}; - auto const entryIt = mEntries.emplace(std::move(mark), memIt); + auto const entryIt = mEntries.emplace(std::move(tag), memIt); memIt->second.mEntryIt = entryIt; - ScopeGuard eraseMarkIt{success, [&] { mEntries.erase(entryIt); }}; + ScopeGuard eraseTagIt{success, [&] { mEntries.erase(entryIt); }}; try { @@ -238,12 +238,12 @@ std::vector CudaVirtualMemoryManager::retrieveBadHandles() noexcept return std::move(mBadHandles); } -size_t CudaVirtualMemoryManager::releaseWithMark(std::string const& mark) +size_t CudaVirtualMemoryManager::releaseWithTag(std::string const& tag) { std::unique_lock lock(mMutex); std::exception_ptr ePtr{}; - auto [begin, end] = mEntries.equal_range(mark); + auto [begin, end] = mEntries.equal_range(tag); size_t count = 0; for (auto it = begin; it != end;) { @@ -253,7 +253,7 @@ size_t CudaVirtualMemoryManager::releaseWithMark(std::string const& mark) if (memory.status() == CUDAVirtualMemoryChunk::MATERIALIZED) { if (!safe_invoke_helper(ePtr, - "Multiple exceptions thrown during releaseWithMark. The previous exception is: %s", + "Multiple exceptions thrown during releaseWithTag. The previous exception is: %s", &CUDAVirtualMemoryChunk::release, &memory)) { addBadHandle(handle); @@ -271,11 +271,11 @@ size_t CudaVirtualMemoryManager::releaseWithMark(std::string const& mark) return count; } -size_t CudaVirtualMemoryManager::materializeWithMark(std::string const& mark) +size_t CudaVirtualMemoryManager::materializeWithTag(std::string const& tag) { std::unique_lock lock(mMutex); - auto [begin, end] = mEntries.equal_range(mark); + auto [begin, end] = mEntries.equal_range(tag); size_t count = 0; auto it = begin; @@ -307,7 +307,7 @@ size_t CudaVirtualMemoryManager::materializeWithMark(std::string const& mark) { addBadHandle(handle); unsafeRemove(handle); - TLLM_LOG_ERROR("Additional exception thrown during rollback of materializeWithMark: %s", e.what()); + TLLM_LOG_ERROR("Additional exception thrown during rollback of materializeWithTag: %s", e.what()); } } @@ -365,7 +365,7 @@ void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int devic break; } - mConfig->mManager.add(address, mConfig->mMark, + mConfig->mManager.add(address, mConfig->mTag, std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE, { CU_MEM_LOCATION_TYPE_DEVICE, @@ -409,9 +409,9 @@ CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator() } void pushVirtualMemoryAllocator( - std::string const& mark, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream) + std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream) { - vmAllocators.emplace_front(std::make_shared(getVirtualMemoryManager(), mark, mode, backStream)); + vmAllocators.emplace_front(std::make_shared(getVirtualMemoryManager(), tag, mode, backStream)); } void popVirtualMemoryAllocator() diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp index eacbfe6ad3e..70111fe7796 100644 --- a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp +++ b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp @@ -277,7 +277,7 @@ TEST_F(VirtualMemoryTest, TestOrder) return createdHandle; } - void release(CUmemGenericAllocationHandle handle) override 
+ void release(CUmemGenericAllocationHandle handle, bool destructing) override { releaseOrder = ++mCallOrder; ASSERT_EQ(handle, createdHandle); @@ -304,7 +304,7 @@ TEST_F(VirtualMemoryTest, TestOrder) setupOrder = ++mCallOrder; } - void teardown(CUmemGenericAllocationHandle handle) override + void teardown(CUmemGenericAllocationHandle handle, bool destructing) override { teardownOrder = ++mCallOrder; } @@ -377,7 +377,7 @@ TEST_F(VirtualMemoryTest, TestException) return createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { releaseCalled = true; ASSERT_EQ(handle, createdHandle); @@ -412,7 +412,7 @@ TEST_F(VirtualMemoryTest, TestException) } } - void teardown(CUmemGenericAllocationHandle handle) override + void teardown(CUmemGenericAllocationHandle handle, bool destructing) override { teardownCalled = true; if (throwOnTeardown) @@ -662,7 +662,7 @@ TEST_F(VirtualMemoryTest, TestDestructor) return createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { releaseCalledRef = true; ASSERT_EQ(handle, createdHandle); @@ -689,7 +689,7 @@ TEST_F(VirtualMemoryTest, TestDestructor) setupCalledRef = true; } - void teardown(CUmemGenericAllocationHandle) override + void teardown(CUmemGenericAllocationHandle, bool) override { teardownCalledRef = true; } @@ -791,7 +791,7 @@ TEST_F(VirtualMemoryTest, TestDestructor) throw DummyException(); } - void teardown(CUmemGenericAllocationHandle) override + void teardown(CUmemGenericAllocationHandle, bool) override { teardownCalledRef = true; } @@ -842,7 +842,7 @@ TEST_F(VirtualMemoryTest, TestEdgeCases) return createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { ASSERT_EQ(handle, createdHandle); } @@ -903,7 +903,7 @@ TEST_F(VirtualMemoryTest, TestEdgeCases) } } - void teardown(CUmemGenericAllocationHandle) override {} + void teardown(CUmemGenericAllocationHandle, bool) override {} }; auto creator = std::make_unique(); @@ -966,7 +966,7 @@ TEST_F(VirtualMemoryManagerTest, TestBasic) TLLM_CU_CHECK(cuMemAddressReserve(&address, size, 0, {}, 0)); uintptr_t handle = static_cast(address); - std::string mark = "test_mark"; + std::string tag = "test_tag"; CUDAVirtualMemoryChunk::CreatorPtr creator = std::make_unique>(CUmemAllocationProp{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE, @@ -987,7 +987,7 @@ TEST_F(VirtualMemoryManagerTest, TestBasic) auto memoryBegin = getCurrentProcessMemoryInfo(); // Add to manager - this automatically materializes - mVMManager->add(handle, mark, std::move(creator), std::move(configurators)); + mVMManager->add(handle, tag, std::move(creator), std::move(configurators)); auto memoryMaterialized = getCurrentProcessMemoryInfo(); if (memoryInfoAvailable()) @@ -1001,23 +1001,23 @@ TEST_F(VirtualMemoryManagerTest, TestBasic) TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); // Release memory through manager - auto releaseCount = mVMManager->releaseWithMark(mark); + auto releaseCount = mVMManager->releaseWithTag(tag); ASSERT_EQ(releaseCount, 1) << "Expected to release 1 memory object"; auto memoryReleased = getCurrentProcessMemoryInfo(); if (memoryInfoAvailable()) { - ASSERT_EQ(memoryBegin, memoryReleased) << "releaseWithMark does not release memory"; + ASSERT_EQ(memoryBegin, memoryReleased) << "releaseWithTag does not release memory"; } // 
Materialize again through manager - auto materializeCount = mVMManager->materializeWithMark(mark); + auto materializeCount = mVMManager->materializeWithTag(tag); ASSERT_EQ(materializeCount, 1) << "Expected to materialize 1 memory object"; auto memoryRematerialized = getCurrentProcessMemoryInfo(); if (memoryInfoAvailable()) { - ASSERT_EQ(memoryBegin + size, memoryRematerialized) << "materializeWithMark does not allocate memory"; + ASSERT_EQ(memoryBegin + size, memoryRematerialized) << "materializeWithTag does not allocate memory"; } // Test memory access after rematerialization @@ -1041,9 +1041,9 @@ TEST_F(VirtualMemoryManagerTest, TestBasic) ASSERT_FALSE(unknownMemory) << "Expect invalid memory for unknown handle"; } -TEST_F(VirtualMemoryManagerTest, TestMarks) +TEST_F(VirtualMemoryManagerTest, TestTags) { - // Dummy Creator for testing mark functionality + // Dummy Creator for testing tag functionality class DummyCreator : public CUDAVirtualMemoryChunk::Creator { public: @@ -1057,7 +1057,7 @@ TEST_F(VirtualMemoryManagerTest, TestMarks) return createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { releaseCalled = true; ASSERT_EQ(handle, createdHandle); @@ -1076,10 +1076,10 @@ TEST_F(VirtualMemoryManagerTest, TestMarks) auto* creator3Ptr = creator3.get(); auto* creator4Ptr = creator4.get(); - mVMManager->add(0x1000, "mark_A", std::move(creator1), {}); - mVMManager->add(0x2000, "mark_B", std::move(creator2), {}); - mVMManager->add(0x3000, "mark_A", std::move(creator3), {}); - mVMManager->add(0x4000, "mark_C", std::move(creator4), {}); + mVMManager->add(0x1000, "tag_A", std::move(creator1), {}); + mVMManager->add(0x2000, "tag_B", std::move(creator2), {}); + mVMManager->add(0x3000, "tag_A", std::move(creator3), {}); + mVMManager->add(0x4000, "tag_C", std::move(creator4), {}); // All should be materialized initially (since add() materializes automatically) EXPECT_TRUE(creator1Ptr->createCalled); @@ -1087,43 +1087,43 @@ TEST_F(VirtualMemoryManagerTest, TestMarks) EXPECT_TRUE(creator3Ptr->createCalled); EXPECT_TRUE(creator4Ptr->createCalled); - // Reset create flags to test materializeWithMark later + // Reset create flags to test materializeWithTag later creator1Ptr->createCalled = false; creator2Ptr->createCalled = false; creator3Ptr->createCalled = false; creator4Ptr->createCalled = false; - // Test releaseWithMark - should release only memories with "mark_A" - auto releaseCount = mVMManager->releaseWithMark("mark_A"); - EXPECT_EQ(releaseCount, 2); // Should release 2 memories with mark_A + // Test releaseWithTag - should release only memories with "tag_A" + auto releaseCount = mVMManager->releaseWithTag("tag_A"); + EXPECT_EQ(releaseCount, 2); // Should release 2 memories with tag_A - // Verify only mark_A memories were released - EXPECT_TRUE(creator1Ptr->releaseCalled); // mark_A - EXPECT_FALSE(creator2Ptr->releaseCalled); // mark_B - EXPECT_TRUE(creator3Ptr->releaseCalled); // mark_A - EXPECT_FALSE(creator4Ptr->releaseCalled); // mark_C + // Verify only tag_A memories were released + EXPECT_TRUE(creator1Ptr->releaseCalled); // tag_A + EXPECT_FALSE(creator2Ptr->releaseCalled); // tag_B + EXPECT_TRUE(creator3Ptr->releaseCalled); // tag_A + EXPECT_FALSE(creator4Ptr->releaseCalled); // tag_C - // Test materializeWithMark - should materialize only memories with "mark_A" - auto materializeCount = mVMManager->materializeWithMark("mark_A"); - EXPECT_EQ(materializeCount, 2); // Should materialize 2 
memories with mark_A + // Test materializeWithTag - should materialize only memories with "tag_A" + auto materializeCount = mVMManager->materializeWithTag("tag_A"); + EXPECT_EQ(materializeCount, 2); // Should materialize 2 memories with tag_A - // Verify only mark_A memories were materialized - EXPECT_TRUE(creator1Ptr->createCalled); // mark_A - EXPECT_FALSE(creator2Ptr->createCalled); // mark_B - EXPECT_TRUE(creator3Ptr->createCalled); // mark_A - EXPECT_FALSE(creator4Ptr->createCalled); // mark_C + // Verify only tag_A memories were materialized + EXPECT_TRUE(creator1Ptr->createCalled); // tag_A + EXPECT_FALSE(creator2Ptr->createCalled); // tag_B + EXPECT_TRUE(creator3Ptr->createCalled); // tag_A + EXPECT_FALSE(creator4Ptr->createCalled); // tag_C - // Reset flags and test releasing with a different mark + // Reset flags and test releasing with a different tag creator2Ptr->releaseCalled = false; - releaseCount = mVMManager->releaseWithMark("mark_B"); - EXPECT_EQ(releaseCount, 1); // Should release 1 memory with mark_B - EXPECT_TRUE(creator2Ptr->releaseCalled); // mark_B should now be released + releaseCount = mVMManager->releaseWithTag("tag_B"); + EXPECT_EQ(releaseCount, 1); // Should release 1 memory with tag_B + EXPECT_TRUE(creator2Ptr->releaseCalled); // tag_B should now be released - // Test with non-existent mark - releaseCount = mVMManager->releaseWithMark("nonexistent_mark"); + // Test with non-existent tag + releaseCount = mVMManager->releaseWithTag("nonexistent_tag"); EXPECT_EQ(releaseCount, 0); // Should release 0 memories - materializeCount = mVMManager->materializeWithMark("nonexistent_mark"); + materializeCount = mVMManager->materializeWithTag("nonexistent_tag"); EXPECT_EQ(materializeCount, 0); // Should materialize 0 memories // Clean up - remove all memories @@ -1146,7 +1146,7 @@ TEST_F(VirtualMemoryManagerTest, TestAddException) return createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { ASSERT_EQ(handle, createdHandle); } @@ -1161,14 +1161,14 @@ TEST_F(VirtualMemoryManagerTest, TestAddException) throw DummyException(); } - void teardown(CUmemGenericAllocationHandle) override + void teardown(CUmemGenericAllocationHandle, bool) override { ASSERT_TRUE(false) << "Unreachable"; } }; uintptr_t handle = 0x12345678; - std::string mark = "test_mark"; + std::string tag = "test_tag"; // Verify initial state is clean EXPECT_TRUE(memories().empty()); @@ -1180,7 +1180,7 @@ TEST_F(VirtualMemoryManagerTest, TestAddException) configurators.push_back(std::make_unique()); // add() should throw because materialize() will fail due to ThrowingConfigurator - EXPECT_THROW(mVMManager->add(handle, mark, std::move(creator), std::move(configurators)), DummyException); + EXPECT_THROW(mVMManager->add(handle, tag, std::move(creator), std::move(configurators)), DummyException); // Verify that the manager state is clean after the exception // The ScopeGuards in add() should have cleaned up properly @@ -1193,7 +1193,7 @@ TEST_F(VirtualMemoryManagerTest, TestAddException) CUDAVirtualMemoryChunk::Configurators successConfigurators; // Empty configurators should work // This should succeed without throwing - EXPECT_NO_THROW(mVMManager->add(handle, mark, std::move(successCreator), std::move(successConfigurators))); + EXPECT_NO_THROW(mVMManager->add(handle, tag, std::move(successCreator), std::move(successConfigurators))); // Verify that the manager now has the entry EXPECT_EQ(memories().size(), 1); @@ 
-1245,7 +1245,7 @@ TEST_F(VirtualMemoryManagerTest, TestMaterializeException) return state.createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { state.releaseCalled = true; ASSERT_EQ(handle, state.createdHandle); @@ -1275,16 +1275,16 @@ TEST_F(VirtualMemoryManagerTest, TestMaterializeException) CUDAVirtualMemoryChunk vm2(std::move(creator2), {}); CUDAVirtualMemoryChunk vm3(std::move(creator3), {}); - mVMManager->add(0x1000, "test_mark", std::move(vm1)); - mVMManager->add(0x2000, "test_mark", std::move(vm2)); - mVMManager->add(0x3000, "test_mark", std::move(vm3)); + mVMManager->add(0x1000, "test_tag", std::move(vm1)); + mVMManager->add(0x2000, "test_tag", std::move(vm2)); + mVMManager->add(0x3000, "test_tag", std::move(vm3)); // Verify initial state is clean EXPECT_TRUE(badHandles().empty()); - // materializeWithMark should stop at the first exception (second memory by create order) + // materializeWithTag should stop at the first exception (second memory by create order) // and attempt to rollback the first memory that succeeded - EXPECT_THROW(mVMManager->materializeWithMark("test_mark"), DummyException); + EXPECT_THROW(mVMManager->materializeWithTag("test_tag"), DummyException); // Find which creators were called and in what order std::vector> creators @@ -1379,7 +1379,7 @@ TEST_F(VirtualMemoryManagerTest, TestReleaseException) return state.createdHandle; } - void release(CUmemGenericAllocationHandle handle) override + void release(CUmemGenericAllocationHandle handle, bool destructing) override { state.releaseCalled = true; ASSERT_EQ(handle, state.createdHandle); @@ -1406,7 +1406,7 @@ TEST_F(VirtualMemoryManagerTest, TestReleaseException) state.setupCalled = true; } - void teardown(CUmemGenericAllocationHandle) override + void teardown(CUmemGenericAllocationHandle, bool) override { state.teardownCalled = true; if (++state.teardownCounter == state.throwOnTeardownCount) @@ -1454,20 +1454,20 @@ TEST_F(VirtualMemoryManagerTest, TestReleaseException) CUDAVirtualMemoryChunk::Configurators configurators4; configurators4.push_back(std::move(config4)); - mVMManager->add(0x1000, "test_mark", std::move(creator1), std::move(configurators1)); - mVMManager->add(0x2000, "test_mark", std::move(creator2), std::move(configurators2)); - mVMManager->add(0x3000, "test_mark", std::move(creator3), std::move(configurators3)); - mVMManager->add(0x4000, "other_mark", std::move(creator4), std::move(configurators4)); + mVMManager->add(0x1000, "test_tag", std::move(creator1), std::move(configurators1)); + mVMManager->add(0x2000, "test_tag", std::move(creator2), std::move(configurators2)); + mVMManager->add(0x3000, "test_tag", std::move(creator3), std::move(configurators3)); + mVMManager->add(0x4000, "other_tag", std::move(creator4), std::move(configurators4)); // Verify initial state EXPECT_TRUE(badHandles().empty()); - // releaseWithMark should call release on all memories with "test_mark" + // releaseWithTag should call release on all memories with "test_tag" // and continue despite exceptions - EXPECT_THROW(mVMManager->releaseWithMark("test_mark"), DummyException); + EXPECT_THROW(mVMManager->releaseWithTag("test_tag"), DummyException); // Verify behavior: - // - All memories with "test_mark" should have had release() attempted + // - All memories with "test_tag" should have had release() attempted EXPECT_TRUE(state1.releaseCalled); EXPECT_TRUE(configState1.teardownCalled); @@ -1477,7 +1477,7 @@ 
TEST_F(VirtualMemoryManagerTest, TestReleaseException) EXPECT_TRUE(state3.releaseCalled); EXPECT_TRUE(configState3.teardownCalled); - // - Memory with different mark should not be affected + // - Memory with different tag should not be affected EXPECT_FALSE(state4.releaseCalled); EXPECT_FALSE(configState4.teardownCalled); @@ -1496,13 +1496,13 @@ TEST_F(VirtualMemoryManagerTest, TestReleaseException) EXPECT_TRUE(removedMem1); // Should have been removed due to exception EXPECT_FALSE(removedMem2); // Should have been removed due to exception EXPECT_FALSE(removedMem3); // Should have been removed due to exception - EXPECT_TRUE(removedMem4); // Should still be in manager (different mark, not affected) + EXPECT_TRUE(removedMem4); // Should still be in manager (different tag, not affected) } TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator) { std::size_t constexpr size = 64 * 1024 * 1024; // 64 MB - std::string mark = "test_allocator_mark"; + std::string tag = "test_allocator_tag"; // Create a CUDA stream for the allocator CudaStream stream; @@ -1510,7 +1510,7 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator) // Create configuration for the virtual address allocator auto config = std::make_shared( - *mVMManager.get(), mark, CudaVirtualMemoryAllocator::RestoreMode::NONE, streamPtr); + *mVMManager.get(), tag, CudaVirtualMemoryAllocator::RestoreMode::NONE, streamPtr); auto memoryBegin = getCurrentProcessMemoryInfo(); @@ -1536,8 +1536,8 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator) ASSERT_EQ(result, CUDA_SUCCESS) << "Memory access should succeed"; TLLM_CU_CHECK(cuStreamSynchronize(nullptr)); - // Test releasing memory with mark - this should free the virtual memory - auto releaseCount = mVMManager->releaseWithMark(mark); + // Test releasing memory with tag - this should free the virtual memory + auto releaseCount = mVMManager->releaseWithTag(tag); ASSERT_EQ(releaseCount, 1) << "Expected to release 1 memory object"; auto memoryAfterRelease = getCurrentProcessMemoryInfo(); @@ -1546,8 +1546,8 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator) ASSERT_EQ(memoryBegin, memoryAfterRelease) << "Release should free the memory"; } - // Test materializing memory with mark - this should re-allocate the virtual memory - auto materializeCount = mVMManager->materializeWithMark(mark); + // Test materializing memory with tag - this should re-allocate the virtual memory + auto materializeCount = mVMManager->materializeWithTag(tag); ASSERT_EQ(materializeCount, 1) << "Expected to materialize 1 memory object"; auto memoryAfterMaterialize = getCurrentProcessMemoryInfo(); diff --git a/tensorrt_llm/_torch/virtual_memory.py b/tensorrt_llm/_torch/virtual_memory.py index debd376c8d8..8a7cec2aede 100644 --- a/tensorrt_llm/_torch/virtual_memory.py +++ b/tensorrt_llm/_torch/virtual_memory.py @@ -11,8 +11,8 @@ push_virtual_memory_allocator) __all__ = [ - "RestoreMode", "maybe_scope", "scope", "release_with_mark", - "materialize_with_mark" + "RestoreMode", "maybe_scope", "scope", "release_with_tag", + "materialize_with_tag" ] @@ -27,9 +27,9 @@ def _get_torch_pluggable_virtual_memory_allocator(): @contextmanager -def _virtual_address_helper(mark: str, mode: RestoreMode): +def _virtual_address_helper(tag: str, mode: RestoreMode): stream = torch.cuda.current_stream() - push_virtual_memory_allocator(mark, mode, stream.cuda_stream) + push_virtual_memory_allocator(tag, mode, stream.cuda_stream) try: yield finally: @@ -37,17 +37,17 @@ def _virtual_address_helper(mark: 
str, mode: RestoreMode): def _scope( - mark: str, + tag: str, mode: RestoreMode = RestoreMode.NONE ) -> Generator[torch.cuda.MemPool, None, None]: """A context manager that routes allocations to virtual memory allocator - using given mark and backed mode. + using given tag and backed mode. - :param mark: The mark to reference the memory for release and materialize + :param tag: The tag to reference the memory for release and materialize :param mode: The backed mode to choose how the memory content is backed up """ pool = torch.cuda.MemPool(_get_torch_pluggable_virtual_memory_allocator()) - with _virtual_address_helper(mark, mode), torch.cuda.use_mem_pool(pool): + with _virtual_address_helper(tag, mode), torch.cuda.use_mem_pool(pool): yield pool @@ -57,33 +57,32 @@ def _scope( @contextmanager def maybe_scope( enable: bool, - mark: str, + tag: str, mode: RestoreMode = RestoreMode.NONE ) -> Generator[torch.cuda.MemPool | None, None, None]: if enable: - yield from _scope(mark, mode) + yield from _scope(tag, mode) else: yield -def release_with_mark(*marks: str) -> int: - """Release virtual memory allocated with given marks +def release_with_tag(*tags: str) -> int: + """Release virtual memory allocated with given tags - :param marks: The mark of the scope when the virtual memory is allocated + :param tags: The tag of the scope when the virtual memory is allocated :return: Number of memory blobs released """ manager = get_virtual_memory_manager() - released_blobs = sum(manager.release_with_mark(mark) for mark in marks) + released_blobs = sum(manager.release_with_tag(tag) for tag in tags) return released_blobs -def materialize_with_mark(*marks: str) -> int: - """Materialize virtual memory allocated with given marks +def materialize_with_tag(*tags: str) -> int: + """Materialize virtual memory allocated with given tags - :param marks: The mark of the scope when the virtual memory is allocated + :param tags: The tag of the scope when the virtual memory is allocated :return: Number of memory blobs materialized """ manager = get_virtual_memory_manager() - materialized_blobs = sum( - manager.materialize_with_mark(mark) for mark in marks) + materialized_blobs = sum(manager.materialize_with_tag(tag) for tag in tags) return materialized_blobs diff --git a/tests/unittest/_torch/test_virtual_memory.py b/tests/unittest/_torch/test_virtual_memory.py index 0d13a1b84ee..ce1b58ce701 100644 --- a/tests/unittest/_torch/test_virtual_memory.py +++ b/tests/unittest/_torch/test_virtual_memory.py @@ -88,9 +88,9 @@ def test_basic(memory_info_available): memory_usage_begin = get_current_process_memory_info() alloc_size = 256 * 1024 * 1024 - mark = "test_mark" + tag = "test_tag" - with virtual_memory.scope(mark) as pool: + with virtual_memory.scope(tag) as pool: tensor = torch.full([alloc_size], 42, dtype=torch.int8, device='cuda') memory_usage_materialized = get_current_process_memory_info() if memory_info_available: @@ -99,14 +99,14 @@ def test_basic(memory_info_available): assert tensor[0].item() == 42 torch.cuda.synchronize() - virtual_memory.release_with_mark(mark) + virtual_memory.release_with_tag(tag) memory_usage_released = get_current_process_memory_info() if memory_info_available: assert memory_usage_begin == memory_usage_released torch.cuda.synchronize() - virtual_memory.materialize_with_mark(mark) + virtual_memory.materialize_with_tag(tag) memory_usage_rematerialized = get_current_process_memory_info() if memory_info_available: @@ -125,19 +125,19 @@ def test_basic(memory_info_available): def test_restore(): 
alloc_size = 1024 * 1024 - mark = "test_mark" + tag = "test_tag" - with virtual_memory.scope(mark, virtual_memory.RestoreMode.PINNED) as pool: + with virtual_memory.scope(tag, virtual_memory.RestoreMode.PINNED) as pool: tensor = torch.full([alloc_size], 42, dtype=torch.int8, device='cuda') assert tensor[0].item() == 42 torch.cuda.synchronize() - virtual_memory.release_with_mark(mark) + virtual_memory.release_with_tag(tag) torch.cuda.synchronize() - virtual_memory.materialize_with_mark(mark) + virtual_memory.materialize_with_tag(tag) torch.cuda.synchronize() assert tensor[0].item() == 42 @@ -166,7 +166,7 @@ def test_kv_cache_manager(memory_info_available): memory_usage_begin = get_current_process_memory_info() - mark = "test_mark" + tag = "test_tag" cache_size = torch.empty( [ 2, # KV @@ -180,21 +180,21 @@ def test_kv_cache_manager(memory_info_available): alloc_size = cache_size.nelement() - with virtual_memory.scope(mark) as pool: + with virtual_memory.scope(tag) as pool: mgr = KVCacheManager(**kv_cache_params) memory_usage_materialized = get_current_process_memory_info() if memory_info_available: assert memory_usage_begin + alloc_size == memory_usage_materialized torch.cuda.synchronize() - virtual_memory.release_with_mark(mark) + virtual_memory.release_with_tag(tag) memory_usage_released = get_current_process_memory_info() if memory_info_available: assert memory_usage_begin == memory_usage_released torch.cuda.synchronize() - virtual_memory.materialize_with_mark(mark) + virtual_memory.materialize_with_tag(tag) memory_usage_rematerialized = get_current_process_memory_info() if memory_info_available: @@ -217,9 +217,9 @@ def work(input: torch.Tensor) -> torch.Tensor: return output g = torch.cuda.CUDAGraph() - mark = "cuda_graph" + tag = "cuda_graph" - with virtual_memory.scope(mark) as pool: + with virtual_memory.scope(tag) as pool: static_input = torch.ones(1024, dtype=torch.float32, device='cuda') static_output = torch.zeros(1024, dtype=torch.float32, device='cuda') @@ -235,14 +235,14 @@ def work(input: torch.Tensor) -> torch.Tensor: memory_usage_before = get_current_process_memory_info() torch.cuda.synchronize() - virtual_memory.release_with_mark(mark) + virtual_memory.release_with_tag(tag) memory_usage_released = get_current_process_memory_info() if memory_info_available: assert memory_usage_released < memory_usage_before torch.cuda.synchronize() - virtual_memory.materialize_with_mark(mark) + virtual_memory.materialize_with_tag(tag) torch.fill_(static_input, 1.0) torch.fill_(static_output, 0.0) From b1931ef9122b68793a7cd8e3c907c9599161d2d7 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Tue, 29 Jul 2025 10:56:36 +0800 Subject: [PATCH 04/10] cast check Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- cpp/tensorrt_llm/nanobind/runtime/bindings.cpp | 1 + cpp/tensorrt_llm/pybind/runtime/bindings.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp index 6e1247076b4..384720550be 100644 --- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp @@ -350,6 +350,7 @@ void initBindings(nb::module_& m) "push_virtual_memory_allocator", [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) { + static_assert(sizeof(uintptr_t) == sizeof(cudaStream_t)); tr::pushVirtualMemoryAllocator(tag, mode, std::make_shared( reinterpret_cast(stream), 
                    tensorrt_llm::common::getDevice(), false));
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index df279c124f6..c9d8d52b38a 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -443,6 +443,7 @@ void initBindings(pybind11::module_& m)
         "push_virtual_memory_allocator",
         [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream)
         {
+            static_assert(sizeof(uintptr_t) == sizeof(cudaStream_t));
             tr::pushVirtualMemoryAllocator(tag, mode,
                 std::make_shared<tr::CudaStream>(
                     reinterpret_cast<cudaStream_t>(stream), tensorrt_llm::common::getDevice(), false));

From 5fcab089b0a09054aed7b3f6403487e975c899d8 Mon Sep 17 00:00:00 2001
From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
Date: Wed, 30 Jul 2025 11:24:05 +0800
Subject: [PATCH 05/10] address review

Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
---
 cpp/include/tensorrt_llm/runtime/virtualMemory.h   | 2 +-
 cpp/tensorrt_llm/runtime/virtualMemory.cpp         | 2 +-
 cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp   | 2 +-
 cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h
index 0f171dbc8bf..a3549229eba 100644
--- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h
+++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp
index 17cc6d9348d..b41cdffd355 100644
--- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp
+++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp b/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp
index 9904d985021..77c2c3dfd82 100644
--- a/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp
+++ b/cpp/tensorrt_llm/thop/virtualMemoryAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
index 70111fe7796..970a05299b1 100644
--- a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
+++ b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
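For illustration, a minimal sketch of a custom Configurator against the interface as revised in the patches above: release() and teardown() now receive a destructing flag so implementations can skip work that is pointless or unsafe while the object is being destroyed (OffloadConfigurator uses it to skip the device-to-host backup copy). LoggingConfigurator and its printf logging are hypothetical and not part of this series:

    // Hypothetical Configurator honoring the revised teardown(handle, destructing) contract.
    #include "tensorrt_llm/runtime/virtualMemory.h"

    #include <cstdio>

    namespace tr = tensorrt_llm::runtime;

    struct LoggingConfigurator : tr::CUDAVirtualMemoryChunk::Configurator
    {
        // Called after Creator::create(); may throw, but must not leak on throw.
        void setup(CUmemGenericAllocationHandle handle) override
        {
            std::printf("materialized handle %llu\n", static_cast<unsigned long long>(handle));
        }

        // Called in reverse setup order. `destructing` is true when invoked from a
        // destructor, where non-essential work (like OffloadConfigurator's backup
        // copy) should be skipped.
        void teardown(CUmemGenericAllocationHandle handle, bool destructing) override
        {
            if (destructing)
            {
                return;
            }
            std::printf("released handle %llu\n", static_cast<unsigned long long>(handle));
        }
    };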
From ea689f516e4074ffdde9193aebbac706d839623d Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Wed, 30 Jul 2025 14:41:18 +0800 Subject: [PATCH 06/10] address review Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- cpp/include/tensorrt_llm/runtime/virtualMemory.h | 5 +++++ cpp/tensorrt_llm/runtime/virtualMemory.cpp | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h index a3549229eba..c4b2631f3d4 100644 --- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h +++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h @@ -490,6 +490,11 @@ class CudaVirtualMemoryAllocator { } + [[nodiscard]] std::size_t pageAligned(std::size_t n) const noexcept + { + return (n + mPageSize - 1) & ~(mPageSize - 1); + } + // Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user. static Configuration backgroundConfiguration; diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index b41cdffd355..d73cae9fa25 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -338,7 +338,7 @@ static void* deviceptr_cast(CUdeviceptr ptr) void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const { CUdeviceptr address{}; - std::size_t const pageAlignedSize = (n + mConfig->mPageSize - 1) & ~(mConfig->mPageSize - 1); + std::size_t const pageAlignedSize = mConfig->pageAligned(n); TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0)); CUDAVirtualMemoryChunk::Configurators configurators; @@ -382,7 +382,7 @@ void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const auto const address = deviceptr_cast(ptr); mConfig->mManager.remove(address); - std::size_t const pageAlignedSize = (n + mConfig->mPageSize - 1) & ~(mConfig->mPageSize - 1); + std::size_t const pageAlignedSize = mConfig->pageAligned(n); TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize)); } @@ -400,22 +400,27 @@ CudaVirtualMemoryManager& getVirtualMemoryManager() using AllocConf = CudaVirtualMemoryAllocator::Configuration; AllocConf AllocConf::backgroundConfiguration{getVirtualMemoryManager(), "", NONE, nullptr, true}; + +static std::mutex vmAllocatorsMutex; static std::forward_list vmAllocators{ CudaVirtualMemoryAllocator{{std::shared_ptr{}, &AllocConf::backgroundConfiguration}}}; CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator() { + std::unique_lock lock(vmAllocatorsMutex); return vmAllocators.front(); } void pushVirtualMemoryAllocator( std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream) { + std::unique_lock lock(vmAllocatorsMutex); vmAllocators.emplace_front(std::make_shared(getVirtualMemoryManager(), tag, mode, backStream)); } void popVirtualMemoryAllocator() { + std::unique_lock lock(vmAllocatorsMutex); vmAllocators.pop_front(); } From 8775e85721708b2ced080277d01eec147530aa79 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Wed, 30 Jul 2025 14:54:23 +0800 Subject: [PATCH 07/10] shared mutex Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp 
b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index d73cae9fa25..39fb6bdbc47 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -18,6 +18,7 @@ #include "bufferManager.h" #include +#include namespace tensorrt_llm::runtime { @@ -401,13 +402,13 @@ using AllocConf = CudaVirtualMemoryAllocator::Configuration; AllocConf AllocConf::backgroundConfiguration{getVirtualMemoryManager(), "", NONE, nullptr, true}; -static std::mutex vmAllocatorsMutex; +static std::shared_mutex vmAllocatorsMutex; static std::forward_list vmAllocators{ CudaVirtualMemoryAllocator{{std::shared_ptr{}, &AllocConf::backgroundConfiguration}}}; CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator() { - std::unique_lock lock(vmAllocatorsMutex); + std::shared_lock lock(vmAllocatorsMutex); return vmAllocators.front(); } From 21865781d5dca46253b244e4b851c32950b315c5 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Wed, 30 Jul 2025 16:03:17 +0800 Subject: [PATCH 08/10] get vm allocator by value Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- cpp/include/tensorrt_llm/runtime/virtualMemory.h | 2 +- cpp/tensorrt_llm/runtime/bufferManager.cpp | 16 ++++++++-------- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h index c4b2631f3d4..8df3ed7a60c 100644 --- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h +++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h @@ -530,7 +530,7 @@ class CudaVirtualMemoryAllocator namespace tensorrt_llm::runtime { CudaVirtualMemoryManager& getVirtualMemoryManager(); -CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator(); +CudaVirtualMemoryAllocator getVirtualMemoryAllocator(); void pushVirtualMemoryAllocator( std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream); void popVirtualMemoryAllocator(); diff --git a/cpp/tensorrt_llm/runtime/bufferManager.cpp b/cpp/tensorrt_llm/runtime/bufferManager.cpp index 40e8ce58e5d..3de42a25315 100644 --- a/cpp/tensorrt_llm/runtime/bufferManager.cpp +++ b/cpp/tensorrt_llm/runtime/bufferManager.cpp @@ -39,9 +39,9 @@ BufferManager::BufferManager(CudaStreamPtr stream, bool trimPool) BufferManager::IBufferPtr BufferManager::gpu(std::size_t size, nvinfer1::DataType type) const { - if (auto& vmAllocator = getVirtualMemoryAllocator()) + if (auto vmAllocator = getVirtualMemoryAllocator()) { - return std::make_unique(size, type, vmAllocator); + return std::make_unique(size, type, std::move(vmAllocator)); } if (static_cast(mPool)) { @@ -53,9 +53,9 @@ BufferManager::IBufferPtr BufferManager::gpu(std::size_t size, nvinfer1::DataTyp BufferManager::ITensorPtr BufferManager::gpu(nvinfer1::Dims dims, nvinfer1::DataType type) const { - if (auto& vmAllocator = getVirtualMemoryAllocator()) + if (auto vmAllocator = getVirtualMemoryAllocator()) { - return std::make_unique(dims, type, vmAllocator); + return std::make_unique(dims, type, std::move(vmAllocator)); } if (static_cast(mPool)) { @@ -67,18 +67,18 @@ BufferManager::ITensorPtr BufferManager::gpu(nvinfer1::Dims dims, nvinfer1::Data BufferManager::IBufferPtr BufferManager::gpuSync(std::size_t size, nvinfer1::DataType type) { - if (auto& vmAllocator = getVirtualMemoryAllocator()) + if (auto vmAllocator = getVirtualMemoryAllocator()) { - return std::make_unique(size, type, vmAllocator); + 
return std::make_unique(size, type, std::move(vmAllocator)); } return std::make_unique(size, type, CudaAllocator{}); } BufferManager::ITensorPtr BufferManager::gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type) { - if (auto& vmAllocator = getVirtualMemoryAllocator()) + if (auto vmAllocator = getVirtualMemoryAllocator()) { - return std::make_unique(dims, type, vmAllocator); + return std::make_unique(dims, type, std::move(vmAllocator)); } return std::make_unique(dims, type, CudaAllocator{}); } diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index 39fb6bdbc47..152d5a5ff23 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -406,7 +406,7 @@ static std::shared_mutex vmAllocatorsMutex; static std::forward_list vmAllocators{ CudaVirtualMemoryAllocator{{std::shared_ptr{}, &AllocConf::backgroundConfiguration}}}; -CudaVirtualMemoryAllocator const& getVirtualMemoryAllocator() +CudaVirtualMemoryAllocator getVirtualMemoryAllocator() { std::shared_lock lock(vmAllocatorsMutex); return vmAllocators.front(); From 9f347aea15ceb1874cad0e7b8506e18daf072cc5 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Wed, 30 Jul 2025 17:45:26 +0800 Subject: [PATCH 09/10] remove nested vm scope support Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- .../tensorrt_llm/runtime/virtualMemory.h | 6 +++-- .../nanobind/runtime/bindings.cpp | 12 +++++----- cpp/tensorrt_llm/pybind/runtime/bindings.cpp | 12 +++++----- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 22 +++++++++---------- tensorrt_llm/_torch/virtual_memory.py | 12 +++++----- 5 files changed, 31 insertions(+), 33 deletions(-) diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h index 8df3ed7a60c..c39a60995eb 100644 --- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h +++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h @@ -471,6 +471,8 @@ class CudaVirtualMemoryAllocator bool mBackground{}; friend class CudaVirtualMemoryAllocator; + friend void setVirtualMemoryAllocator( + std::string const& tag, RestoreMode mode, std::shared_ptr backStream); public: /** @@ -531,8 +533,8 @@ namespace tensorrt_llm::runtime { CudaVirtualMemoryManager& getVirtualMemoryManager(); CudaVirtualMemoryAllocator getVirtualMemoryAllocator(); -void pushVirtualMemoryAllocator( +void setVirtualMemoryAllocator( std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream); -void popVirtualMemoryAllocator(); +void clearVirtualMemoryAllocator(); } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp index 384720550be..7ce9760b94a 100644 --- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp @@ -347,20 +347,18 @@ void initBindings(nb::module_& m) nb::rv_policy::reference); m.def( - "push_virtual_memory_allocator", + "set_virtual_memory_allocator", [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) { static_assert(sizeof(uintptr_t) == sizeof(cudaStream_t)); - tr::pushVirtualMemoryAllocator(tag, mode, + tr::setVirtualMemoryAllocator(tag, mode, std::make_shared( reinterpret_cast(stream), tensorrt_llm::common::getDevice(), false)); }, - "Create a new virtual address allocator. 
Push it into the allocator stack to be used for subsequent " - "allocations"); + "Set the virtual memory allocator and start allocating virtual memory for CUDA allocations"); - m.def("pop_virtual_memory_allocator", &tr::popVirtualMemoryAllocator, - "Pop the current virtual address allocator from the allocator stack. Restore the active allocator for " - "subsequent allocations to be the previous one"); + m.def("clear_virtual_memory_allocator", &tr::clearVirtualMemoryAllocator, + "Reset the current virtual memory allocator and stop allocating virtual memory for CUDA allocations"); nb::class_(m, "McastGPUBuffer") .def(nb::init()) diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp index c9d8d52b38a..782a461c16d 100644 --- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp @@ -440,20 +440,18 @@ void initBindings(pybind11::module_& m) py::return_value_policy::reference); m.def( - "push_virtual_memory_allocator", + "set_virtual_memory_allocator", [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream) { static_assert(sizeof(uintptr_t) == sizeof(cudaStream_t)); - tr::pushVirtualMemoryAllocator(tag, mode, + tr::setVirtualMemoryAllocator(tag, mode, std::make_shared( reinterpret_cast(stream), tensorrt_llm::common::getDevice(), false)); }, - "Create a new virtual address allocator. Push it into the allocator stack to be used for subsequent " - "allocations"); + "Set the virtual memory allocator and start allocating virtual memory for CUDA allocations"); - m.def("pop_virtual_memory_allocator", &tr::popVirtualMemoryAllocator, - "Pop the current virtual address allocator from the allocator stack. Restore the active allocator for " - "subsequent allocations to be the previous one"); + m.def("clear_virtual_memory_allocator", &tr::clearVirtualMemoryAllocator, + "Reset the current virtual memory allocator and stop allocating virtual memory for CUDA allocations"); py::class_(m, "McastGPUBuffer") .def(py::init()) diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index 152d5a5ff23..e5caf84a49a 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -402,27 +402,27 @@ using AllocConf = CudaVirtualMemoryAllocator::Configuration; AllocConf AllocConf::backgroundConfiguration{getVirtualMemoryManager(), "", NONE, nullptr, true}; -static std::shared_mutex vmAllocatorsMutex; -static std::forward_list vmAllocators{ - CudaVirtualMemoryAllocator{{std::shared_ptr{}, &AllocConf::backgroundConfiguration}}}; +static const std::shared_ptr bgConf{std::shared_ptr{}, &AllocConf::backgroundConfiguration}; +static std::shared_ptr currentConf = bgConf; CudaVirtualMemoryAllocator getVirtualMemoryAllocator() { - std::shared_lock lock(vmAllocatorsMutex); - return vmAllocators.front(); + return CudaVirtualMemoryAllocator{std::atomic_load(¤tConf)}; } -void pushVirtualMemoryAllocator( +void setVirtualMemoryAllocator( std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream) { - std::unique_lock lock(vmAllocatorsMutex); - vmAllocators.emplace_front(std::make_shared(getVirtualMemoryManager(), tag, mode, backStream)); + auto expected = bgConf; + auto const desired = std::make_shared(getVirtualMemoryManager(), tag, mode, backStream); + TLLM_CHECK_WITH_INFO(std::atomic_compare_exchange_strong(¤tConf, &expected, desired), + "An active virtual memory allocator (tag: 
%s, mode: %d, stream: %p) is already present", expected->mTag.c_str(), + expected->mMode, expected->mBackStream.get()); } -void popVirtualMemoryAllocator() +void clearVirtualMemoryAllocator() { - std::unique_lock lock(vmAllocatorsMutex); - vmAllocators.pop_front(); + std::atomic_store(¤tConf, bgConf); } } // namespace tensorrt_llm::runtime diff --git a/tensorrt_llm/_torch/virtual_memory.py b/tensorrt_llm/_torch/virtual_memory.py index 8a7cec2aede..e2335286333 100644 --- a/tensorrt_llm/_torch/virtual_memory.py +++ b/tensorrt_llm/_torch/virtual_memory.py @@ -7,8 +7,8 @@ from tensorrt_llm.bindings.internal.runtime import \ CudaVirtualMemoryAllocatorRestoreMode as RestoreMode from tensorrt_llm.bindings.internal.runtime import ( - get_virtual_memory_manager, pop_virtual_memory_allocator, - push_virtual_memory_allocator) + clear_virtual_memory_allocator, get_virtual_memory_manager, + set_virtual_memory_allocator) __all__ = [ "RestoreMode", "maybe_scope", "scope", "release_with_tag", @@ -27,13 +27,13 @@ def _get_torch_pluggable_virtual_memory_allocator(): @contextmanager -def _virtual_address_helper(tag: str, mode: RestoreMode): +def _virtual_memory_helper(tag: str, mode: RestoreMode): stream = torch.cuda.current_stream() - push_virtual_memory_allocator(tag, mode, stream.cuda_stream) + set_virtual_memory_allocator(tag, mode, stream.cuda_stream) try: yield finally: - pop_virtual_memory_allocator() + clear_virtual_memory_allocator() def _scope( @@ -47,7 +47,7 @@ def _scope( :param mode: The backed mode to choose how the memory content is backed up """ pool = torch.cuda.MemPool(_get_torch_pluggable_virtual_memory_allocator()) - with _virtual_address_helper(tag, mode), torch.cuda.use_mem_pool(pool): + with _virtual_memory_helper(tag, mode), torch.cuda.use_mem_pool(pool): yield pool From ee6676ac736ed17b3976e12daf3b195b19588689 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:56:39 +0800 Subject: [PATCH 10/10] use shared_mutex Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- cpp/tensorrt_llm/runtime/virtualMemory.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp index e5caf84a49a..488da30d653 100644 --- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp +++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp @@ -403,26 +403,31 @@ using AllocConf = CudaVirtualMemoryAllocator::Configuration; AllocConf AllocConf::backgroundConfiguration{getVirtualMemoryManager(), "", NONE, nullptr, true}; static const std::shared_ptr bgConf{std::shared_ptr{}, &AllocConf::backgroundConfiguration}; + +static std::shared_mutex currentConfMutex; static std::shared_ptr currentConf = bgConf; CudaVirtualMemoryAllocator getVirtualMemoryAllocator() { - return CudaVirtualMemoryAllocator{std::atomic_load(¤tConf)}; + std::shared_lock lock(currentConfMutex); + return CudaVirtualMemoryAllocator{currentConf}; } void setVirtualMemoryAllocator( std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr backStream) { - auto expected = bgConf; - auto const desired = std::make_shared(getVirtualMemoryManager(), tag, mode, backStream); - TLLM_CHECK_WITH_INFO(std::atomic_compare_exchange_strong(¤tConf, &expected, desired), - "An active virtual memory allocator (tag: %s, mode: %d, stream: %p) is already present", expected->mTag.c_str(), - expected->mMode, expected->mBackStream.get()); + std::unique_lock 
lock(currentConfMutex);
+
+    TLLM_CHECK_WITH_INFO(currentConf == bgConf,
+        "An active virtual memory allocator (tag: %s, mode: %d, stream: %p) is already present",
+        currentConf->mTag.c_str(), currentConf->mMode, currentConf->mBackStream.get());
+    currentConf = std::make_shared<AllocConf>(getVirtualMemoryManager(), tag, mode, backStream);
 }
 
 void clearVirtualMemoryAllocator()
 {
-    std::atomic_store(&currentConf, bgConf);
+    std::unique_lock lock(currentConfMutex);
+    currentConf = bgConf;
 }
 
 } // namespace tensorrt_llm::runtime
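Taken together, the end state of the series is a process-wide manager plus at most one explicitly configured allocator (patches 09-10 replace the allocator stack with a single active configuration guarded by a shared_mutex), with tag-based release and materialize. Below is a minimal C++ usage sketch under those final interfaces; the "kv_cache" tag, the buffer size, and the stream wiring are illustrative, and error handling is elided:

    #include "tensorrt_llm/runtime/bufferManager.h"
    #include "tensorrt_llm/runtime/virtualMemory.h"

    using namespace tensorrt_llm::runtime;

    void example(std::shared_ptr<CudaStream> const& stream)
    {
        // Route subsequent GPU allocations through the virtual memory allocator.
        // RestoreMode::NONE: contents are not preserved across release/materialize.
        setVirtualMemoryAllocator("kv_cache", CudaVirtualMemoryAllocator::RestoreMode::NONE, stream);
        BufferManager manager{stream, false};
        auto buffer = manager.gpu(64 * 1024 * 1024, nvinfer1::DataType::kINT8);
        clearVirtualMemoryAllocator(); // back to the background (pass-through) configuration

        // Physical pages can be dropped and re-created by tag while `buffer`
        // (and its reserved virtual address range) stays alive.
        auto& vmManager = getVirtualMemoryManager();
        vmManager.releaseWithTag("kv_cache");
        vmManager.materializeWithTag("kv_cache");
    }

This mirrors what the Python tests above exercise through virtual_memory.scope(), release_with_tag(), and materialize_with_tag().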