
Commit 82078cb

Fixing test
Signed-off-by: Patrice Castonguay <[email protected]>
1 parent: a73d28d

File tree: 7 files changed, +97 −81 lines

cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h

Lines changed: 3 additions & 0 deletions

```diff
@@ -105,6 +105,9 @@ class KVCacheEventManager
     /// @brief The period in milliseconds to gather attention DP events across rank
     SizeType32 mAttentionDpEventsGatherPeriodMs;
+
+    /// @brief MPI communicator for attention DP
+    std::unique_ptr<tensorrt_llm::mpi::MpiComm> mMpiComm;
 };

 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
```

cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp

Lines changed: 29 additions & 17 deletions

```diff
@@ -35,7 +35,6 @@ KVCacheEventManager::KVCacheEventManager(size_t maxKVEventEntries, std::optional
     , mAttentionDpSize{attentionDpSize}
     , mAttentionDpEventsGatherPeriodMs(attentionDpEventsGatherPeriodMs)
 {
-
     TLLM_CHECK(mMaxSize > 0);
     if (mAttentionDpRank)
     {
```
```diff
@@ -49,6 +48,8 @@ KVCacheEventManager::KVCacheEventManager(size_t maxKVEventEntries, std::optional
             // Need to increase size
             mMaxSize *= mAttentionDpSize.value();
         }
+        // Create a communicator to be used for event exchange
+        mMpiComm = std::make_unique<tensorrt_llm::mpi::MpiComm>(COMM_SESSION.split(0, mAttentionDpRank.value()));
     }
     else
     {
```
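Every rank passes the same color (0) to the split, so the resulting communicator contains all ranks of COMM_SESSION, ordered by attention DP rank, and gives event exchange its own channel that cannot collide with other traffic on the session communicator. A minimal sketch of the same pattern, assuming mpi4py in place of TensorRT-LLM's MpiComm wrapper:

```python
from mpi4py import MPI

# Every rank passes color 0, so the child communicator contains all
# ranks of the parent; the key orders them by attention DP rank.
attention_dp_rank = MPI.COMM_WORLD.Get_rank()  # stand-in for mAttentionDpRank
event_comm = MPI.COMM_WORLD.Split(color=0, key=attention_dp_rank)

# Sends and receives on event_comm can never be matched against
# unrelated messages travelling on the parent communicator.
```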
```diff
@@ -162,37 +163,49 @@ void KVCacheEventManager::exchangeAttentionDpThread()
     while (true)
     {
         TLLM_CHECK(mAttentionDpRank);
+
+        // Check if any of the ranks have been shut down
+        int32_t numFinished = 0;
+        int32_t finished = mRun ? 0 : 1;
+        mMpiComm->allreduce(&finished, &numFinished, 1, mpi::MpiType::kINT32, mpi::MpiOp::SUM);
+        if (numFinished > 0)
+        {
+            TLLM_LOG_INFO("One of the ranks has been shut down, exiting");
+            break;
+        }
+
         // If we are not rank 0, send events to rank 0
         if (mAttentionDpRank.value() != 0)
         {
             std::vector<char> serializedEvents;
+            uint64_t numEvents = 0;
             {
                 std::unique_lock<std::mutex> lck(mEventsMutex);
                 serializedEvents = executor::Serialization::serialize(mEvents);
+                numEvents = mEvents.size();
                 mEvents.clear();
             }
-            uint64_t vecSize = serializedEvents.size();
-            COMM_SESSION.send(&vecSize, 1, mpi::MpiType::kUINT64, 0, mpi::MpiTag::kKvCacheEventSize);
-            COMM_SESSION.send(
-                serializedEvents.data(), serializedEvents.size(), mpi::MpiType::kCHAR, 0, mpi::MpiTag::kKvCacheEvent);
+            uint64_t vecSize = numEvents > 0 ? serializedEvents.size() : 0;
+            mMpiComm->send(&vecSize, 1, mpi::MpiType::kUINT64, 0, mpi::MpiTag::kKvCacheEventSize);
+            if (vecSize > 0)
+            {
+                mMpiComm->send(serializedEvents.data(), serializedEvents.size(), mpi::MpiType::kCHAR, 0,
+                    mpi::MpiTag::kKvCacheEvent);
+            }
         }
         else
         {
             TLLM_CHECK(mAttentionDpSize.has_value());
             // Loop until we have received events from all ranks
-            int32_t numRecvs = 0;
-            while (numRecvs < mAttentionDpSize.value() - 1)
+            for (int rank = 1; rank < mAttentionDpSize.value(); ++rank)
             {
-                MPI_Status probeStatus;
-                if (COMM_SESSION.iprobe(MPI_ANY_SOURCE, mpi::MpiTag::kKvCacheEvent, &probeStatus))
+                uint64_t vecSize{0};
+                mMpiComm->recv(&vecSize, 1, mpi::MpiType::kUINT64, rank, mpi::MpiTag::kKvCacheEventSize);
+                if (vecSize > 0)
                 {
-                    uint64_t vecSize{0};
-                    COMM_SESSION.recv(
-                        &vecSize, 1, mpi::MpiType::kUINT64, probeStatus.MPI_SOURCE, mpi::MpiTag::kKvCacheEventSize);
-
                     std::vector<char> serializedEvents(vecSize);
-                    COMM_SESSION.recv(serializedEvents.data(), vecSize, mpi::MpiType::kCHAR, probeStatus.MPI_SOURCE,
-                        mpi::MpiTag::kKvCacheEvent);
+                    mMpiComm->recv(
+                        serializedEvents.data(), vecSize, mpi::MpiType::kCHAR, rank, mpi::MpiTag::kKvCacheEvent);

                     // Deserialize the events and add them to the local queue
                     auto rankEvents = executor::Serialization::deserializeKVCacheEvents(serializedEvents);
```
```diff
@@ -201,11 +214,10 @@ void KVCacheEventManager::exchangeAttentionDpThread()
                     mEvents.insert(mEvents.end(), rankEvents.begin(), rankEvents.end());
                     mEmptyCV.notify_one();
                 }
-                numRecvs++;
             }
         }
-            std::this_thread::sleep_for(std::chrono::milliseconds(mAttentionDpEventsGatherPeriodMs));
         }
+        std::this_thread::sleep_for(std::chrono::milliseconds(mAttentionDpEventsGatherPeriodMs));
     }
 #else
     TLLM_THROW("Multi device support is disabled.");
```
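Taken together, the gather thread drops the opportunistic iprobe polling on COMM_SESSION in favor of a deterministic protocol on the dedicated communicator: an allreduce lets any rank signal shutdown, every non-zero rank sends a size header each period (zero when it has no events), and rank 0 posts exactly one matching receive per peer. A runnable sketch of the protocol, assuming mpi4py and pickle stand in for the project's MpiComm and Serialization helpers (locking omitted):

```python
import pickle
import time

from mpi4py import MPI

# Dedicated communicator, mirroring the constructor change above.
comm = MPI.COMM_WORLD.Split(color=0, key=MPI.COMM_WORLD.Get_rank())
TAG_SIZE, TAG_EVENTS = 1, 2  # stand-ins for kKvCacheEventSize / kKvCacheEvent


def exchange_loop(local_events, running, period_ms):
    while True:
        # Cooperative shutdown: if any rank has stopped, every rank exits.
        num_finished = comm.allreduce(0 if running() else 1, op=MPI.SUM)
        if num_finished > 0:
            break
        if comm.Get_rank() != 0:
            # Always send a size header; skip the payload when empty.
            payload = pickle.dumps(local_events)
            size = len(payload) if local_events else 0
            local_events.clear()
            comm.send(size, dest=0, tag=TAG_SIZE)
            if size > 0:
                comm.send(payload, dest=0, tag=TAG_EVENTS)
        else:
            # Deterministic gather: one size header per peer, in rank order.
            for rank in range(1, comm.Get_size()):
                size = comm.recv(source=rank, tag=TAG_SIZE)
                if size > 0:
                    events = pickle.loads(comm.recv(source=rank, tag=TAG_EVENTS))
                    local_events.extend(events)
        time.sleep(period_ms / 1000.0)
```

Because the size header always arrives, rank 0 can block on each peer in turn without MPI_ANY_SOURCE probing, and the zero-size sentinel avoids shipping a serialized empty queue.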

cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp

Lines changed: 3 additions & 1 deletion

```diff
@@ -325,7 +325,9 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
         .def_static("hash", &tbk::BlockKeyHasher::hash, nb::arg("block_key"), nb::arg("parent_hash") = 0);

     nb::class_<tbk::KVCacheEventManager>(m, "KVCacheEventManager")
-        .def(nb::init<size_t>(), nb::arg("max_kv_event_entries"));
+        .def(nb::init<size_t, std::optional<SizeType32>, std::optional<SizeType32>, SizeType32>(),
+            nb::arg("max_kv_event_entries"), nb::arg("attention_dp_rank"), nb::arg("attention_dp_size"),
+            nb::arg("attention_dp_events_gather_period_ms"));

     nb::class_<tbk::BaseKVCacheManager, PyKvCacheManager>(m, "BaseKVCacheManager")
         .def_static("calculate_max_num_blocks", &tbk::BaseKVCacheManager::calculateMaxNumBlocks, nb::arg("config"),
```

cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp

Lines changed: 3 additions & 1 deletion

```diff
@@ -321,7 +321,9 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
         .def_static("hash", &tbk::BlockKeyHasher::hash, py::arg("block_key"), py::arg("parent_hash") = 0);

     py::class_<tbk::KVCacheEventManager, std::shared_ptr<tbk::KVCacheEventManager>>(m, "KVCacheEventManager")
-        .def(py::init<size_t>(), py::arg("max_kv_event_entries"));
+        .def(py::init<size_t, std::optional<SizeType32>, std::optional<SizeType32>, SizeType32>(),
+            py::arg("max_kv_event_entries"), py::arg("attention_dp_rank"), py::arg("attention_dp_size"),
+            py::arg("attention_dp_events_gather_period_ms"));

     py::classh<tbk::BaseKVCacheManager, PyKvCacheManager>(m, "BaseKVCacheManager")
         .def_static("calculate_max_num_blocks", &tbk::BaseKVCacheManager::calculateMaxNumBlocks, py::arg("config"),
```
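Both binding layers now expose the same four-argument constructor, so Python callers must pass the attention DP parameters explicitly. A hypothetical construction from Python (the internal module path is an assumption and may differ):

```python
# Hypothetical import path; the internal bindings layout may differ.
from tensorrt_llm.bindings.internal.batch_manager import KVCacheEventManager

event_manager = KVCacheEventManager(
    max_kv_event_entries=1024,
    attention_dp_rank=0,    # std::optional<SizeType32>: None when DP is off
    attention_dp_size=2,    # std::optional<SizeType32>: None when DP is off
    attention_dp_events_gather_period_ms=10,
)
```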

cpp/tensorrt_llm/pybind/executor/executorConfig.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -108,7 +108,7 @@ void initConfigBindings(pybind11::module_& m)
     };
     auto kvCacheConfigSetstate = [](py::tuple const& state)
     {
-        if (state.size() != 13)
+        if (state.size() != 14)
         {
             throw std::runtime_error("Invalid state!");
         }
```
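The guard changes because KvCacheConfig's pickle state gains a 14th entry, presumably the new attention_dp_events_gather_period_ms value, so __setstate__ must accept the longer tuple. A sketch of the round-trip this check protects, assuming the executor bindings import path:

```python
import pickle

from tensorrt_llm.bindings import executor as trtllm  # assumed path

config = trtllm.KvCacheConfig(enable_block_reuse=True)
# __getstate__ now packs 14 fields; a 13-field tuple pickled by an
# older build would raise "Invalid state!" in __setstate__.
restored = pickle.loads(pickle.dumps(config))
assert restored.enable_block_reuse
```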

cpp/tests/unit_tests/batch_manager/kvCacheEventManagerTest.cpp

Whitespace-only changes.

tests/unittest/llmapi/test_llm_kv_cache_events.py

Lines changed: 58 additions & 61 deletions

```diff
@@ -147,14 +147,11 @@ async def main():
     asyncio.run(main())


-def test_llm_kv_events_api():
-    llm = create_llm()
-    sampling_params = SamplingParams(max_tokens=6, temperature=0.01)
-
-    requests = []
-    for i in range(3):
-        input_tokens = list(range(127 + i))[i:]
-        requests.append(input_tokens)
+def check_events(llm,
+                 requests,
+                 sampling_params,
+                 scheduling_params=None,
+                 attention_dp_rank=None):

     _ = llm.generate(requests[0], sampling_params=sampling_params)
     events1 = llm.get_kv_cache_events(5)
```
```diff
@@ -163,52 +160,95 @@ def test_llm_kv_events_api():
     event = events1.pop(0)  # created event
     while events1:
         event = events1.pop(0)
+        print("event1:", event)
         if event:
             assert event["event_id"] == 1
             assert event["data"]["type"] == "stored"
             assert len(event["data"]["blocks"]) == 5
+            if attention_dp_rank:
+                assert event["data"]["attention_dp_rank"] == attention_dp_rank

     _ = llm.generate(requests[1], sampling_params=sampling_params)
     events2 = llm.get_kv_cache_events(5)

     while events2:
         event = events2.pop(0)
+        print("event2:", event)
         if event:
             if event["event_id"] == 2:
                 # 2 removed events needed
                 # should be a removed event to make space for context block
                 assert event["data"]["type"] == "removed"
                 assert event["data"]["block_hashes"]
+                if attention_dp_rank:
+                    assert event["data"][
+                        "attention_dp_rank"] == attention_dp_rank
             elif event["event_id"] == 3:
                 assert event["data"]["type"] == "removed"
                 assert event["data"]["block_hashes"]
+                if attention_dp_rank:
+                    assert event["data"][
+                        "attention_dp_rank"] == attention_dp_rank
             # stored event for 2nd request
             elif event["event_id"] == 4:
                 assert event["data"]["type"] == "stored"
                 assert len(event["data"]["blocks"]) == 5
+                if attention_dp_rank:
+                    assert event["data"][
+                        "attention_dp_rank"] == attention_dp_rank

     _ = llm.generate(requests[2], sampling_params=sampling_params)
     events3 = llm.get_kv_cache_events(5)

     while events3:
         event = events3.pop(0)
+        print("event3:", event)
         if event:
             if event["event_id"] == 5:
                 assert event["data"]["type"] == "removed"
                 assert event["data"]["block_hashes"]
+                if attention_dp_rank:
+                    assert event["data"][
+                        "attention_dp_rank"] == attention_dp_rank
             elif event["event_id"] == 6:
                 assert event["data"]["type"] == "removed"
                 assert event["data"]["block_hashes"]
+                if attention_dp_rank:
+                    assert event["data"][
+                        "attention_dp_rank"] == attention_dp_rank
             elif event["event_id"] == 7:
                 assert event["data"]["type"] == "stored"
                 assert len(event["data"]["blocks"]) == 5
+                if attention_dp_rank:
+                    assert event["data"][
+                        "attention_dp_rank"] == attention_dp_rank

     # no more events after request is finished
     assert not llm.get_kv_cache_events(5)


+def test_llm_kv_events_api():
+    llm = create_llm()
+    sampling_params = SamplingParams(max_tokens=6, temperature=0.01)
+
+    requests = []
+    for i in range(3):
+        input_tokens = list(range(127 + i))[i:]
+        requests.append(input_tokens)
+
+    check_events(llm, requests, sampling_params)
+
+
 @skip_single_gpu
 def test_llm_api_attention_dp_kv_events():
+
+    kvcache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                   event_buffer_max_size=1024,
+                                   attention_dp_events_gather_period_ms=10,
+                                   enable_block_reuse=True,
+                                   onboard_blocks=True,
+                                   max_tokens=256)
+
     llm = LLM(model=llama_model_path,
               tensor_parallel_size=2,
               enable_attention_dp=True,
```
```diff
@@ -217,59 +257,16 @@ def test_llm_api_attention_dp_kv_events():

     sampling_params = SamplingParams(max_tokens=6, temperature=0.01)

-    requests = []
-    for i in range(3):
-        input_tokens = list(range(127 + i))[i:]
-        requests.append(input_tokens)
-
-    _ = llm.generate(requests[0], sampling_params=sampling_params)
-    events1 = llm.get_kv_cache_events(5)
-
-    # Should have 1 stored event and 1 created event
-    event = events1.pop(0)  # created event
-    while events1:
-        event = events1.pop(0)
-        if event:
-            assert event["event_id"] == 1
-            assert event["data"]["type"] == "stored"
-            assert event["attention_dp_rank"] == 0
-            assert event["window_size"] == 32
-            assert len(event["data"]["blocks"]) == 5
+    for attention_dp_rank in range(2):
+        requests = []
+        for i in range(3):
+            input_tokens = list(range(127 + i))[i:]
+            requests.append(input_tokens)

-    _ = llm.generate(requests[1], sampling_params=sampling_params)
-    events2 = llm.get_kv_cache_events(5)
+        scheduling_params = SchedulingParams(
+            attention_dp_rank=attention_dp_rank, attention_dp_relax=False)

-    while events2:
-        event = events2.pop(0)
-        if event:
-            if event["event_id"] == 2:
-                # 2 removed events needed
-                # should be a removed event to make space for context block
-                assert event["data"]["type"] == "removed"
-                assert event["data"]["block_hashes"]
-            elif event["event_id"] == 3:
-                assert event["data"]["type"] == "removed"
-                assert event["data"]["block_hashes"]
-            # stored event for 2nd request
-            elif event["event_id"] == 4:
-                assert event["data"]["type"] == "stored"
-                assert len(event["data"]["blocks"]) == 5
+        check_events(llm, requests, sampling_params, scheduling_params,
+                     attention_dp_rank)

-    #_ = llm.generate(requests[2], sampling_params=sampling_params)
-    #events3 = llm.get_kv_cache_events(5)
-
-    #while events3:
-    #    event = events3.pop(0)
-    #    if event:
-    #        if event["event_id"] == 5:
-    #            assert event["data"]["type"] == "removed"
-    #            assert event["data"]["block_hashes"]
-    #        elif event["event_id"] == 6:
-    #            assert event["data"]["type"] == "removed"
-    #            assert event["data"]["block_hashes"]
-    #        elif event["event_id"] == 7:
-    #            assert event["data"]["type"] == "stored"
-    #            assert len(event["data"]["blocks"]) == 5
-
-    ## no more events after request is finished
-    #assert not llm.get_kv_cache_events(5)
+    time.sleep(5)
```
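With attention DP enabled, each event payload now carries the rank that produced it, and the trailing time.sleep(5) presumably gives the rank-0 gather thread a final period to drain peer events before the LLM is torn down. An illustrative way a consumer might attribute drained events per rank (not part of the test itself):

```python
# Illustrative only: group drained events by the emitting DP rank.
events = llm.get_kv_cache_events(5)  # same 5-second timeout the test uses
events_by_rank = {}
for event in events:
    if event:
        rank = event["data"].get("attention_dp_rank")
        events_by_rank.setdefault(rank, []).append(event)
```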
