
Commit b12a7dd

Merge branch 'main' into user/pengyunl/disagg_check
2 parents: 7ddd65c + 1f39a11

120 files changed: +3025 additions, -764 deletions (large commit; only a subset of the file diffs is shown below).


.coderabbit.yaml (0 additions, 1 deletion)

@@ -27,7 +27,6 @@ reviews:
   related_issues: true
   related_prs: true
   suggested_labels: true
-  auto_apply_labels: true
   suggested_reviewers: true
   auto_assign_reviewers: true
   poem: false

.github/workflows/label_issue.yml (47 additions, 0 deletions)

@@ -0,0 +1,47 @@
+name: Label New Issues
+
+on:
+  issues:
+    types: [opened]
+
+permissions:
+  issues: write
+  contents: read
+
+jobs:
+  label-issue:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout private action repository
+        uses: actions/checkout@v4
+        with:
+          repository: poweiw/goggles_action
+          path: ./.github/actions/goggles_action # local path to store the action
+          token: ${{ secrets.GOGGLES_ACTION_REPO_TOKEN}} # token to access poweiw/goggles_action
+          ref: v1.2.1
+
+      - name: AI Label Issue
+        uses: ./.github/actions/goggles_action/actions/llm_label
+        with:
+          ACTION_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          LLM_MODEL_NAME: ${{ secrets.GOGGLES_LLM_MODEL_NAME }}
+          LLM_TOKEN_SERVER_URL: ${{ secrets.GOGGLES_LLM_TOKEN_SERVER_URL }}
+          LLM_TOKEN_CLIENT_ID: ${{ secrets.GOGGLES_LLM_TOKEN_CLIENT_ID }}
+          LLM_TOKEN_CLIENT_SECRET: ${{ secrets.GOGGLES_LLM_TOKEN_CLIENT_SECRET }}
+          LLM_GENERATE_URL: ${{ secrets.GOGGLES_LLM_GENERATE_URL }}
+          LLM_TOKEN_SCOPE: ${{ secrets.GOGGLES_LLM_TOKEN_SCOPE }}
+          REPO_OWNER: ${{ github.repository_owner }}
+          REPO_NAME: ${{ github.event.repository.name }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          ISSUE_TITLE: ${{ github.event.issue.title }}
+          ISSUE_BODY: ${{ github.event.issue.body }}
+          GITHUB_API_URL: ${{ github.api_url }}
+          ACTIONS_STEP_VERBOSE: false
+          EXCLUDED_LABELS: "bug,Community want to contribute,Community Engagement,duplicate,help wanted,Investigating,need more info,question,roadmap,stale,waiting for feedback,wontfix"
+          LLM_SYSTEM_PROMPT: |
+            You are an expert GitHub issue labeler. Your task is to analyze the provided issue title, issue body, and a list of available labels with their descriptions.
+            Based on this information, select the single most appropriate label from the list that best captures the primary issue or request.
+            Prefer selecting only one label that represents the main topic or problem. Only suggest multiple labels if the issue genuinely spans multiple distinct areas that are equally important.
+            Respond with ONLY the chosen label name (e.g., 'bug', 'feature-request') or comma-separated names if multiple are truly needed.
+            If no labels seem appropriate, respond with 'NONE'.
+            Do not add any other text, explanation, or markdown formatting.

benchmarks/cpp/prepare_dataset.py (18 additions, 9 deletions)

@@ -16,10 +16,8 @@
 from typing import Optional, Tuple

 import click
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, model_validator
 from transformers import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 from utils.prepare_real_data import dataset
 from utils.prepare_synthetic_data import token_norm_dist, token_unif_dist

@@ -30,20 +28,25 @@ class RootArgs(BaseModel):
     random_seed: int
     task_id: int
     std_out: bool
+    trust_remote_code: bool = False
     rand_task_id: Optional[Tuple[int, int]]
     lora_dir: Optional[str] = None

-    @field_validator('tokenizer')
-    def get_tokenizer(cls,
-                      v: str) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+    @model_validator(mode='after')
+    def validate_tokenizer(self):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(v, padding_side='left')
+            tokenizer = AutoTokenizer.from_pretrained(
+                self.tokenizer,
+                padding_side='left',
+                trust_remote_code=self.trust_remote_code)
         except EnvironmentError as e:
             raise ValueError(
                 f"Cannot find a tokenizer from the given string because of {e}\nPlease set tokenizer to the directory that contains the tokenizer, or set to a model name in HuggingFace."
             )
         tokenizer.pad_token = tokenizer.eos_token
-        return tokenizer
+        self.tokenizer = tokenizer
+
+        return self


 @click.group()
@@ -82,6 +85,11 @@ def get_tokenizer(cls,
               default="info",
               type=click.Choice(['info', 'debug']),
               help="Logging level.")
+@click.option("--trust-remote-code",
+              is_flag=True,
+              default=False,
+              envvar="TRUST_REMOTE_CODE",
+              help="Trust remote code.")
 @click.pass_context
 def cli(ctx, **kwargs):
     """This script generates dataset input for gptManagerBenchmark."""
@@ -98,7 +106,8 @@ def cli(ctx, **kwargs):
         random_seed=kwargs['random_seed'],
         task_id=kwargs['task_id'],
         rand_task_id=kwargs['rand_task_id'],
-        lora_dir=kwargs['lora_dir'])
+        lora_dir=kwargs['lora_dir'],
+        trust_remote_code=kwargs['trust_remote_code'])


 cli.add_command(dataset)

cpp/include/tensorrt_llm/executor/executor.h (8 additions, 1 deletion)

@@ -1484,7 +1484,8 @@ class ExecutorConfig
         std::optional<GuidedDecodingConfig> guidedDecodingConfig = std::nullopt,
         std::optional<std::vector<AdditionalModelOutput>> additionalModelOutputs = std::nullopt,
         std::optional<CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt,
-        bool gatherGenerationLogits = false, bool promptTableOffloading = false, bool enableTrtOverlap = false);
+        bool gatherGenerationLogits = false, bool promptTableOffloading = false, bool enableTrtOverlap = false,
+        bool failFastOnAttentionWindowTooLarge = false);

     [[nodiscard]] SizeType32 getMaxBeamWidth() const;
     [[nodiscard]] SchedulerConfig getSchedulerConfig() const;
@@ -1519,6 +1520,7 @@ class ExecutorConfig
     [[nodiscard]] bool getPromptTableOffloading() const;
     [[nodiscard]] std::optional<CacheTransceiverConfig> getCacheTransceiverConfig() const;
     [[nodiscard]] bool getEnableTrtOverlap() const;
+    [[nodiscard]] bool getFailFastOnAttentionWindowTooLarge() const;

     void setMaxBeamWidth(SizeType32 maxBeamWidth);
     void setMaxBatchSize(SizeType32 maxBatchSize);
@@ -1548,6 +1550,7 @@ class ExecutorConfig
     void setPromptTableOffloading(bool promptTableOffloading);
     void setCacheTransceiverConfig(CacheTransceiverConfig const& cacheTransceiverConfig);
     void setEnableTrtOverlap(bool enableTrtOverlap);
+    void setFailFastOnAttentionWindowTooLarge(bool failFastOnAttentionWindowTooLarge);

private:
    friend class Serialization;
@@ -1634,6 +1637,10 @@ class ExecutorConfig

     /// @brief Controls whether preparation and TRT engine execution should be overlapped.
     bool mEnableTrtOverlap{false};
+
+    /// @brief Controls whether to fail fast when attention window is too large to fit even a single sequence in the KV
+    /// cache.
+    bool mFailFastOnAttentionWindowTooLarge{false};
 };

 struct KVCacheCreatedData
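For reference, a minimal sketch (not part of this commit) of how the new fail-fast option could be toggled through the accessors declared above. It assumes ExecutorConfig stays default-constructible and lives in namespace tensorrt_llm::executor, which is how the rest of this header is conventionally used.

// Sketch only: exercising the new getter/setter pair declared in executor.h.
#include "tensorrt_llm/executor/executor.h"

#include <cassert>

int main()
{
    tensorrt_llm::executor::ExecutorConfig config; // assumed default-constructible

    // The new member defaults to false, matching mFailFastOnAttentionWindowTooLarge{false}.
    assert(!config.getFailFastOnAttentionWindowTooLarge());

    // Opt in to failing fast when the attention window cannot fit even a single sequence in the KV cache.
    config.setFailFastOnAttentionWindowTooLarge(true);
    assert(config.getFailFastOnAttentionWindowTooLarge());
    return 0;
}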

cpp/include/tensorrt_llm/kernels/archCondition.h (16 additions, 7 deletions)

@@ -24,7 +24,22 @@ namespace detail

 #ifdef __CUDA_ARCH__

-#ifdef __CUDA_ARCH_SPECIFIC__
+// __CUDA_ARCH_SPECIFIC__ is only available starting from CUDA 12.9
+#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9))
+#define HAS_CUDA_SPECIFIC_MACRO 1
+
+#if __CUDA_ARCH__ >= 900
+#if !defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)
+#error "Compiling for SM90 or newer architectures must use Arch specific or Arch Family specific target"
+#endif
+#endif
+
+#else
+#define HAS_CUDA_SPECIFIC_MACRO 0
+#endif
+
+// For CUDA < 12.9, we assume that sm90 or newer architectures are always built with arch specific.
+#if defined(__CUDA_ARCH_SPECIFIC__) || (!HAS_CUDA_SPECIFIC_MACRO && __CUDA_ARCH__ >= 900)
 static constexpr bool isArchSpecific = true;
 #else
 static constexpr bool isArchSpecific = false;
@@ -52,12 +67,6 @@ struct arch_info

 #endif

-#if __CUDA_ARCH__ >= 900
-#if !defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)
-#error "Compiling for SM90 or newer architectures must use Arch specific or Arch Family specific target"
-#endif
-#endif
-
 } // namespace detail

 namespace arch
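The toolkit-version gate above, restated in isolation as a quick reference. The defined() guard is an addition here (not part of the diff) that keeps the expression well-formed in host-only translation units, where nvcc's version macros are absent.

// Illustrative restatement of the CUDA 12.9 gate. __CUDACC_VER_MAJOR__ and
// __CUDACC_VER_MINOR__ are defined by nvcc; outside device compilation the
// gate simply evaluates to 0.
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) \
    && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9))
#define HAS_CUDA_SPECIFIC_MACRO 1
#else
#define HAS_CUDA_SPECIFIC_MACRO 0
#endif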

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp (30 additions, 4 deletions)

@@ -166,6 +166,9 @@ void CacheFormatter::format(TransferSession& session)
     auto const numPools = blockManager.getNumPools();
     // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...

+    auto lastTokenTime = llmRequest.getPerfMetrics().timingMetrics.lastTokenTime;
+    bool recordDelay = lastTokenTime != std::chrono::steady_clock::time_point();
+
     bool layerWise = common::getEnvDisaggLayerwise() && numPools == 1;
     if (layerWise)
     {
@@ -350,9 +353,14 @@ void CacheFormatter::format(TransferSession& session)
        }

        auto endTime = std::chrono::steady_clock::now();
+        double delay = 0.0;
+        if (recordDelay)
+        {
+            delay = std::chrono::duration<double, std::milli>(startTime - lastTokenTime).count();
+        }
        double cacheTransferTime
            = std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
-        kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, cacheTransferTime, size);
+        kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, delay, cacheTransferTime, size);
    };

    if (connections.size() > 1)
@@ -408,16 +416,19 @@ void CacheFormatter::unformat(TransferSession& session)
 {
     NVTX3_SCOPED_RANGE(CacheFormatter_unformat);
     auto const& llmRequest = session.getLlmRequest();
+    auto const ctxReqId = llmRequest.getContextPhaseParams().value().getReqId();
     TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
-        "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId,
-        llmRequest.getContextPhaseParams().value().getReqId());
+        "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId, ctxReqId);
     auto const& connections = session.getConnections();
     auto const& selfConfig = session.getSelfState().getCacheState().value();
     auto const& destConfig = session.getOtherState().getCacheState().value();
     auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
     auto& bufferManager = session.getBufferManager();
     auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest);

+    auto arrivalTime = llmRequest.getPerfMetrics().timingMetrics.arrivalTime;
+    bool recordDelay = arrivalTime != std::chrono::steady_clock::time_point();
+
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);

     TLLM_LOG_DEBUG("pickUpConnections size: %d connections size: %d", pickUpConnections.size(), connections.size());
@@ -546,7 +557,7 @@ void CacheFormatter::unformat(TransferSession& session)
        }
        TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
            "End receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId,
-            llmRequest.getContextPhaseParams().value().getReqId());
+            ctxReqId);
        return;
    }
    // legacyPath: context executor rank only send data to one gen executor rank. it sends multiple cache
@@ -634,6 +645,8 @@ void CacheFormatter::unformat(TransferSession& session)
        TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
        TLLM_CHECK(pickUpConnections.size() > processIdx);
        TLLM_CHECK(recvSplitCaches.size() > processIdx);
+        auto startTime = std::chrono::steady_clock::now();
+        size_t size = 0;
        if (legacyPath)
        {
            size_t idx = processIdx * blockNum;
@@ -645,6 +658,7 @@ void CacheFormatter::unformat(TransferSession& session)
                size_t recvBufferIdx = blockIdx * pickUpConnections.size() + commIdx;
                llmRequest.updateKvCacheSize((*recvSplitCaches[recvBufferIdx]).getSizeInBytes());
                auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                size += buffer->getSizeInBytes();
                session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                idx++;
            }
@@ -655,6 +669,7 @@ void CacheFormatter::unformat(TransferSession& session)
            {
                llmRequest.updateKvCacheSize((*recvSplitCaches.at(processIdx)).getSizeInBytes());
                auto& buffer = recvSplitCaches[processIdx];
+                size = buffer->getSizeInBytes();
                session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
            }
            else if (bufferCoverTargetNum > 0)
@@ -663,6 +678,7 @@ void CacheFormatter::unformat(TransferSession& session)
                    + remainNoCoverTargetNum; // caches.at(recvBufferIdx) is allocated by cudaMalloc
                llmRequest.updateKvCacheSize((*recvSplitCaches.at(recvBufferIdx)).getSizeInBytes());
                auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                size = buffer->getSizeInBytes();
                session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                bufferManager.copy(*recvSplitCaches.at(recvBufferIdx), *recvSplitCaches[processIdx]);
                bufferManager.getStream().synchronize();
@@ -679,6 +695,7 @@ void CacheFormatter::unformat(TransferSession& session)
                    auto recvSlice = runtime::ITensor::slice(preAllocRecvBuffer, 0, recvSize);
                    auto copySlice = runtime::ITensor::slice(
                        recvSplitCaches[processIdx], targetBufferSize - remainRecvSize, recvSize);
+                    size += recvSlice->getSizeInBytes();
                    llmRequest.updateKvCacheSize((*recvSlice).getSizeInBytes());
                    session.recv(pickUpConnections[processIdx], recvSlice->data(), recvSlice->getSizeInBytes());
                    bufferManager.copy(*recvSlice, *copySlice);
@@ -687,6 +704,15 @@ void CacheFormatter::unformat(TransferSession& session)
                }
            }
        }
+        auto endTime = std::chrono::steady_clock::now();
+        double delay = 0.0;
+        if (recordDelay)
+        {
+            delay = std::chrono::duration<double, std::milli>(startTime - arrivalTime).count();
+        }
+        double cacheTransferTime
+            = std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
+        kvCacheMeasureHelper.appendKVCacheTransfer(ctxReqId, delay, cacheTransferTime, size);
    };
    if (pickUpConnections.size() > 1)
    {
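A self-contained sketch of the timing pattern used in both format() and unformat() above: a default-constructed steady_clock::time_point serves as the "metric not recorded" sentinel, and delay and transfer time are reported in milliseconds as doubles. Names below are illustrative stand-ins, not taken from the TensorRT-LLM sources.

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <thread>

int main()
{
    using Clock = std::chrono::steady_clock;

    // In the real code this comes from llmRequest.getPerfMetrics().timingMetrics;
    // a default-constructed time_point means the metric was never recorded.
    Clock::time_point lastTokenTime = Clock::now();
    bool recordDelay = lastTokenTime != Clock::time_point();

    auto startTime = Clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(5)); // stand-in for the cache transfer
    auto endTime = Clock::now();

    double delay = 0.0;
    if (recordDelay)
    {
        delay = std::chrono::duration<double, std::milli>(startTime - lastTokenTime).count();
    }
    double cacheTransferTime = std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
    std::printf("delay=%.3f ms, transfer=%.3f ms\n", delay, cacheTransferTime);
    return 0;
}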

cpp/tensorrt_llm/batch_manager/cacheFormatter.h (9 additions, 1 deletion)

@@ -76,6 +76,15 @@ class BaseCacheFormatter

     /// @brief Destructor.
     virtual ~BaseCacheFormatter() = default;
+
+    // TODO: better way for context/generation tagging
+    void markAsSender(bool isSender)
+    {
+        kvCacheMeasureHelper.markAsSender(isSender);
+    }
+
+protected:
+    KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()};
 };

 // Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
@@ -115,7 +124,6 @@ class CacheFormatter final : public BaseCacheFormatter
private:
    BaseKVCacheManager* mCacheManager;
    CacheTransBufferManager* mCacheTransBufferManager;
-    KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()};
 };

 std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
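A toy illustration of the refactor above: the measurement helper moves out of the concrete formatter and into the protected section of the base class, so both the context (sender) and generation (receiver) sides can tag the same helper through markAsSender(). All class and member names below are made up for illustration.

#include <iostream>

struct MeasureHelper
{
    void markAsSender(bool isSender) { mIsSender = isSender; }
    bool mIsSender{false};
};

class BaseFormatter
{
public:
    virtual ~BaseFormatter() = default;

    void markAsSender(bool isSender) { mHelper.markAsSender(isSender); }

protected:
    MeasureHelper mHelper; // shared by every derived formatter
};

class ToyCacheFormatter final : public BaseFormatter
{
public:
    void report() const { std::cout << (mHelper.mIsSender ? "sender (context)\n" : "receiver (generation)\n"); }
};

int main()
{
    ToyCacheFormatter formatter;
    formatter.markAsSender(true); // the context executor side sends the KV cache
    formatter.report();
    return 0;
}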
