50 changes: 45 additions & 5 deletions examples/disaggregated/slurm/disaggr_torch.slurm
@@ -38,14 +38,42 @@ container_image=${19}
mounts=${20}
workdir=${21}
model_dir=${22}
trtllm_repo=${23}

echo "================= parameters ================="
echo "num_ctx_servers: ${num_ctx_servers}"
echo "ctx_tp_size: ${ctx_tp_size}"
echo "ctx_batch_size: ${ctx_batch_size}"
echo "ctx_max_num_tokens: ${ctx_max_num_tokens}"
echo "ctx_enable_attention_dp: ${ctx_enable_attention_dp}"
echo "num_gen_servers: ${num_gen_servers}"
echo "gen_tp_size: ${gen_tp_size}"
echo "gen_batch_size: ${gen_batch_size}"
echo "gen_max_num_tokens: ${gen_max_num_tokens}"
echo "gen_enable_attention_dp: ${gen_enable_attention_dp}"
echo "gen_gpu_memory_fraction: ${gen_gpu_memory_fraction}"
echo "eplb_num_slots: ${eplb_num_slots}"
echo "mtp_size: ${mtp_size}"
echo "concurrency: ${concurrency}"
echo "isl: ${isl}"
echo "osl: ${osl}"
echo "multi_round: ${multi_round}"
echo "streaming: ${streaming}"
echo "container_image: ${container_image}"
echo "mounts: ${mounts}"
echo "workdir: ${workdir}"
echo "model_dir: ${model_dir}"
echo "trtllm_repo: ${trtllm_repo}"
echo "==========================================="


ctx_max_seq_len=$((isl + 1))
gen_max_seq_len=$((isl + osl))
ctx_gpu_frac=0.75
cache_transceiver_max_num_tokens=8448

container_name=disaggr
logdir=${workdir}/benchmark-${isl}-${osl}/
logdir=${workdir}/benchmark-${isl}-${osl}
mkdir -p ${logdir}
full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}

@@ -65,16 +93,27 @@ fi
mkdir -p ${full_logdir}
echo "Log will be saved to: ${full_logdir}"

if [ -z "${TRT_LLM_GIT_COMMIT}" ]; then
export TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
fi

nsys_on=""
# nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling

# start the container
srun -l --container-image=${container_image} \
--container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix \
echo "Container up."

if [ -n "${trtllm_repo}" ]; then
srun --container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
bash -c "cd ${trtllm_repo} && echo 'Running install operation...' && pip install -e . " 2>&1 | tee ${full_logdir}/install.log
fi

# generate the yaml file
srun -l --container-name=${container_name} \
--container-mounts=${mounts} \
@@ -104,11 +143,12 @@ echo "YAML file generated."
hostname_value=$(grep '^hostname:' ${full_logdir}/config.yaml | awk -F': ' '{print $2}')
echo "server host name: $hostname_value"


# start the workers
srun -l --container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix --overlap \
bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &
--mpi=pmix --overlap \
bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &

# start the server
srun -l --container-name=${container_name} \
@@ -121,7 +161,7 @@ srun -l --container-name=${container_name} \
srun -l --container-name=${container_name} \
--container-mounts=${mounts} \
--mpi=pmix --overlap -N 1 -n 1 \
bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir}/ > ${full_logdir}/benchmark.log 2>&1
bash ${workdir}/run_benchmark.sh ${isl} ${osl} ${multi_round} ${model_dir} "${concurrency}" ${streaming} ${full_logdir} > ${full_logdir}/benchmark.log 2>&1

# try to kill the server and workers
srun -l --container-name=${container_name} \
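The additions to this script boil down to one pattern: an optional repo path arrives as the last positional argument, the current commit is recorded for the logs, and an editable install runs only when that path is non-empty. A condensed sketch of the pattern with a hypothetical path (the real script performs the install through srun inside the container):

```bash
#!/bin/bash
# Hypothetical value; the real script reads it from positional argument ${23}.
trtllm_repo=/scratch/TensorRT-LLM   # leave empty ("") to keep the pre-installed build

# Record the source commit for the logs; fall back to "unknown" outside a git checkout.
if [ -z "${TRT_LLM_GIT_COMMIT}" ]; then
    export TRT_LLM_GIT_COMMIT=$(git -C "${trtllm_repo}" rev-parse --short HEAD 2>/dev/null || echo "unknown")
fi

# Editable install only when a repo path was supplied.
if [ -n "${trtllm_repo}" ]; then
    (cd "${trtllm_repo}" && pip install -e .)
fi
```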
52 changes: 26 additions & 26 deletions examples/disaggregated/slurm/run_benchmark.sh
@@ -16,7 +16,7 @@ isl=$1
osl=$2
multi_round=$3
model_name=$4
concurrency=$5
concurrency_list=$5
streaming=$6
log_path=$7

@@ -89,31 +89,31 @@ do_get_logs(){
}

# run the loadgen

mkdir -p ${log_path}/concurrency_${concurrency}
cp ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}/workers_start.log
max_count=$((${concurrency} * ${multi_round}))
echo "Running loadgen with concurrency: ${concurrency}, max_count: ${max_count}"

python -m tensorrt_llm.serve.scripts.benchmark_serving \
--model ${model_name} \
--tokenizer ${model_name} \
--dataset-name random \
--dataset-path ${shared_gpt_path} \
--random-input-len ${isl} \
--random-output-len ${osl} \
--random-prefix-len 0 \
--num-prompts ${max_count} \
--max-concurrency ${concurrency} \
--host ${hostname} \
--port ${port} \
--ignore-eos \
--no-test-input \
$(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)

do_get_logs ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}
# echo "" > ${log_path}/output_workers.log
echo "done for ${concurrency} in folder ${log_path}/concurrency_${concurrency}"
cp ${log_path}/output_workers.log ${log_path}/workers_start.log
for concurrency in ${concurrency_list}; do
mkdir -p ${log_path}/concurrency_${concurrency}
max_count=$((${concurrency} * ${multi_round}))
echo "Running loadgen with concurrency: ${concurrency}, max_count: ${max_count}"
python -m tensorrt_llm.serve.scripts.benchmark_serving \
--model ${model_name} \
--tokenizer ${model_name} \
--dataset-name random \
--dataset-path ${shared_gpt_path} \
--random-input-len ${isl} \
--random-output-len ${osl} \
--random-prefix-len 0 \
--num-prompts ${max_count} \
--max-concurrency ${concurrency} \
--host ${hostname} \
--port ${port} \
--ignore-eos \
--no-test-input \
$(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)

do_get_logs ${log_path}/output_workers.log ${log_path}/concurrency_${concurrency}
echo "" > ${log_path}/output_workers.log
echo "done for ${concurrency} in folder ${log_path}/concurrency_${concurrency}"
done

echo "Benchmark done, gracefully shutting down server and workers..."
kill -9 $(ps aux | grep '[s]tart_server.sh' | awk '{print $2}') >/dev/null 2>&1 || true
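With the fifth argument now treated as a list, a single call to run_benchmark.sh sweeps several concurrency levels against the already-running server. A hypothetical invocation matching the positional arguments read at the top of the script (all values and paths below are examples only):

```bash
# Arguments: isl osl multi_round model concurrency_list streaming log_path
# (values are hypothetical)
bash run_benchmark.sh 1024 1024 3 /models/DeepSeek-R1 "1 64 1024" true /logs/benchmark-1024-1024
```

The quoted list is word-split by the for-loop, so one benchmark_serving pass runs per concurrency value, and each pass copies its worker logs into concurrency_<N>/ under log_path.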
2 changes: 2 additions & 0 deletions examples/disaggregated/slurm/submit.sh
@@ -7,6 +7,7 @@ container_image=<container_image>
mounts=<mounts> # e.g. /mnt/data:/mnt/data
workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint
repo_dir=<repo_dir> # Path to the TensorRT-LLM repo to install; if empty, the pre-installed version is used

ntasks_per_node=4 # 4 GPUs per GB200 node
total_node_num=8
@@ -31,6 +32,7 @@ args=(
$mounts
$workdir
$model_dir
$repo_dir
)

# This command starts a job with 8 nodes, 32 GPUs in total.
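disaggr_torch.slurm consumes these entries positionally, so repo_dir has to stay at the end of the array. The tail of the mapping, taken from the parameter block at the top of that script, is sketched below.

```bash
# Tail of the positional mapping into disaggr_torch.slurm:
#   ${19} <- $container_image
#   ${20} <- $mounts
#   ${21} <- $workdir
#   ${22} <- $model_dir
#   ${23} <- $repo_dir   # read as trtllm_repo; empty => the editable install is skipped
```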
5 changes: 4 additions & 1 deletion examples/wide_ep/slurm_scripts/submit.sh
@@ -9,6 +9,7 @@ container_image=<container_image>
mounts=<mounts> # e.g. /mnt/data:/mnt/data
workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint
repo_dir=<repo_dir> # Path to the TensorRT-LLM repo to install; if empty, the pre-installed version is used

mtp_size=0
ntasks_per_node=4 # 4 GPUs per GB200 node
@@ -28,7 +29,7 @@ for b in 1 64 1024; do

args=(
${ctx_num} 4 4 4480 true # Context servers arguments
1 16 1024 1024 "0.7" # Generation servers arguments
1 16 1024 1024 true "0.7" # Generation servers arguments
$eplb_num_slots $mtp_size # Other arguments
$concurrency # Benchmarking arguments
$isl
@@ -39,6 +40,7 @@ for b in 1 64 1024; do
$mounts
$workdir
$model_dir
$repo_dir
)

sbatch --nodes=${total_node_num} \
@@ -74,6 +76,7 @@ for b in 512; do
$mounts
$workdir
$model_dir
$repo_dir
)
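
In the first loop above, the added true lands in the generation-server block; judging by the echo section of disaggr_torch.slurm, it corresponds to gen_enable_attention_dp. The assumed expansion is shown below (an inference from positional order, not something this hunk states):

```bash
# Assumed expansion of the generation-server line: 1 16 1024 1024 true "0.7"
num_gen_servers=1
gen_tp_size=16
gen_batch_size=1024
gen_max_num_tokens=1024
gen_enable_attention_dp=true      # value added by this change
gen_gpu_memory_fraction=0.7
```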

sbatch --nodes=${total_node_num} \
Expand Down
Loading