61 changes: 0 additions & 61 deletions tests/integration/defs/triton_server/build_engines.py
@@ -575,67 +575,6 @@ def prepare_llava_onevision_engine(tensorrt_llm_multimodal_example_root,
return engine_dir, multimodal_engine_dir


def prepare_vila_engine(tensorrt_llm_multimodal_example_root,
tensorrt_llm_llama_example_root, vila_model_root,
vila_repo_root):
# Convert LLAMA from HF
ckpt_dir = os.path.join(tensorrt_llm_multimodal_example_root, "model_dir",
"vila")
convert_cmd = [
"python3",
f"{tensorrt_llm_llama_example_root}/convert_checkpoint.py",
f"--model_dir={vila_model_root}",
f"--output_dir={ckpt_dir}",
"--dtype=float16",
]

# Build VILA
engine_dir = os.path.join(tensorrt_llm_multimodal_example_root,
"engine_dir", "vila")

build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
"--gemm_plugin=float16",
"--max_batch_size=8",
"--max_multimodal_len=6272",
"--max_input_len=2048",
"--max_seq_len=2560",
f"--output_dir={engine_dir}",
]

multimodal_engine_dir = os.path.join(tensorrt_llm_multimodal_example_root,
"tmp", "trt_engines", "VILA1.5-3b",
"multimodal_encoder")

build_visual_engine_cmd = [
"python3", "build_multimodal_engine.py", "--model_type=vila",
f"--model_path={vila_model_root}", "--max_batch_size=32",
f"--vila_path={vila_repo_root}", f"--output_dir={multimodal_engine_dir}"
]

append_timing_cache_args(build_cmd)
convert_cmd = " ".join(convert_cmd)
build_cmd = " ".join(build_cmd)
build_visual_engine_cmd = " ".join(build_visual_engine_cmd)
if not os.path.exists(engine_dir):
check_call(convert_cmd, shell=True)
check_call(build_cmd, shell=True)
check_call(build_visual_engine_cmd,
shell=True,
cwd=tensorrt_llm_multimodal_example_root)
else:
print_info(f"Reusing engine: {engine_dir}")
print_info(f"Skipped: {convert_cmd}")
print_info(f"Skipped: {build_cmd}")
print_info(f"Skipped: {build_visual_engine_cmd}")

assert os.path.exists(engine_dir), f"{engine_dir} does not exists."
assert os.path.exists(
multimodal_engine_dir), f"{multimodal_engine_dir} does not exists."
return engine_dir, multimodal_engine_dir


def prepare_mllama_engine(tensorrt_llm_multimodal_example_root,
tensorrt_llm_mllama_example_root, mllama_model_root,
llm_backend_root):
24 changes: 0 additions & 24 deletions tests/integration/defs/triton_server/conftest.py
@@ -465,30 +465,6 @@ def llava_model_root():
return llava_model_root


@pytest.fixture(scope="session")
def vila_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
vila_model_root = os.path.join(models_root, "vila", "VILA1.5-3b")

assert os.path.exists(
vila_model_root
), f"{vila_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return vila_model_root


@pytest.fixture(scope="session")
def vila_repo_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
vila_repo_root = os.path.join(models_root, "vila", "VILA")

assert os.path.exists(
vila_repo_root
), f"{vila_repo_root} does not exist under NFS LLM_MODELS_ROOT dir"
return vila_repo_root


@pytest.fixture(scope="session")
def mllama_model_root():
models_root = llm_models_root()
145 changes: 0 additions & 145 deletions tests/integration/defs/triton_server/test_triton_llm.py
@@ -2658,151 +2658,6 @@ def test_llava_onevision(
validate_by_keyword(output_result, keyword)


@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
["max_utilization", "guaranteed_no_evict"])
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"])
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
ids=["disableTrtOverlap"])
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching", "V1"])
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
ids=["enableDecoupleMode", "disableDecoupleMode"])
@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
@pytest.mark.parametrize("DECODING_MODE", [""])
@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
@pytest.mark.parametrize("MAX_NUM_IMAGES", ["4"])
def test_vila(
E2E_MODEL_NAME,
MAX_TOKENS_IN_KV_CACHE,
MAX_ATTENTION_WINDOW_SIZE,
BATCH_SCHEDULER_POLICY,
KV_CACHE_FREE_GPU_MEM_FRACTION,
ENABLE_TRT_OVERLAP,
BATCHING_STRATEGY,
DECOUPLED_MODE,
TRITON_MAX_BATCH_SIZE,
MAX_QUEUE_DELAY_MICROSECONDS,
MAX_BEAM_WIDTH,
ENABLE_KV_CACHE_REUSE,
NORMALIZE_LOG_PROBS,
ENABLE_CHUNKED_CONTEXT,
GPU_DEVICE_IDS,
DECODING_MODE,
PREPROCESSING_INSTANCE_COUNT,
POSTPROCESSING_INSTANCE_COUNT,
ACCUMULATE_TOKEN,
BLS_INSTANCE_COUNT,
EXCLUDE_INPUT_IN_OUTPUT,
MAX_NUM_IMAGES,
tensorrt_llm_multimodal_example_root,
tensorrt_llm_llama_example_root,
vila_model_root,
vila_repo_root,
llm_backend_multimodal_example_root,
llm_backend_venv,
):
if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
pytest.skip("Skipping. V1 doesn't support max_utilization.")

if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
pytest.skip("Skipping.")

llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]

# install vila requirements
requirements_vila = os.path.join(llm_backend_repo_root, "all_models",
"multimodal", "requirements-vila.txt")
check_call(f"pip install -r {requirements_vila}", shell=True)

# Build Engine
ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_vila_engine(
tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root,
vila_model_root, vila_repo_root)
# Prepare model repo
new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)

# Prepare multimodal specific repo
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
"ensemble")
prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo,
"multimodal_encoders")

# Modify config.pbtxt
TOKENIZER_PATH = os.path.join(vila_model_root, "llm")
modify_ib_config_pbtxt(
new_model_repo,
ENGINE_PATH,
TOKENIZER_PATH,
llm_backend_repo_root,
DECOUPLED_MODE,
MAX_TOKENS_IN_KV_CACHE,
MAX_ATTENTION_WINDOW_SIZE,
BATCH_SCHEDULER_POLICY,
BATCHING_STRATEGY,
KV_CACHE_FREE_GPU_MEM_FRACTION,
EXCLUDE_INPUT_IN_OUTPUT,
ENABLE_TRT_OVERLAP,
TRITON_MAX_BATCH_SIZE,
MAX_QUEUE_DELAY_MICROSECONDS,
MAX_BEAM_WIDTH,
ENABLE_KV_CACHE_REUSE,
NORMALIZE_LOG_PROBS,
ENABLE_CHUNKED_CONTEXT,
GPU_DEVICE_IDS,
DECODING_MODE,
PREPROCESSING_INSTANCE_COUNT,
POSTPROCESSING_INSTANCE_COUNT,
ACCUMULATE_TOKEN,
BLS_INSTANCE_COUNT,
MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR,
MAX_NUM_IMAGES=MAX_NUM_IMAGES,
)

# Launch Triton Server
launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
"launch_triton_server.py")

# NOTE
# Due to mpi init error, manually set PMIX_MCA_gds=hash (ref: https://github.com/open-mpi/ompi/issues/6981)
check_call(
f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
shell=True)
check_server_ready()
# Run Test

text_prompt = "<image>\nPlease elaborate what you see in the image?"
run_cmd = [
f"{llm_backend_multimodal_example_root}/client.py",
"--model_type=vila",
f"--hf_model_dir={vila_model_root}",
f"--text='{text_prompt}'",
]
if DECOUPLED_MODE == "True":
run_cmd += [
"--streaming",
]

if E2E_MODEL_NAME == "tensorrt_llm_bls":
run_cmd += [
"--use_bls",
]

venv_check_call(llm_backend_venv, run_cmd)


@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble"])
@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
1 change: 0 additions & 1 deletion tests/integration/test_lists/test-db/l0_a30.yml
@@ -201,7 +201,6 @@ l0_a30:
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
16 changes: 0 additions & 16 deletions tests/integration/test_lists/waives.txt
@@ -465,22 +465,6 @@ unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" SKIP (https://nv
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
unittest/_torch/speculative/test_eagle3.py SKIP (https://nvbugs/5280806)
test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image] SKIP (https://nvbugs/5226211)
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/4931591)
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-V1-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-ensemble]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
triton_server/test_triton_llm.py::test_vila[4-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-ensemble]
triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP