12 changes: 11 additions & 1 deletion tensorrt_llm/bench/build/build.py
@@ -164,6 +164,14 @@ def apply_build_mode_settings(params):
     default=False,
     help=
     "Do not load the weights from the checkpoint. Use dummy weights instead.")
+@optgroup.option(
+    "--trust_remote_code",
+    type=bool,
+    default=False,
+    help=
+    "Trust remote code for HF models that are not natively implemented in the transformers library. "
+    "This is needed when the LLM API loads the HF config to build the engine."
+)
 @optgroup.group(
     "Build Engine with Dataset Information",
     cls=AllOptionGroup,
@@ -238,6 +246,7 @@ def build_command(
     target_output_len: int = params.get("target_output_len")
 
     load_format = "dummy" if params.get("no_weights_loading") else "auto"
+    trust_remote_code: bool = params.get("trust_remote_code")
     model_name = bench_env.model
     checkpoint_path = bench_env.checkpoint_path or model_name
     model_config = get_model_config(model_name, bench_env.checkpoint_path)
@@ -315,7 +324,8 @@ def build_command(
         build_config=build_config,
         quant_config=quant_config,
         workspace=str(bench_env.workspace),
-        load_format=load_format)
+        load_format=load_format,
+        trust_remote_code=trust_remote_code)
     # Save the engine.
     llm.save(engine_dir)
     llm.shutdown()
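Note on the new option: because it is declared with `type=bool` rather than `is_flag=True`, click expects an explicit value, which is why the perf-test change further down passes `--trust_remote_code=True` instead of a bare switch. A minimal sketch of that parsing behavior (the `demo` command is hypothetical, for illustration only):

```python
import click


@click.command()
@click.option("--trust_remote_code", type=bool, default=False)
def demo(trust_remote_code):
    # click's BOOL type converts "True"/"true"/"1" (and friends) to a bool,
    # so the option takes an explicit value rather than acting as a switch.
    click.echo(f"trust_remote_code={trust_remote_code}")


if __name__ == "__main__":
    demo()  # e.g. `python demo.py --trust_remote_code=True` prints True
```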
2 changes: 1 addition & 1 deletion tensorrt_llm/models/nemotron_nas/config.py
@@ -154,7 +154,7 @@ def from_hugging_face(
             dtype: str = 'auto',
             mapping: Optional[Mapping] = None,
             quant_config: Optional[QuantConfig] = None,
-            trust_remote_code: bool = False,
+            trust_remote_code: bool = True,
             **kwargs):
         import transformers
 
2 changes: 1 addition & 1 deletion tensorrt_llm/models/nemotron_nas/model.py
@@ -642,7 +642,7 @@ def from_hugging_face(cls,
                           quant_config: Optional[QuantConfig] = None,
                           load_by_shard: bool = False,
                           load_model_on_cpu: bool = False,
-                          trust_remote_code: bool = False,
+                          trust_remote_code: bool = True,
                           **kwargs) -> "DeciLMForCausalLM":
         import transformers
 
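Both Nemotron-NAS entry points now default to trusting remote code: these checkpoints ship their DeciLM configuration and modeling classes as custom code on the Hugging Face Hub rather than inside transformers, so loading them under the previous `False` default would fail. A hedged sketch of the underlying call (standard transformers API; the checkpoint name is the one exercised by this PR's tests):

```python
from transformers import AutoConfig

# Without trust_remote_code=True this raises, because the DeciLM config
# class is defined in the checkpoint repository, not in transformers.
config = AutoConfig.from_pretrained(
    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    trust_remote_code=True,
)
```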
11 changes: 10 additions & 1 deletion tests/integration/defs/perf/test_perf.py
@@ -52,8 +52,9 @@
"llm-models/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
"llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
"llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
"llama_v3.3_nemotron_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1/",
"llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
"llama_v3.3_nemotron_super_49b":
"nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
# "llama_30b": "llama-models/llama-30b-hf",
"mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
@@ -99,6 +100,8 @@
"llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
"llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
"llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
"llama_v3.3_nemotron_super_49b_hf":
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
"mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1",
@@ -112,6 +115,10 @@

 TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")
 
+TRUST_REMOTE_CODE_MODELS = {  # these models require explicit trust_remote_code=True
+    "llama_v3.3_nemotron_super_49b"
+}
+
 
 def cpu_socket_count_gt_1():
     global MAP_BY_SOCKET
@@ -925,6 +932,8 @@ def get_trtllm_bench_build_command(self, engine_dir) -> list:
         if self._config.quantization:
             build_cmd.append(
                 f"--quantization={self._config.quantization.upper()}")
+        if self._config.model_name in TRUST_REMOTE_CODE_MODELS:
+            build_cmd.append("--trust_remote_code=True")
         return build_cmd
 
     def get_benchmark_build_command(self, engine_dir) -> list:
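For illustration, a hedged sketch of the command list the gating above produces for a registered model; every value except the new flag is a placeholder, and the exact trtllm-bench argument layout is an assumption:

```python
# Hypothetical output of get_trtllm_bench_build_command for a model in
# TRUST_REMOTE_CODE_MODELS; paths and other flags are placeholders.
build_cmd = [
    "trtllm-bench",
    "--model", "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    "build",
    "--quantization=FP8",
    "--trust_remote_code=True",  # appended only for models in the set above
]
```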
10 changes: 10 additions & 0 deletions tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
@@ -219,6 +219,16 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:4]
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4]
+  # Llama-3.3-Nemotron-Super-49B-v1
+  # trt backend
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-con:1-gpus:4]
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-quant:fp8-con:1-gpus:4]
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:1-gpus:4]
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-con:1-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-con:250-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-quant:fp8-con:250-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-con:250-gpus:4]
 
 - condition:
     terms: