12 changes: 11 additions & 1 deletion tensorrt_llm/bench/build/build.py
@@ -164,6 +164,14 @@ def apply_build_mode_settings(params):
     default=False,
     help=
     "Do not load the weights from the checkpoint. Use dummy weights instead.")
+@optgroup.option(
+    "--trust_remote_code",
+    type=bool,
+    default=False,
+    help=
+    "Trust remote code for HF models that are not natively implemented in the transformers library. "
+    "This is needed when the LLM API loads the HF config to build the engine."
+)
 @optgroup.group(
     "Build Engine with Dataset Information",
     cls=AllOptionGroup,
@@ -238,6 +246,7 @@ def build_command(
     target_output_len: int = params.get("target_output_len")
 
     load_format = "dummy" if params.get("no_weights_loading") else "auto"
+    trust_remote_code: bool = params.get("trust_remote_code")
     model_name = bench_env.model
     checkpoint_path = bench_env.checkpoint_path or model_name
     model_config = get_model_config(model_name, bench_env.checkpoint_path)
@@ -315,7 +324,8 @@ def build_command(
         build_config=build_config,
         quant_config=quant_config,
         workspace=str(bench_env.workspace),
-        load_format=load_format)
+        load_format=load_format,
+        trust_remote_code=trust_remote_code)
     # Save the engine.
     llm.save(engine_dir)
     llm.shutdown()
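Note on the new option: because it is declared with `type=bool` rather than `is_flag=True`, click expects an explicit value, which is why the perf-test change further down passes `--trust_remote_code=True` instead of a bare switch. A minimal sketch of that parsing behavior (the `demo` command is hypothetical, for illustration only):

```python
import click


@click.command()
@click.option("--trust_remote_code", type=bool, default=False)
def demo(trust_remote_code):
    # click's BOOL type converts "True"/"true"/"1" (and friends) to a bool,
    # so the option takes an explicit value rather than acting as a switch.
    click.echo(f"trust_remote_code={trust_remote_code}")


if __name__ == "__main__":
    demo()  # e.g. `python demo.py --trust_remote_code=True` prints True
```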
2 changes: 1 addition & 1 deletion tensorrt_llm/models/nemotron_nas/config.py
@@ -154,7 +154,7 @@ def from_hugging_face(
             dtype: str = 'auto',
             mapping: Optional[Mapping] = None,
             quant_config: Optional[QuantConfig] = None,
-            trust_remote_code: bool = False,
+            trust_remote_code: bool = True,
             **kwargs):
         import transformers
 
2 changes: 1 addition & 1 deletion tensorrt_llm/models/nemotron_nas/model.py
@@ -642,7 +642,7 @@ def from_hugging_face(cls,
                           quant_config: Optional[QuantConfig] = None,
                           load_by_shard: bool = False,
                           load_model_on_cpu: bool = False,
-                          trust_remote_code: bool = False,
+                          trust_remote_code: bool = True,
                           **kwargs) -> "DeciLMForCausalLM":
         import transformers
 
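Both Nemotron-NAS entry points now default to trusting remote code: these checkpoints ship their DeciLM configuration and modeling classes as custom code on the Hugging Face Hub rather than inside transformers, so loading them under the previous `False` default would fail. A hedged sketch of the underlying call (standard transformers API; the checkpoint name is the one exercised by this PR's tests):

```python
from transformers import AutoConfig

# Without trust_remote_code=True this raises, because the DeciLM config
# class is defined in the checkpoint repository, not in transformers.
config = AutoConfig.from_pretrained(
    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    trust_remote_code=True,
)
```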
11 changes: 10 additions & 1 deletion tests/integration/defs/perf/test_perf.py
@@ -52,8 +52,9 @@
"llm-models/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
"llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
"llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
"llama_v3.3_nemotron_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1/",
"llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
"llama_v3.3_nemotron_super_49b":
"nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
# "llama_30b": "llama-models/llama-30b-hf",
"mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
@@ -99,6 +100,8 @@
"llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
"llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
"llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
"llama_v3.3_nemotron_super_49b_hf":
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
"mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1",
@@ -112,6 +115,10 @@

 TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")
 
+TRUST_REMOTE_CODE_MODELS = {  # these models require explicit trust_remote_code=True
+    "llama_v3.3_nemotron_super_49b"
+}
+
 
 def cpu_socket_count_gt_1():
     global MAP_BY_SOCKET
@@ -925,6 +932,8 @@ def get_trtllm_bench_build_command(self, engine_dir) -> list:
         if self._config.quantization:
             build_cmd.append(
                 f"--quantization={self._config.quantization.upper()}")
+        if self._config.model_name in TRUST_REMOTE_CODE_MODELS:
+            build_cmd.append("--trust_remote_code=True")
         return build_cmd
 
     def get_benchmark_build_command(self, engine_dir) -> list:
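For illustration, a hedged sketch of the command list the gating above produces for a registered model; every value except the new flag is a placeholder, and the exact trtllm-bench argument layout is an assumption:

```python
# Hypothetical output of get_trtllm_bench_build_command for a model in
# TRUST_REMOTE_CODE_MODELS; paths and other flags are placeholders.
build_cmd = [
    "trtllm-bench",
    "--model", "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    "build",
    "--quantization=FP8",
    "--trust_remote_code=True",  # appended only for models in the set above
]
```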
10 changes: 10 additions & 0 deletions tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
@@ -219,6 +219,16 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:512,200-quant:fp8-tp:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:4]
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4]
+  # Llama-3.3-Nemotron-Super-49B-v1
+  # trt backend
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-con:1-gpus:4]
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-quant:fp8-con:1-gpus:4]
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:1-gpus:4]
+  # timeout - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-con:1-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-con:250-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:5000,500-quant:fp8-con:250-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-con:250-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-maxbs:16-input_output_len:500,2000-quant:fp8-con:250-gpus:4]
 
 - condition:
     terms: