Reduce GPU memory usage: free np_map/sd and the refit test engine after use

cehongwang · cehongwang · commit a4a1767e94df · 2024-08-17T13:58:09.000-07:00
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -359,9 +359,8 @@ def check_weight_equal(
             sd_weight = sd_weight.reshape(-1)
             if not isinstance(network_weight, torch.Tensor):
                 network_weight = torch.from_numpy(network_weight).cuda()
-            return (
-                sd_weight.shape == network_weight.shape
-                and torch.all(torch.abs(sd_weight - network_weight) < 0.1).cpu()
+            return sd_weight.shape == network_weight.shape and torch.all(
+                torch.abs(sd_weight - network_weight) < 0.1
             )
 
         MODULE_MAP = {
@@ -481,6 +480,7 @@ def check_weight_equal(
         # If the model original position is on CPU, set it back to CPU and save GPU memory
         if not gm_is_on_cuda:
             self.module.to("cpu")
+        del np_map, sd
         torch.cuda.empty_cache()
 
     def run(
diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -3,6 +3,7 @@
 import logging
 from typing import Any, List, Optional, Sequence
 
+import tensorrt as trt
 import torch
 from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
 from torch_tensorrt._Device import Device
@@ -17,8 +18,6 @@
 from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
 from torch_tensorrt.dynamo.utils import get_torch_inputs
 
-import tensorrt as trt
-
 logger = logging.getLogger(__name__)
 
 
@@ -131,13 +130,13 @@ def convert_module(
     from torch_tensorrt.dynamo._refit import _refit_single_trt_engine_with_gm
     from torch_tensorrt.logging import TRT_LOGGER
 
-    runtime = trt.Runtime(TRT_LOGGER)
-    refit_test_engine = runtime.deserialize_cuda_engine(
-        interpreter_result.serialized_engine
-    )
     weight_name_map: Any = None
     # Do the test refit with cached map if make_refitable is enabled
     if settings.make_refitable:
+        runtime = trt.Runtime(TRT_LOGGER)
+        refit_test_engine = runtime.deserialize_cuda_engine(
+            interpreter_result.serialized_engine
+        )
         weight_name_map = interpreter_result.weight_name_map
         try:
             _refit_single_trt_engine_with_gm(
@@ -150,6 +149,9 @@ def convert_module(
         except AssertionError:
             logger.warning("Fast refit test failed. Removing the weight map caching.")
 
+        del refit_test_engine
+        torch.cuda.empty_cache()
+
     rt_cls = PythonTorchTensorRTModule
 
     if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime: