analytics-zoo · yangw1234 · Mar 12, 2024
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
@@ -85,11 +85,13 @@ def run_vllm(
         max_model_len=max_model_len,
         enforce_eager=enforce_eager,
         device=device,
+        # NOTE(review): hard-coded (vLLM default is 0.9); consider a CLI flag.
+        gpu_memory_utilization=0.85,
     )
 
     # Add the requests to the engine.
     for prompt, _, output_len in requests:
         sampling_params = SamplingParams(
             n=n,
             temperature=0.0 if use_beam_search else 1.0,
             top_p=1.0,

diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
@@ -62,7 +62,9 @@ def get_model(model_config: ModelConfig,
     with _set_default_torch_dtype(model_config.dtype):
         # Create a model instance.
         # The weights will be initialized as empty tensors.
-        with torch.device(device_config.device):
+        # On XPU, build the model on CPU so that it is optimized before it
+        # is moved to the device; other backends keep the configured device.
+        with torch.device("cpu" if is_xpu() else device_config.device):
             model = model_class(model_config.hf_config, linear_method)
         if model_config.load_format == "dummy":
             # NOTE(woosuk): For accurate performance evaluation, we assign
@@ -76,5 +78,10 @@ def get_model(model_config: ModelConfig,
 
     if is_xpu():
         import intel_extension_for_pytorch as ipex
-        model = ipex.optimize(model)
+        from bigdl.llm import optimize_model
+        # BigDL-LLM optimization replaces ipex.optimize(); keep the returned
+        # module rather than relying on in-place mutation.
+        model = optimize_model(model)
+        # Move the optimized model to the target device and dtype.
+        model = model.to(device=device_config.device, dtype=model_config.dtype)
     return model
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
@@ -73,9 +73,9 @@ def __init__(
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
+        gate_up = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
+        x = self.down_proj(x)
         return x
 
 
@@ -148,12 +148,12 @@ def forward(
         kv_cache: KVCache,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
+        qkv = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
         k_cache, v_cache = kv_cache
         attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
-        output, _ = self.o_proj(attn_output)
+        output = self.o_proj(attn_output)
         return output