diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 89c079c7ed16..cd9877cc7024 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -85,11 +85,13 @@ def run_vllm(
         max_model_len=max_model_len,
         enforce_eager=enforce_eager,
         device=device,
+        gpu_memory_utilization=0.85,
     )

     # Add the requests to the engine.
     for prompt, _, output_len in requests:
         sampling_params = SamplingParams(
+            n=n,
             temperature=0.0 if use_beam_search else 1.0,
             top_p=1.0,
diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
index 8c17b3df1311..1145de416414 100644
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@@ -62,7 +62,7 @@ def get_model(model_config: ModelConfig,
     with _set_default_torch_dtype(model_config.dtype):
         # Create a model instance.
         # The weights will be initialized as empty tensors.
-        with torch.device(device_config.device):
+        with torch.device("cpu"):
             model = model_class(model_config.hf_config, linear_method)
         if model_config.load_format == "dummy":
             # NOTE(woosuk): For accurate performance evaluation, we assign
@@ -76,5 +76,8 @@ def get_model(model_config: ModelConfig,
         if is_xpu():
             import intel_extension_for_pytorch as ipex
-            model = ipex.optimize(model)
+            # Convert supported layers to low-bit implementations on the
+            # CPU, then move the optimized model to the XPU.
+            from bigdl.llm import optimize_model
+            model = optimize_model(model)
+            model = model.to(device=device_config.device, dtype=model_config.dtype)

     return model
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 3791aa893893..804b1ce4d689 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -73,9 +73,9 @@ def __init__(
         self.act_fn = SiluAndMul()

     def forward(self, x):
-        gate_up, _ = self.gate_up_proj(x)
+        gate_up = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
+        x = self.down_proj(x)
         return x
@@ -148,12 +148,12 @@ def forward(
         kv_cache: KVCache,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
+        qkv = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
-        output, _ = self.o_proj(attn_output)
+        output = self.o_proj(attn_output)
        return output
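
Note on the `model_loader.py` change: the model is now materialized on the CPU, converted by BigDL-LLM, and only then moved to the XPU. Below is a minimal standalone sketch of that load path, assuming bigdl-llm's documented `optimize_model` entry point; the `load_optimized` helper and its arguments are illustrative, not part of this patch.

```python
import torch
from bigdl.llm import optimize_model  # BigDL-LLM's documented entry point


def load_optimized(model_class, hf_config, device="xpu", dtype=torch.float16):
    # Materialize the full-precision weights on the CPU so the low-bit
    # conversion never requires the unoptimized model to fit in device memory.
    with torch.device("cpu"):
        model = model_class(hf_config)
    # Swap supported layers for low-bit implementations; optimize_model
    # returns the converted module.
    model = optimize_model(model)
    # Only the smaller, optimized model is moved to the XPU.
    return model.to(device=device, dtype=dtype)
```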
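Note on the `llama.py` change: vLLM's parallel linear layers (e.g. `QKVParallelLinear`, `RowParallelLinear`) return an `(output, output_bias)` tuple, while the modules substituted by the optimization behave like a plain `nn.Linear` and return a single tensor, hence the dropped tuple unpacking at each call site. A self-contained illustration of the two conventions; both classes here are stand-ins, not the real vLLM or BigDL modules.

```python
import torch
import torch.nn as nn


class TupleLinear(nn.Module):
    """Stand-in for vLLM's parallel linear layers, which return an
    (output, output_bias) tuple rather than a bare tensor."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.inner = nn.Linear(in_features, out_features, bias=False)

    def forward(self, x: torch.Tensor):
        return self.inner(x), None


x = torch.randn(2, 8)
out, _ = TupleLinear(8, 16)(x)          # old call site: unpack the tuple
out = nn.Linear(8, 16, bias=False)(x)   # new call site: plain tensor result
```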