From efc655b2fb8ac1c7b8714bac5aa176d0557ce249 Mon Sep 17 00:00:00 2001 From: gc-fu Date: Thu, 24 Oct 2024 13:57:18 +0800 Subject: [PATCH 1/4] temp --- csrc/prepare_inputs/advance_step.cu | 5 ++ csrc/xpu/attention_xpu.cpp | 66 +++++++++++++++ csrc/xpu/pybind.cpp | 6 ++ csrc/xpu/xpu_ops.h | 9 ++ vllm/attention/backends/ipex_attn.py | 2 + vllm/executor/ray_utils.py | 2 + vllm/worker/xpu_multi_step_model_runner.py | 99 +++++++++++++++------- 7 files changed, 159 insertions(+), 30 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index a9d08ca0dc14..315741e34061 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -19,10 +19,12 @@ __global__ void advance_step_flashattn_kernel( int64_t const block_tables_stride) { int num_query_blocks = div_ceil(num_queries, num_threads); + // We only need num_query_blocks in total... if (blockIdx.x >= num_query_blocks) { return; } + // For this thread, if cur_query_id >= num_queries, then done... int cur_query_id = blockIdx.x * num_threads + threadIdx.x; if (cur_query_id >= num_queries) { @@ -30,6 +32,7 @@ __global__ void advance_step_flashattn_kernel( } // Update input_tokens + // Handle for this query input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; int seq_len = seq_lens_ptr[cur_query_id]; @@ -181,6 +184,8 @@ void advance_step_flashattn(int num_seqs, int num_queries, int block_size, int blocks; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); + // Dimension is blocks, max_threads + // Get num available blocks... advance_step_flashattn_kernel <<>>( num_seqs, num_queries, block_size, diff --git a/csrc/xpu/attention_xpu.cpp b/csrc/xpu/attention_xpu.cpp index 4fc9afe78680..effa1b424651 100644 --- a/csrc/xpu/attention_xpu.cpp +++ b/csrc/xpu/attention_xpu.cpp @@ -1254,4 +1254,70 @@ void paged_attention_v2( query.scalar_type(), "paged_attention_xpu_v2_impl", [&] { CALL_V2_LAUNCHER_BLOCK_SIZE(scalar_t); }); +} + + +constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } + +void advance_step_ipex(int num_seqs, int num_queries, int block_size, + torch::Tensor& input_tokens, // type: long + torch::Tensor& sampled_token_ids, // type: long + torch::Tensor& input_positions, // type: long + torch::Tensor& seq_lens, // type: int + torch::Tensor& slot_mapping, // type: long + torch::Tensor& block_tables) { + // std::cout << "advance step ipex get called!!!!!!" 
<< std::endl; + sycl::queue& queue = vllm::xpu::vllmGetQueue(); + int num_blocks = 32; + int num_threads = 128; + long* input_tokens_ptr = reinterpret_cast(input_tokens.data_ptr()); + long const* sampled_token_ids_ptr = reinterpret_cast(sampled_token_ids.data_ptr()); + long* input_positions_ptr = reinterpret_cast(input_positions.data_ptr()); + int* seq_lens_ptr = reinterpret_cast(seq_lens.data_ptr()); + long* slot_mapping_ptr = reinterpret_cast(slot_mapping.data_ptr()); + int const* block_tables_ptr = reinterpret_cast(block_tables.data_ptr()); + int64_t const block_tables_stride = block_tables.stride(0); + sycl::range<1> grid(num_blocks); + sycl::range<1> block(num_threads); + queue.submit([&](sycl::handler & cgh){ + cgh.parallel_for( + sycl::nd_range<1>(grid * block, block), + [=](sycl::nd_item<1> item_ct1){ + //constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } + int num_query_blocks = div_ceil(num_queries, num_threads); + + int group = item_ct1.get_group(0); + + if (group >= num_query_blocks) { + return; + } + + int cur_query_id = group * num_threads + item_ct1.get_local_id(0); + + if (cur_query_id >= num_queries) { + return; + } + + input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; + int seq_len = seq_lens_ptr[cur_query_id]; + int next_seq_len = seq_len + 1; + int next_input_pos = next_seq_len - 1; + + // Update seq_lens + seq_lens_ptr[cur_query_id] = next_seq_len; + // Update input_positions + input_positions_ptr[cur_query_id] = next_input_pos; + + int const* seq_block_tables_ptr = + block_tables_ptr + block_tables_stride * cur_query_id; + int block_index = next_input_pos / block_size; + int block_offset = next_input_pos % block_size; + + int slot_num = + seq_block_tables_ptr[block_index] * block_size + block_offset; + // Update slot_mapping + slot_mapping_ptr[cur_query_id] = slot_num; + } + ); + }); } \ No newline at end of file diff --git a/csrc/xpu/pybind.cpp b/csrc/xpu/pybind.cpp index 4e7f2fa6bd80..b70b2eba121a 100644 --- a/csrc/xpu/pybind.cpp +++ b/csrc/xpu/pybind.cpp @@ -69,6 +69,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "reshape_and_cache", &reshape_and_cache, "Reshape the key and value tensors and cache them"); + + ops.def( + "advance_step_ipex", + &advance_step_ipex, + "Advance steps function used in multi-steps scheduler" + ); // Quant ops.def( diff --git a/csrc/xpu/xpu_ops.h b/csrc/xpu/xpu_ops.h index 6125b19ac80b..2dcc82dcbbc0 100644 --- a/csrc/xpu/xpu_ops.h +++ b/csrc/xpu/xpu_ops.h @@ -93,6 +93,15 @@ torch::Tensor marlin_gemm( TORCH_CHECK(false, "marlin_gemm is not supported on XPU."); } + +void advance_step_ipex(int num_seqs, int num_queries, int block_size, + torch::Tensor& input_tokens, // type: long + torch::Tensor& sampled_token_ids, // type: long + torch::Tensor& input_positions, // type: long + torch::Tensor& seq_lens, // type: int + torch::Tensor& slot_mapping, // type: long + torch::Tensor& block_tables); + torch::Tensor awq_dequantize(torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 4e595390637f..5a5c3fb933da 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -131,6 +131,8 @@ def advance_step(self, num_seqs, num_queries): for i in range(num_queries): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) + # import vllm._C.ops + # vllm._C.ops.advance_step_ipex(0,0,0,torch.empty(3), torch.empty(3), torch.empty(3), torch.empty(3), torch.empty(3), 
torch.empty(3)) from torch.nn.functional import scaled_dot_product_attention diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7e46acefc5b0..f1e00619f1a0 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -32,6 +32,8 @@ class RayWorkerWrapper(WorkerWrapperBase): lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" def __init__(self, *args, **kwargs) -> None: + from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert + _ipex_llm_convert(load_in_low_bit="fp8") super().__init__(*args, **kwargs) # Since the compiled DAG runs a main execution # in a different thread that calls cuda.set_device. diff --git a/vllm/worker/xpu_multi_step_model_runner.py b/vllm/worker/xpu_multi_step_model_runner.py index 2bd1f84698a3..60e14a118e3c 100644 --- a/vllm/worker/xpu_multi_step_model_runner.py +++ b/vllm/worker/xpu_multi_step_model_runner.py @@ -431,6 +431,8 @@ def execute_model( # Pythonize the output and block if needed since it is the last step if model_input.is_last_step: + # import pdb + # pdb.set_trace() outputs = self._final_process_outputs(model_input, output_proc_callback) self.pythonization_cache.reset() @@ -472,39 +474,76 @@ def _advance_step(self, model_input: XPUStatefulModelInput, attn_metadata = frozen_model_input.attn_metadata assert isinstance(attn_metadata, IpexAttnMetadata) + # Add one to self.seq_lens attn_metadata.advance_step(num_seqs, num_queries) - - # refer ops.advance_step() - next_seq_len = attn_metadata.seq_lens_tensor + 1 - next_input_pos = next_seq_len - 1 - attn_metadata.seq_lens_tensor = next_seq_len - - block_index = next_input_pos // self.block_size - block_offset = next_input_pos % self.block_size - slot = attn_metadata.block_tables - slot_num = slot[torch.arange(num_queries), block_index] * self.block_size + block_offset - attn_metadata.slot_mapping = slot_num.to(dtype=torch.long) - - tmp_input_tokens = frozen_model_input.input_tokens sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - if sampled_token_ids.dim() > 1 and sampled_token_ids.size(-1) == 1: - sampled_token_ids = sampled_token_ids.squeeze(-1) - tmp_input_tokens[:num_queries] = sampled_token_ids[:num_queries] - tmp_input_positions = frozen_model_input.input_positions - tmp_input_positions[:num_queries] = next_input_pos[:num_queries] - frozen_model_input = dataclasses.replace( - frozen_model_input, - input_tokens=tmp_input_tokens, - input_positions=tmp_input_positions, - ) - if frozen_model_input.seq_lens is not None: - tmp_seq_lens = frozen_model_input.seq_lens - tmp_seq_lens[:num_queries] = attn_metadata.seq_lens[:num_queries] - frozen_model_input = dataclasses.replace( - frozen_model_input, - seq_lens=tmp_seq_lens, - ) + cloned_input_tokens = frozen_model_input.input_tokens.clone() + cloned_sampled_token_ids = sampled_token_ids.clone() + cloned_input_positions = frozen_model_input.input_positions.clone() + cloned_seq_lens = attn_metadata.seq_lens_tensor.clone() + cloned_slot_mappings = attn_metadata.slot_mapping.clone() + cloned_block_tables = attn_metadata.block_tables.clone() + + # import pdb + # pdb.set_trace() + import vllm._C.ops + # update the following: input_tokens, seq_lens, input_positions, slot_mapping + vllm._C.ops.advance_step_ipex(num_seqs, num_queries, self.block_size, frozen_model_input.input_tokens, sampled_token_ids, frozen_model_input.input_positions, attn_metadata.seq_lens_tensor, attn_metadata.slot_mapping, attn_metadata.block_tables) + # torch.xpu.synchronize() + # vllm._C.ops.advance_step_ipex(num_seqs, 
num_queries, self.block_size, cloned_input_tokens, cloned_sampled_token_ids, cloned_input_positions, cloned_seq_lens, cloned_slot_mappings, cloned_block_tables) + + # refer ops.advance_step() + # What we tryna do: + # Update model_input + + # 1. construct next_seq_len and next_input_pos + # next_seq_len = attn_metadata.seq_lens_tensor + 1 + # # 2. Construct next_input_pos, which is current_input_pos + # next_input_pos = next_seq_len - 1 + # # 3. Update attn_metadata + # attn_metadata.seq_lens_tensor = next_seq_len + + # block_index = next_input_pos // self.block_size + # block_offset = next_input_pos % self.block_size + # slot = attn_metadata.block_tables + # # 4. Calculate slot_mapping + # slot_num = slot[torch.arange(num_queries), block_index] * self.block_size + block_offset + # attn_metadata.slot_mapping = slot_num.to(dtype=torch.long) + + # tmp_input_tokens = frozen_model_input.input_tokens + # sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids + # if sampled_token_ids.dim() > 1 and sampled_token_ids.size(-1) == 1: + # sampled_token_ids = sampled_token_ids.squeeze(-1) + # # 5. Construct tmp_input_tokens, set to sampled_token_ids + # tmp_input_tokens[:num_queries] = sampled_token_ids[:num_queries] + # # 6. Construct tmp_input_positions, set to next_input_pos + # tmp_input_positions = frozen_model_input.input_positions + # tmp_input_positions[:num_queries] = next_input_pos[:num_queries] + # # 7. Set input_tokens and input_positions. + # frozen_model_input = dataclasses.replace( + # frozen_model_input, + # input_tokens=tmp_input_tokens, + # input_positions=tmp_input_positions, + # ) + + # Reset seq_lens + # if frozen_model_input.seq_lens is not None: + # tmp_seq_lens = frozen_model_input.seq_lens + # tmp_seq_lens[:num_queries] = attn_metadata.seq_lens[:num_queries] + # frozen_model_input = dataclasses.replace( + # frozen_model_input, + # seq_lens=tmp_seq_lens, + # ) + # else: + # print("We are finding a None seq_lens!!!!!!!!!!!!!!!!!!!!") + + # assert torch.equal(frozen_model_input.input_tokens, cloned_input_tokens) + # assert torch.equal(frozen_model_input.input_positions, cloned_input_positions) + # assert torch.equal(attn_metadata.slot_mapping, cloned_slot_mappings) + # assert torch.equal(attn_metadata.seq_lens_tensor, cloned_seq_lens) + + # print("All checked passed") return model_input From f40d8d64ca1a6b702aa29df9523faada546e17cf Mon Sep 17 00:00:00 2001 From: gc-fu Date: Fri, 25 Oct 2024 15:43:18 +0800 Subject: [PATCH 2/4] add sycl kernel for scheduler --- csrc/xpu/attention_xpu.cpp | 5 +- vllm/worker/xpu_multi_step_model_runner.py | 88 +++++++++------------- 2 files changed, 39 insertions(+), 54 deletions(-) diff --git a/csrc/xpu/attention_xpu.cpp b/csrc/xpu/attention_xpu.cpp index effa1b424651..323af828a52c 100644 --- a/csrc/xpu/attention_xpu.cpp +++ b/csrc/xpu/attention_xpu.cpp @@ -1268,8 +1268,9 @@ void advance_step_ipex(int num_seqs, int num_queries, int block_size, torch::Tensor& block_tables) { // std::cout << "advance step ipex get called!!!!!!" 
<< std::endl; sycl::queue& queue = vllm::xpu::vllmGetQueue(); - int num_blocks = 32; - int num_threads = 128; + // TODO: we might want to adjust this value + int num_blocks = 1024; + int num_threads = 32; long* input_tokens_ptr = reinterpret_cast(input_tokens.data_ptr()); long const* sampled_token_ids_ptr = reinterpret_cast(sampled_token_ids.data_ptr()); long* input_positions_ptr = reinterpret_cast(input_positions.data_ptr()); diff --git a/vllm/worker/xpu_multi_step_model_runner.py b/vllm/worker/xpu_multi_step_model_runner.py index 60e14a118e3c..8cf676600853 100644 --- a/vllm/worker/xpu_multi_step_model_runner.py +++ b/vllm/worker/xpu_multi_step_model_runner.py @@ -478,66 +478,50 @@ def _advance_step(self, model_input: XPUStatefulModelInput, attn_metadata.advance_step(num_seqs, num_queries) sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - cloned_input_tokens = frozen_model_input.input_tokens.clone() - cloned_sampled_token_ids = sampled_token_ids.clone() - cloned_input_positions = frozen_model_input.input_positions.clone() - cloned_seq_lens = attn_metadata.seq_lens_tensor.clone() - cloned_slot_mappings = attn_metadata.slot_mapping.clone() - cloned_block_tables = attn_metadata.block_tables.clone() - - # import pdb - # pdb.set_trace() + # cloned_input_tokens = frozen_model_input.input_tokens.clone() + # cloned_sampled_token_ids = sampled_token_ids.clone() + # cloned_input_positions = frozen_model_input.input_positions.clone() + # cloned_seq_lens = attn_metadata.seq_lens_tensor.clone() + # cloned_slot_mappings = attn_metadata.slot_mapping.clone() + # cloned_block_tables = attn_metadata.block_tables.clone() + import vllm._C.ops - # update the following: input_tokens, seq_lens, input_positions, slot_mapping vllm._C.ops.advance_step_ipex(num_seqs, num_queries, self.block_size, frozen_model_input.input_tokens, sampled_token_ids, frozen_model_input.input_positions, attn_metadata.seq_lens_tensor, attn_metadata.slot_mapping, attn_metadata.block_tables) # torch.xpu.synchronize() # vllm._C.ops.advance_step_ipex(num_seqs, num_queries, self.block_size, cloned_input_tokens, cloned_sampled_token_ids, cloned_input_positions, cloned_seq_lens, cloned_slot_mappings, cloned_block_tables) # refer ops.advance_step() - # What we tryna do: - # Update model_input - - # 1. construct next_seq_len and next_input_pos - # next_seq_len = attn_metadata.seq_lens_tensor + 1 - # # 2. Construct next_input_pos, which is current_input_pos - # next_input_pos = next_seq_len - 1 - # # 3. Update attn_metadata - # attn_metadata.seq_lens_tensor = next_seq_len - - # block_index = next_input_pos // self.block_size - # block_offset = next_input_pos % self.block_size - # slot = attn_metadata.block_tables - # # 4. Calculate slot_mapping - # slot_num = slot[torch.arange(num_queries), block_index] * self.block_size + block_offset - # attn_metadata.slot_mapping = slot_num.to(dtype=torch.long) - - # tmp_input_tokens = frozen_model_input.input_tokens - # sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - # if sampled_token_ids.dim() > 1 and sampled_token_ids.size(-1) == 1: - # sampled_token_ids = sampled_token_ids.squeeze(-1) - # # 5. Construct tmp_input_tokens, set to sampled_token_ids - # tmp_input_tokens[:num_queries] = sampled_token_ids[:num_queries] - # # 6. Construct tmp_input_positions, set to next_input_pos - # tmp_input_positions = frozen_model_input.input_positions - # tmp_input_positions[:num_queries] = next_input_pos[:num_queries] - # # 7. Set input_tokens and input_positions. 
- # frozen_model_input = dataclasses.replace( - # frozen_model_input, - # input_tokens=tmp_input_tokens, - # input_positions=tmp_input_positions, - # ) + ##################### Original implementation ################### + next_seq_len = attn_metadata.seq_lens_tensor + 1 + next_input_pos = next_seq_len - 1 + attn_metadata.seq_lens_tensor = next_seq_len + + block_index = next_input_pos // self.block_size + block_offset = next_input_pos % self.block_size + slot = attn_metadata.block_tables + slot_num = slot[torch.arange(num_queries), block_index] * self.block_size + block_offset + attn_metadata.slot_mapping = slot_num.to(dtype=torch.long) + + tmp_input_tokens = frozen_model_input.input_tokens + if sampled_token_ids.dim() > 1 and sampled_token_ids.size(-1) == 1: + sampled_token_ids = sampled_token_ids.squeeze(-1) + tmp_input_tokens[:num_queries] = sampled_token_ids[:num_queries] + tmp_input_positions = frozen_model_input.input_positions + tmp_input_positions[:num_queries] = next_input_pos[:num_queries] + frozen_model_input = dataclasses.replace( + frozen_model_input, + input_tokens=tmp_input_tokens, + input_positions=tmp_input_positions, + ) # Reset seq_lens - # if frozen_model_input.seq_lens is not None: - # tmp_seq_lens = frozen_model_input.seq_lens - # tmp_seq_lens[:num_queries] = attn_metadata.seq_lens[:num_queries] - # frozen_model_input = dataclasses.replace( - # frozen_model_input, - # seq_lens=tmp_seq_lens, - # ) - # else: - # print("We are finding a None seq_lens!!!!!!!!!!!!!!!!!!!!") - + if frozen_model_input.seq_lens is not None: + tmp_seq_lens = frozen_model_input.seq_lens + tmp_seq_lens[:num_queries] = attn_metadata.seq_lens[:num_queries] + frozen_model_input = dataclasses.replace( + frozen_model_input, + seq_lens=tmp_seq_lens, + ) # assert torch.equal(frozen_model_input.input_tokens, cloned_input_tokens) # assert torch.equal(frozen_model_input.input_positions, cloned_input_positions) # assert torch.equal(attn_metadata.slot_mapping, cloned_slot_mappings) From ad590af5fa2fccccafbfe72aedee30a6d912ce25 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Mon, 23 Sep 2024 18:38:04 -0400 Subject: [PATCH 3/4] Enable new option --multi-step-stream-outputs --- vllm/sequence.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/sequence.py b/vllm/sequence.py index 49a198df045b..f0b1eaa8d98f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -695,6 +695,8 @@ def __init__( self.cached_request_output = None + self.cached_request_output = None + @property def prompt(self) -> Optional[str]: # All sequences in the group should have the same prompt. From fd9d2cdea496dfd83b5c61eac5eb4ddfc97626bd Mon Sep 17 00:00:00 2001 From: gc-fu Date: Tue, 29 Oct 2024 15:20:51 +0800 Subject: [PATCH 4/4] fix --- csrc/prepare_inputs/advance_step.cu | 5 ----- csrc/xpu/attention_xpu.cpp | 2 +- vllm/attention/backends/ipex_attn.py | 2 -- vllm/executor/ray_utils.py | 2 -- vllm/sequence.py | 3 --- vllm/worker/xpu_multi_step_model_runner.py | 7 +++---- 6 files changed, 4 insertions(+), 17 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 315741e34061..a9d08ca0dc14 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -19,12 +19,10 @@ __global__ void advance_step_flashattn_kernel( int64_t const block_tables_stride) { int num_query_blocks = div_ceil(num_queries, num_threads); - // We only need num_query_blocks in total... 
if (blockIdx.x >= num_query_blocks) { return; } - // For this thread, if cur_query_id >= num_queries, then done... int cur_query_id = blockIdx.x * num_threads + threadIdx.x; if (cur_query_id >= num_queries) { @@ -32,7 +30,6 @@ __global__ void advance_step_flashattn_kernel( } // Update input_tokens - // Handle for this query input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; int seq_len = seq_lens_ptr[cur_query_id]; @@ -184,8 +181,6 @@ void advance_step_flashattn(int num_seqs, int num_queries, int block_size, int blocks; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); - // Dimension is blocks, max_threads - // Get num available blocks... advance_step_flashattn_kernel <<>>( num_seqs, num_queries, block_size, diff --git a/csrc/xpu/attention_xpu.cpp b/csrc/xpu/attention_xpu.cpp index 323af828a52c..3f4d71e9d7fc 100644 --- a/csrc/xpu/attention_xpu.cpp +++ b/csrc/xpu/attention_xpu.cpp @@ -1321,4 +1321,4 @@ void advance_step_ipex(int num_seqs, int num_queries, int block_size, } ); }); -} \ No newline at end of file +} diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 5a5c3fb933da..4e595390637f 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -131,8 +131,6 @@ def advance_step(self, num_seqs, num_queries): for i in range(num_queries): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - # import vllm._C.ops - # vllm._C.ops.advance_step_ipex(0,0,0,torch.empty(3), torch.empty(3), torch.empty(3), torch.empty(3), torch.empty(3), torch.empty(3)) from torch.nn.functional import scaled_dot_product_attention diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index f1e00619f1a0..7e46acefc5b0 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -32,8 +32,6 @@ class RayWorkerWrapper(WorkerWrapperBase): lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" def __init__(self, *args, **kwargs) -> None: - from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert - _ipex_llm_convert(load_in_low_bit="fp8") super().__init__(*args, **kwargs) # Since the compiled DAG runs a main execution # in a different thread that calls cuda.set_device. 
diff --git a/vllm/sequence.py b/vllm/sequence.py index f0b1eaa8d98f..a46042a4f821 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -692,9 +692,6 @@ def __init__( self.encoder_seq = encoder_seq self.trace_headers = trace_headers self.priority = priority - - self.cached_request_output = None - self.cached_request_output = None @property diff --git a/vllm/worker/xpu_multi_step_model_runner.py b/vllm/worker/xpu_multi_step_model_runner.py index 8cf676600853..71e851dd5210 100644 --- a/vllm/worker/xpu_multi_step_model_runner.py +++ b/vllm/worker/xpu_multi_step_model_runner.py @@ -431,8 +431,6 @@ def execute_model( # Pythonize the output and block if needed since it is the last step if model_input.is_last_step: - # import pdb - # pdb.set_trace() outputs = self._final_process_outputs(model_input, output_proc_callback) self.pythonization_cache.reset() @@ -485,8 +483,9 @@ def _advance_step(self, model_input: XPUStatefulModelInput, # cloned_slot_mappings = attn_metadata.slot_mapping.clone() # cloned_block_tables = attn_metadata.block_tables.clone() - import vllm._C.ops - vllm._C.ops.advance_step_ipex(num_seqs, num_queries, self.block_size, frozen_model_input.input_tokens, sampled_token_ids, frozen_model_input.input_positions, attn_metadata.seq_lens_tensor, attn_metadata.slot_mapping, attn_metadata.block_tables) + ############### New implementation ############################## + # import vllm._C.ops + # vllm._C.ops.advance_step_ipex(num_seqs, num_queries, self.block_size, frozen_model_input.input_tokens, sampled_token_ids, frozen_model_input.input_positions, attn_metadata.seq_lens_tensor, attn_metadata.slot_mapping, attn_metadata.block_tables) # torch.xpu.synchronize() # vllm._C.ops.advance_step_ipex(num_seqs, num_queries, self.block_size, cloned_input_tokens, cloned_sampled_token_ids, cloned_input_positions, cloned_seq_lens, cloned_slot_mappings, cloned_block_tables)
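
Note (reviewer addition, not part of the patches above): the SYCL kernel registered as advance_step_ipex updates, for each query, the next input token, input position, sequence length, and KV-cache slot. A minimal pure-PyTorch sketch of that per-query update is given below for reference; it mirrors the Python fallback kept under "Original implementation" in xpu_multi_step_model_runner.py. The function name and the assumption that sampled_token_ids has already been squeezed to a 1-D long tensor are illustrative only and do not appear in the diff.

    import torch

    def advance_step_reference(num_queries: int, block_size: int,
                               input_tokens: torch.Tensor,       # long, [num_seqs]
                               sampled_token_ids: torch.Tensor,  # long, [num_queries]
                               input_positions: torch.Tensor,    # long, [num_seqs]
                               seq_lens: torch.Tensor,           # int,  [num_seqs]
                               slot_mapping: torch.Tensor,       # long, [num_seqs]
                               block_tables: torch.Tensor):      # int,  [num_seqs, max_blocks]
        # The next decode step consumes the token sampled in the previous step.
        input_tokens[:num_queries] = sampled_token_ids[:num_queries]
        # Each running query grows by one token; its new position is next_seq_len - 1.
        next_seq_len = seq_lens[:num_queries] + 1
        next_input_pos = next_seq_len - 1
        seq_lens[:num_queries] = next_seq_len
        input_positions[:num_queries] = next_input_pos
        # Map the new position to a physical KV-cache slot through the block table.
        block_index = next_input_pos // block_size
        block_offset = next_input_pos % block_size
        physical_block = block_tables[torch.arange(num_queries), block_index]
        slot_mapping[:num_queries] = (physical_block * block_size
                                      + block_offset).to(torch.long)

This is the same arithmetic the kernel performs with one work-item per query; the fused op exists to avoid the several small device kernels and host round-trips the tensorized version launches every scheduler step.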