diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2 index a7d633563d8..34fd1e33073 100644 --- a/server/Makefile-flash-att-v2 +++ b/server/Makefile-flash-att-v2 @@ -1,4 +1,4 @@ -flash_att_v2_commit := 4f285b354796fb17df8636485b9a04df3ebbb7dc +flash_att_v2_commit := 92dd5703ecdb99aa4a4aee9817f28557907403a2 # v2.3.6 flash-attention-v2: # Clone flash attention @@ -10,4 +10,4 @@ build-flash-attention-v2: flash-attention-v2 cd flash-attention-v2 && python setup.py build install-flash-attention-v2: build-flash-attention-v2 - cd flash-attention-v2 && python setup.py install \ No newline at end of file + cd flash-attention-v2 && python setup.py install diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index c472d1fceab..f7d4bcc7d30 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -66,12 +66,15 @@ def attention( out, cu_seqlens, cu_seqlens, + None, # seqused_k added in ce3e728 max_s, max_s, 0.0, softmax_scale, False, True, + -1, # window_size[0] added in 083e8f52. -1 means infinite window size + -1, # window_size[1] added in 083e8f52. -1 means infinite window size False, None, )