diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 4674efb812cf..17d40d0ee131 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@e096d6f
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index 2c62e565c04c..f90d15d4207e 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -111,8 +111,16 @@ def __init__(
         self.matmul_qk = Matmul()
         self.softmax = Softmax()
         self.matmul_av = Matmul()
+        self.batch2block_matmul = Matmul()
+        self.block2batch_matmul = Matmul()
+        # NOTE(kzawora): Contiguous PA is off until model runner supports it
         self.k_cache = VLLMKVCache()
+        self.k_cache.use_contiguous_pa = False
         self.v_cache = VLLMKVCache()
+        self.v_cache.use_contiguous_pa = False
+        # NOTE(kzawora): Pipelined PA is off until model runner supports it
+        ops.pa_impl = ops.pa
+
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
         self.sliding_window = sliding_window
         self.alibi_slopes = alibi_slopes
@@ -228,9 +236,12 @@ def forward(
                 block_mapping=attn_metadata.block_mapping,
                 block_bias=attn_metadata.attn_bias,
                 block_scales=attn_metadata.block_scales,
+                block_groups=None,
                 scale=self.scale,
                 matmul_qk_op=self.matmul_qk,
                 matmul_av_op=self.matmul_av,
+                batch2block_matmul_op=self.batch2block_matmul,
+                block2batch_matmul_op=self.block2batch_matmul,
                 keys_fetch_func=self.k_cache.fetch_from_cache,
                 values_fetch_func=self.v_cache.fetch_from_cache)
         # Reshape the output tensor.
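
For reference, both NOTE'd toggles work by plain attribute rebinding: contiguous PA is switched off per KV-cache instance, and paged attention is pinned to the non-pipelined kernel by pointing the module-level `ops.pa_impl` at `ops.pa`. Below is a minimal runnable sketch of that mechanism with hypothetical stand-ins for the vllm-hpu-extension objects; only the names `ops.pa`, `ops.pa_impl`, `VLLMKVCache`, and `use_contiguous_pa` come from the diff, while the function bodies and defaults are illustrative assumptions.

from types import SimpleNamespace

# Hypothetical stand-ins for vllm-hpu-extension symbols; only the names
# mirror the diff, the bodies and defaults are illustrative assumptions.
def pa(*args, **kwargs):
    return "plain paged attention"

def pa_pipelined(*args, **kwargs):
    return "pipelined paged attention"

# Assume the extension ships with the pipelined kernel as its dispatch target.
ops = SimpleNamespace(pa=pa, pa_impl=pa_pipelined)

class VLLMKVCache:
    def __init__(self):
        self.use_contiguous_pa = True  # assumed extension default

# What HPUAttentionImpl.__init__ now does, per the diff:
k_cache = VLLMKVCache()
v_cache = VLLMKVCache()
k_cache.use_contiguous_pa = False  # contiguous PA off until the model runner supports it
v_cache.use_contiguous_pa = False
ops.pa_impl = ops.pa               # pin decode to the non-pipelined implementation

assert ops.pa_impl() == "plain paged attention"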