From e5a1fa840b66e7628ef1a3ef19629026c2dd98ee Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Fri, 3 Mar 2023 04:16:04 +0000
Subject: [PATCH 1/4] Fix a bug in 1D shape

---
 cacheflow/models/attention.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cacheflow/models/attention.py b/cacheflow/models/attention.py
index 34edeec02cbc..8b2fb3f85b2a 100644
--- a/cacheflow/models/attention.py
+++ b/cacheflow/models/attention.py
@@ -47,9 +47,8 @@ def multi_query_kv_attention(
             max_s=max_prompt_len,
             causal=True,
         )[0]
-        num_tokens = prefix_sum[-1]
         # FIXME(woosuk): Unnecessary copy. Optimize this.
-        output[:num_tokens].copy_(out, non_blocking=True)
+        output.copy_(out, non_blocking=True)
 
     def single_query_cached_kv_attention(
         self,
@@ -108,8 +107,13 @@ def forward(
 
         # Compute the attention op for prompts.
         if input_metadata.num_prompts > 0:
+            num_prompt_tokens = sum(input_metadata.prompt_lens)
             self.multi_query_kv_attention(
-                output, query, key, value, input_metadata.prompt_lens)
+                output[:num_prompt_tokens],
+                query[:num_prompt_tokens],
+                key[:num_prompt_tokens],
+                value[:num_prompt_tokens],
+                input_metadata.prompt_lens)
 
         # Wait until the cache op is done.
         if cache_event is not None:

From 342275fdcd6bc2ba5332335bdb0a53e46b2011e0 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Fri, 3 Mar 2023 04:16:15 +0000
Subject: [PATCH 2/4] Minor

---
 cacheflow/models/input_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cacheflow/models/input_metadata.py b/cacheflow/models/input_metadata.py
index 86cc2e8f1f5a..77f25054e38a 100644
--- a/cacheflow/models/input_metadata.py
+++ b/cacheflow/models/input_metadata.py
@@ -24,7 +24,7 @@ def __init__(
 
         self.num_prompts = len(prompt_lens)
         self.num_generation_tokens = context_lens.shape[0]
-        self.num_valid_tokens = len(slot_mapping)
+        self.num_valid_tokens = slot_mapping.shape[0]
         if block_tables.numel() > 0:
             self.max_num_blocks_per_seq = block_tables.shape[1]
         else:

From b91a2fada7090cd6b7cdf9d1f26ef1e32f2737b7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Fri, 3 Mar 2023 04:19:52 +0000
Subject: [PATCH 3/4] Minor

---
 cacheflow/models/attention.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cacheflow/models/attention.py b/cacheflow/models/attention.py
index 8b2fb3f85b2a..7c77db5a819b 100644
--- a/cacheflow/models/attention.py
+++ b/cacheflow/models/attention.py
@@ -113,7 +113,8 @@ def forward(
                 query[:num_prompt_tokens],
                 key[:num_prompt_tokens],
                 value[:num_prompt_tokens],
-                input_metadata.prompt_lens)
+                input_metadata.prompt_lens,
+            )
 
         # Wait until the cache op is done.
         if cache_event is not None:

From 4db2916e1889b1b3f5b582ce716eca7533dd17f7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 6 Mar 2023 18:05:05 +0000
Subject: [PATCH 4/4] Test iteration-level scheduling

---
 server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server.py b/server.py
index d70dab01abd4..04e2f6d72669 100644
--- a/server.py
+++ b/server.py
@@ -57,11 +57,11 @@ def main():
         'UC Berkeley is',
         'The future of cloud computing is',
     ]
-    for prompt in test_inputs:
-        frontend.query(prompt)
 
     # FIXME
     while True:
+        if test_inputs:
+            frontend.query(test_inputs.pop())
         scheduler.step()
         if not scheduler.pending and not scheduler.running:
             break
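
Background on PATCH 1/4: output, query, key, and value appear to be flat token batches of shape [num_tokens, num_heads, head_size], with the step's prompt tokens laid out before its generation tokens, so the flash-attention result for the prompts must land in exactly the first num_prompt_tokens rows. The fix moves the slicing from inside multi_query_kv_attention to the caller; because slicing a tensor yields a view, copy_ still writes straight into the shared buffer. A minimal standalone sketch of that idea, with made-up shapes and a random stand-in for the flash-attn result (the real call is omitted):

    import torch

    # Hypothetical sizes: 2 prompts of lengths 3 and 4, plus 5 generation tokens.
    prompt_lens = [3, 4]
    num_prompt_tokens = sum(prompt_lens)  # 7
    num_heads, head_size = 8, 64

    # One flat buffer for every token in the step, prompt tokens first.
    output = torch.empty(num_prompt_tokens + 5, num_heads, head_size)

    # Stand-in for the flash-attn output over the prompt tokens only.
    out = torch.randn(num_prompt_tokens, num_heads, head_size)

    # Slicing returns a view, so copy_ fills the first rows of the shared buffer.
    output[:num_prompt_tokens].copy_(out, non_blocking=True)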
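
Background on PATCH 2/4: for a 1-D tensor, len(t) and t.shape[0] both return the size of the first dimension, so the change is behavior-preserving; shape[0] just makes it explicit that slot_mapping is a tensor rather than a Python list, matching the context_lens.shape[0] line above it. A quick standalone check (not cacheflow code):

    import torch

    slot_mapping = torch.tensor([0, 1, 5, 6])
    # Both expressions read the size of the first (and only) dimension.
    assert len(slot_mapping) == slot_mapping.shape[0] == 4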
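
Background on PATCH 4/4: instead of enqueuing every test prompt before the loop, the server now feeds at most one prompt per scheduler step, so new requests join between decoding iterations, which is what iteration-level scheduling is meant to exercise. Note that list.pop() takes from the tail, so prompts arrive in reverse order; pop(0) would preserve order if the test cared. A toy sketch of the loop shape, using a stub scheduler in place of the real cacheflow Frontend/Scheduler (whose APIs are not reproduced here):

    # Hypothetical stand-in: each request "decodes" for 3 steps, and a new
    # request may be admitted at every iteration of the serving loop.
    class ToyScheduler:
        def __init__(self):
            self.pending, self.running = [], []

        def add(self, prompt):
            self.pending.append(prompt)

        def step(self):
            if self.pending:
                self.running.append([self.pending.pop(0), 0])
            for seq in list(self.running):
                seq[1] += 1          # decode one token
                if seq[1] == 3:      # pretend the request finishes after 3 steps
                    self.running.remove(seq)

    test_inputs = ['UC Berkeley is', 'The future of cloud computing is']
    scheduler = ToyScheduler()
    while True:
        if test_inputs:
            scheduler.add(test_inputs.pop())  # pop() feeds prompts in reverse order
        scheduler.step()
        if not scheduler.pending and not scheduler.running:
            break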