From 6a9eee332c6cd27153e30d315b90231cec7827ec Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 16 May 2025 22:36:08 +0000 Subject: [PATCH 01/28] [Add] cpu kv sender interfaces Signed-off-by: ApostaC --- .../kv_connector/v1/cpu_connector.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py new file mode 100644 index 000000000000..d73426f08361 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -0,0 +1,257 @@ +# SPDX-License-Identifier: Apache-2.0 +import contextlib +import math +import threading +import time +import uuid +from abc import ABC, abstractmethod +from collections import defaultdict +from collections.abc import Iterator +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +import msgspec +import torch +import zmq + +from vllm import envs +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, + get_tp_group) +from vllm.logger import init_logger +from vllm.utils import make_zmq_path, make_zmq_socket, round_down +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.request import RequestStatus + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.request import Request + +logger = init_logger(__name__) + +@dataclass +class DestinationSpec: + """DestinationSpec is used to specify the destination of kv sending task. + + Attributes: + rank (int): The rank of the destination. + host (str): The path of the destination. + base_port (int): The base port of the destination. + """ + rank: int + host: str + base_port: int + + def __str__(self) -> str: + return f"DestinationSpec(rank={self.rank}, host={self.host}, base_port={self.base_port})" + +@dataclass +class SourceSpec: + """SourceSpec is used to specify the source of kv sending task. + """ + # The request id of the kv cache + request_id: str + + # The layer id of the kv cache + layer_id: int + + # The range of tokens to be offloaded + token_range: slice + + # The shape of the offloaded KV cache tensor + shape: torch.Size + + def __str__(self) -> str: + return f"SourceSpec(request_id={self.request_id}, layer_id={self.layer_id}, token_range={self.token_range}, shape={self.shape})" + +@dataclass +class SendTaskState: + """SendTaskState is used to track the state of a send task. + """ + sender_ready: bool = False + receiver_ready: bool = False + is_sending: bool = False + send_done: bool = False + + def __str__(self) -> str: + return (f"SendTaskState(sender_ready={self.sender_ready}, " + f"receiver_ready={self.receiver_ready}, " + f"is_sending={self.is_sending}, " + f"send_done={self.send_done})") + + def is_ready(self) -> bool: + """Check if the send task is ready to be sent. + + Returns: + bool: True if the send task is ready, False otherwise. + """ + return self.sender_ready and self.receiver_ready + + def is_sending(self) -> bool: + """Check if the send task is currently sending. + + Returns: + bool: True if the send task is sending, False otherwise. 
+ """ + return self.is_sending + + def is_done(self) -> bool: + """Check if the send task is done. + + Returns: + bool: True if the send task is done, False otherwise. + """ + return self.send_done + +@dataclass +class SendTask: + """Wraps a KV Cache sending task + """ + + # A flat buffer holding the tensor data + buffer: torch.Tensor + source_spec: SourceSpec + destination_spec: DestinationSpec + state: SendTaskState + + @property + def tensor(self) -> torch.Tensor: + """Get the tensor of the send task. + + Returns: + torch.Tensor: The tensor of the send task. + """ + num_elements = self.source_spec.shape.numel() + return self.buffer[:num_elements].view(self.source_spec.shape) + + def update_states(self) -> None: + """Update the states of the send task. + + This function should be called periodically to ensure that the send + task is being processed. + + This function should be implemented in sub-classes to handle different + types of send tasks. + """ + raise NotImplementedError + + def is_ready(self) -> bool: + """Check if the send task is ready to be sent. + + Returns: + bool: True if the send task is ready, False otherwise. + """ + return self.state.is_ready() + + def is_sending(self) -> bool: + """Check if the send task is currently sending. + + Returns: + bool: True if the send task is sending, False otherwise. + """ + return self.state.is_sending() + + def is_done(self) -> bool: + """Check if the send task is done. + + Returns: + bool: True if the send task is done, False otherwise. + """ + return self.state.is_done() + +class KVSenderInterface(ABC): + """KVSenderInterface is an interface for sending KV cache data. + """ + + def __init__(self) -> None: + self._send_tasks: list[SendTask] = [] + + + def add_send_task(self, task: SendTask) -> None: + """Add a send task to the list of send tasks. + + Args: + task (SendTask): The send task to be added. + """ + self._send_tasks.append(task) + + def progress(self) -> None: + """A fast, non-blocking function to check and update the states of all + send tasks. This function should be called periodically to ensure that + the send tasks are being processed. + """ + # Update before going through all send tasks + self.pre_progress_hook() + + for task in self._send_tasks: + if task.is_ready() and not task.is_sending(): + task.mark_sending() + self._send(task) + + if task.is_done(): + self.free_task(task) + self._send_tasks.remove(task) + + # Update after going through all send tasks + self.post_progress_hook() + + ###################################################### + # Abstract methods (to be implemented by subclasses) # + ###################################################### + + @abstractmethod + def create_send_task( + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: + """Create a non-ready send task with a CPU buffer allocated. + + Args: + source_spec (SourceSpec): The source specification of the send + task. + destination_spec (DestinationSpec): The destination + specification of the send task. + """ + raise NotImplementedError("create_send_task() not implemented") + + @abstractmethod + def free_task(self, task: SendTask) -> None: + """Free the send task. + + Args: + task (SendTask): The send task to be freed. + """ + raise NotImplementedError("free_task() not implemented") + + @abstractmethod + def send_task(self, task: SendTask) -> None: + """Send the send task after it is ready. + + Args: + task (SendTask): The send task to be sent. 
+ """ + raise NotImplementedError("send_task() not implemented") + + @abstractmethod + def pre_progress_hook(self, task: SendTask) -> None: + """Hook to be called before processing the send task. + + Args: + task (SendTask): The send task to be processed. + """ + raise NotImplementedError("pre_progress_hook() not implemented") + + @abstractmethod + def post_progress_hook(self, task: SendTask) -> None: + """Hook to be called after processing the send task. + + Args: + task (SendTask): The send task to be processed. + """ + raise NotImplementedError("post_progress_hook() not implemented") From ad7d8520b8f49095e08a96b931a8ef670e367f1f Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 19 May 2025 05:26:29 +0000 Subject: [PATCH 02/28] [WIP] Rewrite the whole implementation for layer-wise pipeline Signed-off-by: ApostaC --- .../kv_transfer/kv_connector/factory.py | 5 + .../kv_connector/v1/cpu_connector.py | 788 +++++++++++++++++- .../kv_connector/v1/request_tracker.py | 221 +++++ 3 files changed, 994 insertions(+), 20 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index f998f5dd7b15..9b77df1a9abb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -115,3 +115,8 @@ def create_connector_v1( "MultiConnector", "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector", "MultiConnector") + +KVConnectorFactory.register_connector( + "CPUConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector", + "CPUConnector") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index d73426f08361..f1e3f1d452b9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -5,10 +5,10 @@ import time import uuid from abc import ABC, abstractmethod -from collections import defaultdict +from collections import defaultdict, OrderedDict from collections.abc import Iterator from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any, Optional, Tuple import msgspec import torch @@ -22,18 +22,67 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) from vllm.logger import init_logger -from vllm.utils import make_zmq_path, make_zmq_socket, round_down +from vllm.utils import make_zmq_path, make_zmq_socket, round_down, cdiv from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus +from vllm import _custom_ops as ops + +from lmcache.utils import _lmcache_nvtx_annotate if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.core.sched.output import CachedRequestData, NewRequestData from vllm.v1.request import Request logger = init_logger(__name__) +def d2h_page_copy( + src_layer: torch.Tensor, + dst_buffer: torch.Tensor, + block_ids: list[int] + ) -> None: + """Copy data from device to host. + + Args: + src_layer (torch.Tensor): The source layer on device, shape is + (2, num_vllm_blocks, page_size, ...remaining dims...) 
+ dst_buffer (torch.Tensor): The destination buffer on host, shape is + (2, len(block_ids), page_size, ...remaining dims...) + """ + # debug copy: + block_mapping = torch.stack([torch.tensor(block_ids, dtype = torch.long), + torch.arange(len(block_ids), dtype = torch.long)], dim = 1) + ops.swap_blocks(src_layer[0], dst_buffer[0], block_mapping) + ops.swap_blocks(src_layer[1], dst_buffer[1], block_mapping) + #for dst_idx, block_id in enumerate(block_ids): + # src_k, src_v = src_layer[:, block_id, :, :] + # dst_k, dst_v = dst_buffer[:, dst_idx, :, :] + # # Copy the data from device to host + # dst_k.copy_(src_k, non_blocking=True) + # dst_v.copy_(src_v, non_blocking=True) + +def h2d_page_copy( + src_buffer: torch.Tensor, + dst_layer: torch.Tensor, + block_ids: list[int] + ) -> None: + """Copy data from host to device. + + Args: + src_buffer (torch.Tensor): The source buffer on host, shape is + (2, len(block_ids), page_size, ...remaining dims...) + dst_layer (torch.Tensor): The destination layer on device, shape is + (2, num_vllm_pages, page_size, ...remaining dims...) + """ + for src_idx, block_id in enumerate(block_ids): + dst_k, dst_v = dst_layer[:, block_id, :, :] + src_k, src_v = src_buffer[:, src_idx, :, :] + # Copy the data from host to device + dst_k.copy_(src_k, non_blocking=True) + dst_v.copy_(src_v, non_blocking=True) + @dataclass class DestinationSpec: """DestinationSpec is used to specify the destination of kv sending task. @@ -66,8 +115,18 @@ class SourceSpec: # The shape of the offloaded KV cache tensor shape: torch.Size + # The dtype of the offloaded KV cache tensor + dtype: torch.dtype + + def get_size(self) -> int: + """Get the size in bytes of the cooresponding kv cache. + """ + return self.shape.numel() * self.dtype.itemsize + def __str__(self) -> str: - return f"SourceSpec(request_id={self.request_id}, layer_id={self.layer_id}, token_range={self.token_range}, shape={self.shape})" + return f"SourceSpec(request_id={self.request_id}, " + \ + f"layer_id={self.layer_id}, " + \ + f"token_range={self.token_range}, shape={self.shape})" @dataclass class SendTaskState: @@ -92,14 +151,6 @@ def is_ready(self) -> bool: """ return self.sender_ready and self.receiver_ready - def is_sending(self) -> bool: - """Check if the send task is currently sending. - - Returns: - bool: True if the send task is sending, False otherwise. - """ - return self.is_sending - def is_done(self) -> bool: """Check if the send task is done. @@ -127,16 +178,16 @@ def tensor(self) -> torch.Tensor: torch.Tensor: The tensor of the send task. """ num_elements = self.source_spec.shape.numel() - return self.buffer[:num_elements].view(self.source_spec.shape) + return self.buffer.view( + self.source_spec.dtype)[:num_elements].view( + self.source_spec.shape) def update_states(self) -> None: - """Update the states of the send task. + """Update the states of the send task. This needs to be OVERWRITTEN in + subclasses to handle different types of send tasks. This function should be called periodically to ensure that the send task is being processed. - - This function should be implemented in sub-classes to handle different - types of send tasks. """ raise NotImplementedError @@ -154,7 +205,7 @@ def is_sending(self) -> bool: Returns: bool: True if the send task is sending, False otherwise. """ - return self.state.is_sending() + return self.state.is_sending def is_done(self) -> bool: """Check if the send task is done. 
@@ -180,6 +231,15 @@ def add_send_task(self, task: SendTask) -> None: """ self._send_tasks.append(task) + def get_send_tasks(self) -> list[SendTask]: + """Get the list of send tasks. + + Returns: + list[SendTask]: The list of send tasks. + """ + return self._send_tasks + + @_lmcache_nvtx_annotate def progress(self) -> None: """A fast, non-blocking function to check and update the states of all send tasks. This function should be called periodically to ensure that @@ -188,14 +248,22 @@ def progress(self) -> None: # Update before going through all send tasks self.pre_progress_hook() + new_task_list = [] + for task in self._send_tasks: + should_add = True + if task.is_ready() and not task.is_sending(): - task.mark_sending() self._send(task) if task.is_done(): self.free_task(task) - self._send_tasks.remove(task) + should_add = False + + if should_add: + new_task_list.append(task) + + self._send_tasks = new_task_list # Update after going through all send tasks self.post_progress_hook() @@ -255,3 +323,683 @@ def post_progress_hook(self, task: SendTask) -> None: task (SendTask): The send task to be processed. """ raise NotImplementedError("post_progress_hook() not implemented") + + +# DEBUG IMPLEMENTATION: NO REAL SEND BUT HAVE MEMORY MANAGEMENT AND D2H COPY +class RingBufferAllocator: + """RingBufferAllocator is a simple ring buffer allocator for managing + memory allocation and deallocation. + """ + + def __init__(self, size: int, align_to: int = 256) -> None: + """Initialize the ring buffer allocator with the given size. + + Args: + size (int): The size of the ring buffer (in bytes). + align_to (int): The alignment size (in bytes). Default is 8. + """ + self._size = size + self._buffer = torch.empty(size, dtype=torch.uint8) + self._high_watermark = 0 + self._low_watermark = 0 + self._align_to = align_to + + self._allocated = OrderedDict() # Track allocated buffers + + # Register pin memory + cudart = torch.cuda.cudart() + cudart.cudaHostRegister(self._buffer.data_ptr(), size, 0) + + def _align_size(self, base: int) -> int: + """Align the given size to the nearest multiple of the alignment size. + + Args: + base (int): The size to be aligned. + + Returns: + int: The aligned size. + """ + return ((base - 1) // self._align_to + 1) * self._align_to + + def allocate(self, size: int) -> Tuple[int, Optional[torch.Tensor]]: + """Allocate a buffer of the given size. + + Args: + size (int): The size of the buffer to be allocated. + + Returns: + Optional[Tuple[int, torch.Tensor]]: A tuple containing the address + of the allocated buffer and the buffer itself. If allocation + fails, returns None. 
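+
+        Example (an illustrative sketch only; note that, despite the
+        wording above, this implementation signals failure by returning
+        (-1, None) rather than a bare None):
+
+            allocator = RingBufferAllocator(size=1024, align_to=256)
+            addr, buf = allocator.allocate(100)
+            # buf is a 100-byte uint8 view; the high watermark advances
+            # by the aligned size (256 here).
+            assert addr >= 0 and buf is not None
+            allocator.free(addr)  # free by the returned address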
+ """ + # During allocation, we always make sure that high watermark and + # low watermark are aligned to the alignment size + aligned_size = self._align_size(size) # Align the requested size + turnaround_size = (self._high_watermark // self._size + 1) * self._size + + local_high = self._high_watermark % self._size + local_low = self._low_watermark % self._size + + if local_high >= local_low: + if local_high == local_low and \ + self._high_watermark > self._low_watermark: + # No space available + return -1, None + + # If high watermark + requested size is okay, directly allocate + if local_high + size < self._size: + address = self._high_watermark + self._allocated[address] = aligned_size + start = local_high + end = start + size + self._high_watermark += aligned_size + return address, self._buffer[start:end] + else: + # If high watermark + requested size is not okay, we need to + # wrap around and allocate again + self._high_watermark = turnaround_size + return self.allocate(size) + else: + # High watermark is below low watermark, check if we can allocate + if local_high + size < local_low: + address = self._high_watermark + self._allocated[address] = aligned_size + start = local_high + end = start + size + self._high_watermark += aligned_size + return address, self._buffer[start:end] + else: + # No space available + return -1, None + + def free(self, address: int) -> None: + """Free the buffer at the given address. + + Args: + address (int): The address of the buffer to be freed, which + is returned by the allocate() method. + """ + assert address in self._allocated, \ + "Address not found in allocated buffers" + + # Pop the address from the allocated dict, and update the + # low watermark + self._allocated.pop(address) + + # If there is nothing allocated, set low_watermark to high watermark + new_low_watermark = self._high_watermark + + # Else, set the low_watermark to the first address in the allocated + # dict + for addr in self._allocated.keys(): + new_low_watermark = addr + break + self._low_watermark = new_low_watermark + + @property + def high_watermark(self) -> int: + return self._high_watermark + + @property + def low_watermark(self) -> int: + return self._low_watermark + + +@dataclass +class CPUSendTask(SendTask): + """CPUSendTask is a send task that uses CPU memory for the buffer. + """ + buffer_addr: int + creation_time: float = 0.0 + cuda_event: Optional[torch.cuda.Event] = None + + dbg_send_time: Optional[float] = None + + def __post_init__(self) -> None: + self.creation_time = time.time() + + @_lmcache_nvtx_annotate + def update_states(self) -> None: + """Update the states of the send task. + """ + # Check the cuda event + if not self.state.sender_ready and self.cuda_event is not None \ + and self.cuda_event.query(): + self.state.sender_ready = True + + curr_time = time.time() + if curr_time - self.creation_time > 0.5: + self.state.receiver_ready = True + + if self.dbg_send_time is not None and \ + curr_time - self.dbg_send_time > 1: + self.state.send_done = True + + def dbg_mark_sending(self) -> None: + """Mark the send task as sending. + """ + self.state.is_sending = True + self.dbg_send_time = time.time() + +class CPUKVSender(KVSenderInterface): + """CPUKVSender is an implementation of KVSenderInterface that provides a + ring buffer allocator for managing pin memory allocation and deallocation. 
+ """ + + def __init__(self, buffer_size: int) -> None: + super().__init__() + self._buffer_size = buffer_size + self._allocator = RingBufferAllocator(self._buffer_size) + + def create_send_task( + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: + """Create a non-ready send task with a CPU buffer allocated. + + Args: + source_spec (SourceSpec): The source specification of the send + task. + destination_spec (DestinationSpec): The destination + specification of the send task. + """ + # Allocate a buffer for the send task + size = source_spec.get_size() + address, buffer = self._allocator.allocate(size) + while address == -1: + # If allocation fails, wait for a while to process + # and try again + time.sleep(0.001) + self.progress() + address, buffer = self._allocator.allocate(size) + assert buffer is not None, "Buffer allocation failed" + + # Create a send task with the allocated buffer + task = CPUSendTask( + buffer=buffer, + source_spec=source_spec, + destination_spec=destination_spec, + state=SendTaskState(), + buffer_addr=address, + ) + self.add_send_task(task) + return task + + def free_task(self, task: SendTask) -> None: + """Free the send task. + + Args: + task (SendTask): The send task to be freed. + """ + # Free the buffer in the ring buffer allocator + self._allocator.free(task.buffer_addr) + + def send_task(self, task: SendTask) -> None: + """Send the send task after it is ready. + + Args: + task (SendTask): The send task to be sent. + """ + # DEBUG IMPLEMENTATION + logger.error("CPUKVSender.send_task() not implemented, running a debug implementation!") + task.dbg_mark_sending() + + def pre_progress_hook(self) -> None: + for task in self.get_send_tasks(): + task.update_states() + + def post_progress_hook(self) -> None: + pass + + def _send(self, task: SendTask) -> None: + # NO IMPLEMENTATION YET + pass + + +##################################################################### +# Connector related code +##################################################################### + +@dataclass +class PrefillRequestTracker: + """RequestTracker is used to track the state of a request. + + Attributes: + req_id (str): The id of the request. + num_saved_tokens (int): The number of tokens saved. + num_loaded_tokens (int): The number of tokens loaded. + num_computed_tokens (int): The number of tokens computed. + allocated_block_ids (list[int]): The list of allocated block ids. + """ + # Request id + req_id: str + + # Total number of tokens that are in this request + num_total_tokens: int = 0 + + # Number of tokens that are already saved + num_saved_tokens: int = 0 + + # Block ids that are already allocated for this request + allocated_block_ids: list[int] = None + + @staticmethod + def from_new_request( + new_request: "NewRequestData", + num_tokens_to_compute: int, + ) -> "PrefillRequestTracker": + """Create the request tracker from a new request. + + Args: + new_request (NewRequestData): the new request data. + num_tokens_to_compute (int): the number of tokens that will + be 'computed', including the `num_computed_tokens` (vLLM's + local cache hit) and new tokens that will be scheduled. 
+ """ + unfolded_block_ids = [] + for block_ids in new_request.block_ids: + unfolded_block_ids.extend(block_ids) + + return PrefillRequestTracker( + req_id=new_request.req_id, + num_total_tokens = num_tokens_to_compute, + num_saved_tokens=0, + allocated_block_ids=unfolded_block_ids, + ) + + def update(self, cached_request: "CachedRequestData") -> None: + """Update the request tracker with the cached request data. + + Args: + cached_request (CachedRequestData): the cached request data. + """ + new_block_ids = [] + for nb in cached_request.new_block_ids: + new_block_ids.extend(nb) + self.allocated_block_ids.extend(new_block_ids) + self.num_total_tokens += len(cached_request.new_token_ids) + + def update_num_saved_tokens(self, num_saved_tokens: int) -> None: + """Update the number of saved tokens. + + Args: + num_saved_tokens (int): the number of saved tokens. + """ + self.num_saved_tokens = num_saved_tokens + +@dataclass +class PrefillReqMeta: + # Request id + req_id: str + # Blocks to save + blocks_to_save: list[int] + # The range of tokens to save + token_range: slice + # Skip first N tokens + skip_leading_tokens: int + # Skip last N tokens + skip_trailing_tokens: int + + @staticmethod + def from_request_tracker( + request_tracker: PrefillRequestTracker, + block_size: int, + ) -> "PrefillReqMeta": + """Create the request meta from the request tracker. Determine which + blocks to save and the number of leading/trailing tokens to skip for + the worker connector. + It also updates the request tracker's num_saved_tokens. + + Args: + request_tracker (PrefillRequestTracker): the request tracker. + block_size (int): the block size in vLLM. + + Returns: + PrefillReqMeta: the request meta. + """ + assert request_tracker.num_total_tokens <= \ + len(request_tracker.allocated_block_ids) * block_size, \ + f"Request {req_id} has more tokens than allocated blocks" + + token_range = slice(request_tracker.num_saved_tokens, + request_tracker.num_total_tokens) + + num_saved_full_blocks = request_tracker.num_saved_tokens // block_size + num_active_blocks = cdiv(request_tracker.num_total_tokens, block_size) + + blocks_to_save = request_tracker.allocated_block_ids[\ + num_saved_full_blocks:num_active_blocks] + skip_leading_tokens = request_tracker.num_saved_tokens % block_size + skip_trailing_tokens = num_active_blocks * block_size - \ + request_tracker.num_total_tokens + logger.debug( + "Request %s: num_saved_full_blocks=%d, num_active_blocks=%d, " + "blocks_to_save=%s, skip_leading_tokens=%d, " + "skip_trailing_tokens=%d", + request_tracker.req_id, + num_saved_full_blocks, num_active_blocks, + blocks_to_save, skip_leading_tokens, skip_trailing_tokens) + + # Update the request tracker with the number of saved tokens + request_tracker.update_num_saved_tokens( + request_tracker.num_total_tokens) + return PrefillReqMeta( + req_id=request_tracker.req_id, + blocks_to_save=blocks_to_save, + token_range=token_range, + skip_leading_tokens=skip_leading_tokens, + skip_trailing_tokens=skip_trailing_tokens, + ) + + +@dataclass +class DecodeReqMeta: + pass + +@dataclass +class CPUConnectorMetadata(KVConnectorMetadata): + prefill_meta: list[PrefillReqMeta] + decode_meta: list[DecodeReqMeta] + + def __init__(self) -> None: + super().__init__() + self.prefill_meta = [] + self.decode_meta = [] + + def add_prefill(self, prefill_meta: PrefillReqMeta) -> None: + """Add a prefill request metadata to the metadata. + + Args: + prefill_meta (PrefillReqMeta): The prefill request metadata to be + added. 
+ """ + self.prefill_meta.append(prefill_meta) + + def add_decode(self, decode_meta: DecodeReqMeta) -> None: + """Add a decode request metadata to the metadata. + + Args: + decode_meta (DecodeReqMeta): The decode request metadata to be + added. + """ + self.decode_meta.append(decode_meta) + + +class CPUConnector(KVConnectorBase_V1): + """CPUKVConnector is an implementation of KVConnectorBase_V1 that + provides a CPU-based KV cache sending mechanism. + """ + + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: + super().__init__(vllm_config, role) + + self.kv_role = vllm_config.kv_transfer_config.kv_role + + self._block_size = vllm_config.cache_config.block_size + + if role == KVConnectorRole.SCHEDULER: + pass + elif role == KVConnectorRole.WORKER: + # Prefiller side sender + self._cpu_kv_sender = CPUKVSender(1024 * 1024 * 1024) # 1GB for debug + + # request_id -> prefill request trackers + self._prefill_reqs: dict[str, PrefillRequestTracker] = {} + + # gpu kv caches + self._gpu_kv_caches: dict[str, torch.Tensor] = {} + self._layer_name_to_id: dict[str, int] = {} + self._layer_id_to_name: dict[int, str] = {} + self._kv_page_shape: torch.Size = torch.Size([0]) + + # separate cuda streams + self._cuda_stream = torch.cuda.Stream() + + # prefill offload tasks + self._inflight_copy_tasks: list[CPUSendTask] = [] + + + ############################################################ + # Scheduler Side Methods + ############################################################ + def _build_prefiller_meta( + self, + scheduler_output: SchedulerOutput, + output_meta: CPUConnectorMetadata) -> None: + """Build the prefill request metadata from the scheduler output. + + Args: + scheduler_output (SchedulerOutput): The scheduler output. + output_meta (CPUConnectorMetadata): The output metadata. + """ + for finished_req_id in scheduler_output.finished_req_ids: + self._prefill_reqs.pop(finished_req_id, None) + + for request in scheduler_output.scheduled_new_reqs: + num_tokens_to_compute = request.num_computed_tokens + \ + scheduler_output.num_scheduled_tokens[request.req_id] + request_tracker = PrefillRequestTracker.from_new_request( + request, num_tokens_to_compute) + self._prefill_reqs[request.req_id] = request_tracker + + req_meta = PrefillReqMeta.from_request_tracker( + request_tracker, + self._block_size) + output_meta.add_prefill(req_meta) + + for request in scheduler_output.scheduled_cached_reqs: + request_tracker = self._prefill_reqs[request.req_id] + request_tracker.update(request) + + req_meta = PrefillReqMeta.from_request_tracker( + request_tracker, + self._block_size) + output_meta.add_prefill(req_meta) + + def build_decode_meta( + self, + scheduler_output: SchedulerOutput, + output_meta: CPUConnectorMetadata) -> None: + """Build the decode request metadata from the scheduler output. + + Args: + scheduler_output (SchedulerOutput): The scheduler output. + output_meta (CPUConnectorMetadata): The output metadata. 
+ """ + logger.error("build_decode_meta() not implemented, running a debug implementation!") + pass + + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + return 0, False + + def update_state_after_alloc( + self, + request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int) -> None: + print("In update_state_after_alloc") + pass + + def build_connector_meta( + self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: + meta = CPUConnectorMetadata() + + if self.kv_role == "kv_producer": + self._build_prefiller_meta(scheduler_output, meta) + elif self.kv_role == "kv_consumer": + self.build_decode_meta(scheduler_output, meta) + else: + raise ValueError(f"Unknown kv_role: {self.kv_role}") + + return meta + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + print("In request_finished") + return False, None + + ############################################################# + # Worker Side Methods + ############################################################# + def _get_layer_id(self, layer_name: str) -> int: + assert layer_name in self._layer_name_to_id, \ + f"Layer {layer_name} not found in layer name to id map" + return self._layer_name_to_id[layer_name] + + def _get_layer_name(self, layer_id: int) -> str: + assert layer_id in self._layer_id_to_name, \ + f"Layer id {layer_id} not found in layer id to name map" + return self._layer_id_to_name[layer_id] + + def _get_kv_shape(self, num_blocks: int) -> torch.Size: + return torch.Size((2, num_blocks, ) + self._kv_page_shape) + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + self._gpu_kv_caches = kv_caches + idx = 0 + for layer_name in kv_caches.keys(): + self._layer_name_to_id[layer_name] = idx + self._layer_id_to_name[idx] = layer_name + idx += 1 + + self._kv_page_shape = kv_caches[list(kv_caches.keys())[0]].shape[2:] + + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + """ + Start loading the KV cache from the connector to vLLM's paged + KV buffer. This is called from the forward context before the + forward pass to enable async loading during model execution. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. + + """ + pass + + + def wait_for_layer_load(self, layer_name: str) -> None: + """ + Block until the KV for a specific layer is loaded into vLLM's + paged buffer. This is called from within attention layer to ensure + async copying from start_load_kv is complete. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + pass + + @_lmcache_nvtx_annotate + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + """ + Start saving a layer of KV cache from vLLM's paged buffer + to the connector. This is called from within attention layer to + enable async copying during execution. + + Args: + layer_name (str): the name of the layer. + kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + **kwargs: additional arguments for the save operation. 
+ """ + meta = self._get_connector_metadata() + assert isinstance(meta, CPUConnectorMetadata), \ + "Connector metadata is not of type CPUConnectorMetadata" + + assert self._cpu_kv_sender is not None + + for prefill_req in meta.prefill_meta: + # TODO: add skip leading/trailing tokens into source_spec + # or maybe recompute it at the receiver side based on the + # token_range + source_spec = SourceSpec( + request_id = prefill_req.req_id, + layer_id = self._get_layer_id(layer_name), + token_range = prefill_req.token_range, + shape = self._get_kv_shape( + len(prefill_req.blocks_to_save)), + dtype = kv_layer.dtype + ) + + # Create a destination spec + # TODO: remove the hard-code here + dest_spec = DestinationSpec( + rank = get_tensor_model_parallel_rank(), + host = "localhost", + base_port = "54321", + ) + + # Create the send task + task = self._cpu_kv_sender.create_send_task( + source_spec=source_spec, + destination_spec=dest_spec, + ) + assert isinstance(task, CPUSendTask), \ + "Send task is not of type CPUSendTask" + + # Start copying the data to the CPU buffer + buffer = task.tensor + self._cuda_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self._cuda_stream): + # Copy the data from the GPU to the CPU buffer page by page + d2h_page_copy( + src_layer=kv_layer, + dst_buffer=buffer, + block_ids=prefill_req.blocks_to_save + ) + + # record the cuda stream + task.cuda_event = torch.cuda.Event() + task.cuda_event.record(self._cuda_stream) + + self._inflight_copy_tasks.append(task) + + # Check the task states and send the tasks + self._cpu_kv_sender.progress() + + + @_lmcache_nvtx_annotate + def wait_for_save(self): + """ + Block until all the save operations is done. This is called + as the forward context exits to ensure that the async saving + from save_kv_layer is complete before finishing the forward. + + This prevents overwrites of paged KV buffer before saving done. + """ + for task in self._inflight_copy_tasks: + if task.cuda_event is not None: + task.cuda_event.synchronize() + self._inflight_copy_tasks.clear() + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + + Returns: + ids of requests that have finished asynchronous transfer, + tuple of (sending/saving ids, recving/loading ids). + The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). 
+ """ + return None, None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py b/vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py new file mode 100644 index 000000000000..66f87100ce83 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py @@ -0,0 +1,221 @@ +# SPDX-License-Identifier: Apache-2.0 +# Adpoted from LMCache https://github.com/LMCache/LMCache + +import threading +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +import torch +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.logger import init_logger +from vllm.utils import cdiv, make_zmq_socket +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheManager + from vllm.v1.core.sched.output import CachedRequestData, NewRequestData + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +@dataclass +class LoadSpec: + # Number of tokens cached in vLLM + vllm_cached_tokens: int + # Number of tokens that are cached in LMCache + external_cached_tokens: int + # Whether the scheduler allow us to load the tokens + can_load: bool + + +@dataclass +class SaveSpec: + # Skip already saved tokens + skip_leading_tokens: int + # Whether the scheduler allow us to save the tokens + can_save: bool + + +@dataclass +class RequestTracker: + # Request id + req_id: str + + # The token ids that has been scheduled so far + token_ids: list[int] + + # The block ids that has been allocated so far + # NOTE: allocated blocks could be more than the number of tokens + # FIXME: need to check whether the block ids will be changed after + # preemption + allocated_block_ids: list[int] + + # The number of tokens that has been savd + num_saved_tokens: int = 0 + + @staticmethod + def from_new_request( + new_request: "NewRequestData", + num_tokens_to_compute: int, + ) -> "RequestTracker": + """Create the request tracker from a new request. + + Args: + new_request (NewRequestData): the new request data. + num_tokens_to_compute (int): the number of tokens that will + be 'computed', including the `num_computed_tokens` (vLLM's + local cache hit) and new tokens that will be scheduled. + + """ + # vLLM 0.9.0 update: request.block_ids changed from list[int] to + # list[list[int]] + # Need to check the type of request.block_ids + + unfolded_block_ids = [] + + if not isinstance(new_request.block_ids[0], list): + unfolded_block_ids = new_request.block_ids.copy() + else: + # According to the vLLM code + # (https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/ + # sched/scheduler.py#L943), + # only one KVCacheGroup is supported in connector for now. + + # TODO: Please support multiple KVCacheGroup in connector. + # NOTE: Also, `update` method in RequestTracker should be + # updated accordingly. + unfolded_block_ids = new_request.block_ids[0].copy() + + return RequestTracker( + req_id=new_request.req_id, + token_ids=new_request.prompt_token_ids[:num_tokens_to_compute]. 
+ copy(), + allocated_block_ids=unfolded_block_ids, + num_saved_tokens=0, + ) + + def update( + self, + cached_request: "CachedRequestData", + ) -> None: + """Update the request tracker when a running request is + scheduled again + """ + self.token_ids.extend(cached_request.new_token_ids) + new_block_ids: list[int] + if not isinstance(cached_request.new_block_ids[0], list): + new_block_ids = cached_request.new_block_ids + else: + new_block_ids = cached_request.new_block_ids[0] + self.allocated_block_ids.extend(new_block_ids) + + +@dataclass +class ReqMeta: + # Request id + req_id: str + # Request tokens + token_ids: torch.Tensor + # Block ids + block_ids: torch.Tensor + # Slot mapping + slot_mapping: torch.Tensor + # Skip save or not + save_spec: Optional[SaveSpec] = None + # load_spec + load_spec: Optional[LoadSpec] = None + + @staticmethod + def from_request_tracker( + tracker: RequestTracker, + block_size: int, + load_spec: Optional[LoadSpec] = None, + skip_save: bool = False, + ) -> Optional["ReqMeta"]: + """Create the request metadata from a request tracker. + + Args: + tracker (RequestTracker): the request tracker. + block_size (int): the block size in vLLM. + load_spec (Optional[LoadSpec]): the load spec for KV cache loading. + skip_save (bool): whether to skip the save operation. + + Returns: + the request metadata if we need to perform load/save + operations, None otherwise. + """ + input_token_ids = tracker.token_ids + input_token_len = len(input_token_ids) + + if skip_save and load_spec is None: + return None + + num_tokens_to_save = input_token_len + skip_leading_tokens = tracker.num_saved_tokens + + # If we need to save, update the number of saved tokens + if not skip_save: + tracker.num_saved_tokens = num_tokens_to_save + save_spec = SaveSpec(skip_leading_tokens, not skip_save) + + # Calculate the token ids and slot mappings for load and save + # OPTIMIZATION: pre-allocate the buffer for token ids and block + # ids + token_ids = torch.tensor(input_token_ids)[:num_tokens_to_save] + num_blocks = len(tracker.allocated_block_ids) + block_ids = torch.tensor(tracker.allocated_block_ids, dtype=torch.long) + + if len(token_ids) > num_blocks * block_size: + logger.error( + "The number of tokens is more than the number of blocks." + "Something might be wrong in scheduling logic!") + logger.error("Num tokens: %d, num blocks: %d, block size: %d", + len(token_ids), num_blocks, block_size) + + block_offsets = torch.arange(0, block_size, dtype=torch.long) + slot_mapping = block_offsets.reshape((1, block_size)) + \ + block_ids.reshape((num_blocks, 1)) * block_size + + slot_mapping = slot_mapping.flatten()[:len(token_ids)] + assert slot_mapping.dtype == torch.long # TODO: this could be removed + + # For load operation: check whether the request is scheduled to load + if load_spec is not None and load_spec.can_load: + logger.debug("Scheduled to load %d tokens for request %s", + load_spec.external_cached_tokens, tracker.req_id) + else: + # Do not load if not in `can_load` state + load_spec = None + + return ReqMeta( + req_id=tracker.req_id, + token_ids=token_ids, + slot_mapping=slot_mapping, + save_spec=save_spec, + load_spec=load_spec, + ) + + +@dataclass +class GeneralKVConnectorMetadata(KVConnectorMetadata): + requests: list[ReqMeta] + + def __init__(self): + self.requests = [] + + def add_request(self, req_meta: ReqMeta) -> None: + """Add a request to the metadata. + + Args: + req_meta (ReqMeta): the request metadata. 
+ """ + self.requests.append(req_meta) + + From b1e003ef1d0152b60505ff112262bf35dcfdde23 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 19 May 2025 05:27:27 +0000 Subject: [PATCH 03/28] [Add] adding tests for cpu kv pd Signed-off-by: ApostaC --- .../cpu_kv_integration/__init__.py | 1 + .../cpu_kv_integration/output.txt | 4 + .../cpu_kv_integration/run_nsys.sh | 5 + .../test_ring_buffer_allocator.py | 147 ++++++++++++++++++ .../cpu_kv_integration/toy_example.py | 57 +++++++ 5 files changed, 214 insertions(+) create mode 100644 tests/v1/kv_connector/cpu_kv_integration/__init__.py create mode 100644 tests/v1/kv_connector/cpu_kv_integration/output.txt create mode 100644 tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh create mode 100644 tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py create mode 100644 tests/v1/kv_connector/cpu_kv_integration/toy_example.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/__init__.py b/tests/v1/kv_connector/cpu_kv_integration/__init__.py new file mode 100644 index 000000000000..50135644f7bc --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/__init__.py @@ -0,0 +1 @@ +# Empty init file to mark directory as Python package \ No newline at end of file diff --git a/tests/v1/kv_connector/cpu_kv_integration/output.txt b/tests/v1/kv_connector/cpu_kv_integration/output.txt new file mode 100644 index 000000000000..24b680935413 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/output.txt @@ -0,0 +1,4 @@ +Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi 
Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hello, my name isoplevel +Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey 
Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey The capital of France isoplevel +Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello 
Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello 
Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Your name isoplevel +How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How 
How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How The capital of China isoplevel diff --git a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh new file mode 100644 index 000000000000..57e28a58a577 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh @@ -0,0 +1,5 @@ +CUDA_VISIBLE_DEVICES=7 nsys profile \ + --trace=cuda,nvtx,osrt \ + --output=prefiller \ + --force-overwrite=true \ + python3 toy_example.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py new file mode 100644 index 000000000000..5e67c5972f0a --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector import RingBufferAllocator + +def test_basic_allocation(): + """Test basic allocation and deallocation behavior.""" + # Create a buffer with 1024 bytes, aligned to 256 bytes + allocator = RingBufferAllocator(size=1024, align_to=256) + + # Allocate 100 bytes - should be aligned to 256 + addr1, buffer1 = allocator.allocate(100) + assert addr1 >= 0 # Valid address + assert buffer1 is not None + assert len(buffer1) == 100 + assert allocator.high_watermark == 256 # Aligned to 256 + assert allocator.low_watermark == 0 + + # Allocate another 100 bytes + addr2, buffer2 = allocator.allocate(100) + assert addr2 >= 0 # Valid address + assert buffer2 is not None + assert len(buffer2) == 100 + assert allocator.high_watermark == 512 # Next aligned position + + # Verify buffers don't overlap + assert buffer1.data_ptr() + len(buffer1) <= buffer2.data_ptr() + +def test_alignment(): + """Test that allocations are properly aligned.""" + allocator = RingBufferAllocator(size=1024, align_to=256) + + # Allocate various sizes and verify alignment + sizes = [10, 100, 200, 50] + addresses = [] + buffers = [] + + for size in sizes: + addr, buf = allocator.allocate(size) + assert addr >= 0 # Valid address + assert buf is not None + addresses.append(addr) + buffers.append(buf) + # High watermark should always be aligned to 256 + assert allocator.high_watermark % 256 == 0 + +def test_wraparound(): + """Test buffer wraparound behavior.""" + allocator = RingBufferAllocator(size=1024, align_to=256) + + # Fill most of the buffer + addr1, buffer1 = allocator.allocate(300) # Takes 512 bytes aligned + addr2, buffer2 = allocator.allocate(300) # Takes 512 bytes aligned + assert addr1 >= 0 and addr2 >= 0 # Valid addresses + assert buffer1 is not None and buffer2 is not None + + # This allocation should fail as we don't have enough contiguous space + 
addr3, buffer3 = allocator.allocate(300) + assert addr3 == -1 # Invalid address + assert buffer3 is None + + # Free the first buffer + allocator.free(addr1) # Free first 512 bytes + + # Now we should be able to allocate again by wrapping around + addr4, buffer4 = allocator.allocate(200) + assert addr4 >= 0 # Valid address + assert buffer4 is not None + assert allocator.high_watermark >= allocator._size # Wrapped around + assert allocator.high_watermark % allocator._size < 512 # Using freed space + +def test_fragmentation(): + """Test handling of fragmentation.""" + allocator = RingBufferAllocator(size=1024, align_to=256) + + # Allocate several buffers + addr1, buffer1 = allocator.allocate(100) # 256 bytes aligned + addr2, buffer2 = allocator.allocate(100) # 256 bytes aligned + addr3, buffer3 = allocator.allocate(100) # 256 bytes aligned + assert all(addr >= 0 for addr in [addr1, addr2, addr3]) # Valid addresses + assert all(buf is not None for buf in [buffer1, buffer2, buffer3]) + + # Free buffer2, creating a gap + allocator.free(addr2) # Free middle buffer + + # Try to allocate a buffer larger than the gap + addr4, buffer4 = allocator.allocate(300) + assert addr4 == -1 # Invalid address + assert buffer4 is None # Should fail due to fragmentation + + # Allocate a buffer that fits in the gap + # This should also fail as we don't track gaps in current implementation + addr5, buffer5 = allocator.allocate(100) + assert addr5 == -1 # Invalid address + assert buffer5 is None # Should fail due to fragmentation + + # Free buffer1 + allocator.free(addr1) # Free first buffer + + # Now we should be able to allocate again + addr6, buffer6 = allocator.allocate(100) + assert addr6 >= 0 # Valid address + assert buffer6 is not None + assert allocator.high_watermark >= allocator._size # Wrapped around + assert allocator.high_watermark % allocator._size < 512 # Using freed space + +def test_full_buffer(): + """Test behavior when buffer is completely full.""" + allocator = RingBufferAllocator(size=1024, align_to=256) + + # Fill the entire buffer + addresses = [] + buffers = [] + while True: + addr, buf = allocator.allocate(200) + if addr == -1: # Invalid address indicates allocation failure + break + addresses.append(addr) + buffers.append(buf) + + # Verify we can't allocate more + addr, buf = allocator.allocate(10) + assert addr == -1 + assert buf is None + + # Free everything + for addr in addresses: + allocator.free(addr) + + # Should be able to allocate again + addr, buffer = allocator.allocate(200) + assert addr >= 0 # Valid address + assert buffer is not None + +def test_invalid_free(): + """Test that freeing invalid addresses raises an error.""" + allocator = RingBufferAllocator(size=1024, align_to=256) + + # Allocate a buffer + addr, buffer = allocator.allocate(100) + assert addr >= 0 # Valid address + assert buffer is not None + + # Try to free an invalid address + with pytest.raises(AssertionError): + allocator.free(100) # Invalid address diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py new file mode 100644 index 000000000000..6528187a4df7 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os + +# VLLM_ENABLE_V1_MULTIPROCESSING=0 +# VLLM_WORKER_MULTIPROC_METHOD=spawn +os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +from vllm import LLM, SamplingParams +from vllm.config 
import KVTransferConfig + +context = "Hi " * 1000 +context2 = "Hey " * 1000 +context3 = "Hello " * 1000 +context4 = "How " * 1000 +prompts = [ + context + "Hello, my name is", + context2+ "The capital of France is", + context3 + "Your name is", + context4 + "The capital of China is", +] + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector = "CPUConnector", + kv_role = "kv_producer", + kv_connector_extra_config = {}, + ), + load_format="dummy", + max_model_len=2048, + max_num_batched_tokens=2048, + block_size=64, + ) + +# 1ST generation (prefill instance) +outputs = llm.generate( + prompts, + sampling_params, +) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write new_prompts to output.txt +with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") +print(f"Saved {len(new_prompts)} prompts to output.txt") From ccc8c1dce1a6c1f90196d1a838fd48eaa4c0093a Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 20 May 2025 00:28:38 +0000 Subject: [PATCH 04/28] [Add] adding nixl transfer impl WIP Signed-off-by: ApostaC --- .../kv_connector/v1/cpu_connector.py | 497 +----------- .../kv_connector/v1/cpu_connector_utils.py | 333 ++++++++ .../kv_connector/v1/nixl_cpu_utils.py | 721 ++++++++++++++++++ 3 files changed, 1066 insertions(+), 485 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index f1e3f1d452b9..c9a20a292f11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -18,6 +18,8 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( + CPUSendTask, CPUKVSender, SourceSpec, DestinationSpec) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) @@ -83,479 +85,6 @@ def h2d_page_copy( dst_k.copy_(src_k, non_blocking=True) dst_v.copy_(src_v, non_blocking=True) -@dataclass -class DestinationSpec: - """DestinationSpec is used to specify the destination of kv sending task. - - Attributes: - rank (int): The rank of the destination. - host (str): The path of the destination. - base_port (int): The base port of the destination. - """ - rank: int - host: str - base_port: int - - def __str__(self) -> str: - return f"DestinationSpec(rank={self.rank}, host={self.host}, base_port={self.base_port})" - -@dataclass -class SourceSpec: - """SourceSpec is used to specify the source of kv sending task. 
- """ - # The request id of the kv cache - request_id: str - - # The layer id of the kv cache - layer_id: int - - # The range of tokens to be offloaded - token_range: slice - - # The shape of the offloaded KV cache tensor - shape: torch.Size - - # The dtype of the offloaded KV cache tensor - dtype: torch.dtype - - def get_size(self) -> int: - """Get the size in bytes of the cooresponding kv cache. - """ - return self.shape.numel() * self.dtype.itemsize - - def __str__(self) -> str: - return f"SourceSpec(request_id={self.request_id}, " + \ - f"layer_id={self.layer_id}, " + \ - f"token_range={self.token_range}, shape={self.shape})" - -@dataclass -class SendTaskState: - """SendTaskState is used to track the state of a send task. - """ - sender_ready: bool = False - receiver_ready: bool = False - is_sending: bool = False - send_done: bool = False - - def __str__(self) -> str: - return (f"SendTaskState(sender_ready={self.sender_ready}, " - f"receiver_ready={self.receiver_ready}, " - f"is_sending={self.is_sending}, " - f"send_done={self.send_done})") - - def is_ready(self) -> bool: - """Check if the send task is ready to be sent. - - Returns: - bool: True if the send task is ready, False otherwise. - """ - return self.sender_ready and self.receiver_ready - - def is_done(self) -> bool: - """Check if the send task is done. - - Returns: - bool: True if the send task is done, False otherwise. - """ - return self.send_done - -@dataclass -class SendTask: - """Wraps a KV Cache sending task - """ - - # A flat buffer holding the tensor data - buffer: torch.Tensor - source_spec: SourceSpec - destination_spec: DestinationSpec - state: SendTaskState - - @property - def tensor(self) -> torch.Tensor: - """Get the tensor of the send task. - - Returns: - torch.Tensor: The tensor of the send task. - """ - num_elements = self.source_spec.shape.numel() - return self.buffer.view( - self.source_spec.dtype)[:num_elements].view( - self.source_spec.shape) - - def update_states(self) -> None: - """Update the states of the send task. This needs to be OVERWRITTEN in - subclasses to handle different types of send tasks. - - This function should be called periodically to ensure that the send - task is being processed. - """ - raise NotImplementedError - - def is_ready(self) -> bool: - """Check if the send task is ready to be sent. - - Returns: - bool: True if the send task is ready, False otherwise. - """ - return self.state.is_ready() - - def is_sending(self) -> bool: - """Check if the send task is currently sending. - - Returns: - bool: True if the send task is sending, False otherwise. - """ - return self.state.is_sending - - def is_done(self) -> bool: - """Check if the send task is done. - - Returns: - bool: True if the send task is done, False otherwise. - """ - return self.state.is_done() - -class KVSenderInterface(ABC): - """KVSenderInterface is an interface for sending KV cache data. - """ - - def __init__(self) -> None: - self._send_tasks: list[SendTask] = [] - - - def add_send_task(self, task: SendTask) -> None: - """Add a send task to the list of send tasks. - - Args: - task (SendTask): The send task to be added. - """ - self._send_tasks.append(task) - - def get_send_tasks(self) -> list[SendTask]: - """Get the list of send tasks. - - Returns: - list[SendTask]: The list of send tasks. - """ - return self._send_tasks - - @_lmcache_nvtx_annotate - def progress(self) -> None: - """A fast, non-blocking function to check and update the states of all - send tasks. 
This function should be called periodically to ensure that - the send tasks are being processed. - """ - # Update before going through all send tasks - self.pre_progress_hook() - - new_task_list = [] - - for task in self._send_tasks: - should_add = True - - if task.is_ready() and not task.is_sending(): - self._send(task) - - if task.is_done(): - self.free_task(task) - should_add = False - - if should_add: - new_task_list.append(task) - - self._send_tasks = new_task_list - - # Update after going through all send tasks - self.post_progress_hook() - - ###################################################### - # Abstract methods (to be implemented by subclasses) # - ###################################################### - - @abstractmethod - def create_send_task( - self, - source_spec: SourceSpec, - destination_spec: DestinationSpec, - ) -> SendTask: - """Create a non-ready send task with a CPU buffer allocated. - - Args: - source_spec (SourceSpec): The source specification of the send - task. - destination_spec (DestinationSpec): The destination - specification of the send task. - """ - raise NotImplementedError("create_send_task() not implemented") - - @abstractmethod - def free_task(self, task: SendTask) -> None: - """Free the send task. - - Args: - task (SendTask): The send task to be freed. - """ - raise NotImplementedError("free_task() not implemented") - - @abstractmethod - def send_task(self, task: SendTask) -> None: - """Send the send task after it is ready. - - Args: - task (SendTask): The send task to be sent. - """ - raise NotImplementedError("send_task() not implemented") - - @abstractmethod - def pre_progress_hook(self, task: SendTask) -> None: - """Hook to be called before processing the send task. - - Args: - task (SendTask): The send task to be processed. - """ - raise NotImplementedError("pre_progress_hook() not implemented") - - @abstractmethod - def post_progress_hook(self, task: SendTask) -> None: - """Hook to be called after processing the send task. - - Args: - task (SendTask): The send task to be processed. - """ - raise NotImplementedError("post_progress_hook() not implemented") - - -# DEBUG IMPLEMENTATION: NO REAL SEND BUT HAVE MEMORY MANAGEMENT AND D2H COPY -class RingBufferAllocator: - """RingBufferAllocator is a simple ring buffer allocator for managing - memory allocation and deallocation. - """ - - def __init__(self, size: int, align_to: int = 256) -> None: - """Initialize the ring buffer allocator with the given size. - - Args: - size (int): The size of the ring buffer (in bytes). - align_to (int): The alignment size (in bytes). Default is 8. - """ - self._size = size - self._buffer = torch.empty(size, dtype=torch.uint8) - self._high_watermark = 0 - self._low_watermark = 0 - self._align_to = align_to - - self._allocated = OrderedDict() # Track allocated buffers - - # Register pin memory - cudart = torch.cuda.cudart() - cudart.cudaHostRegister(self._buffer.data_ptr(), size, 0) - - def _align_size(self, base: int) -> int: - """Align the given size to the nearest multiple of the alignment size. - - Args: - base (int): The size to be aligned. - - Returns: - int: The aligned size. - """ - return ((base - 1) // self._align_to + 1) * self._align_to - - def allocate(self, size: int) -> Tuple[int, Optional[torch.Tensor]]: - """Allocate a buffer of the given size. - - Args: - size (int): The size of the buffer to be allocated. - - Returns: - Optional[Tuple[int, torch.Tensor]]: A tuple containing the address - of the allocated buffer and the buffer itself. 
If allocation - fails, returns None. - """ - # During allocation, we always make sure that high watermark and - # low watermark are aligned to the alignment size - aligned_size = self._align_size(size) # Align the requested size - turnaround_size = (self._high_watermark // self._size + 1) * self._size - - local_high = self._high_watermark % self._size - local_low = self._low_watermark % self._size - - if local_high >= local_low: - if local_high == local_low and \ - self._high_watermark > self._low_watermark: - # No space available - return -1, None - - # If high watermark + requested size is okay, directly allocate - if local_high + size < self._size: - address = self._high_watermark - self._allocated[address] = aligned_size - start = local_high - end = start + size - self._high_watermark += aligned_size - return address, self._buffer[start:end] - else: - # If high watermark + requested size is not okay, we need to - # wrap around and allocate again - self._high_watermark = turnaround_size - return self.allocate(size) - else: - # High watermark is below low watermark, check if we can allocate - if local_high + size < local_low: - address = self._high_watermark - self._allocated[address] = aligned_size - start = local_high - end = start + size - self._high_watermark += aligned_size - return address, self._buffer[start:end] - else: - # No space available - return -1, None - - def free(self, address: int) -> None: - """Free the buffer at the given address. - - Args: - address (int): The address of the buffer to be freed, which - is returned by the allocate() method. - """ - assert address in self._allocated, \ - "Address not found in allocated buffers" - - # Pop the address from the allocated dict, and update the - # low watermark - self._allocated.pop(address) - - # If there is nothing allocated, set low_watermark to high watermark - new_low_watermark = self._high_watermark - - # Else, set the low_watermark to the first address in the allocated - # dict - for addr in self._allocated.keys(): - new_low_watermark = addr - break - self._low_watermark = new_low_watermark - - @property - def high_watermark(self) -> int: - return self._high_watermark - - @property - def low_watermark(self) -> int: - return self._low_watermark - - -@dataclass -class CPUSendTask(SendTask): - """CPUSendTask is a send task that uses CPU memory for the buffer. - """ - buffer_addr: int - creation_time: float = 0.0 - cuda_event: Optional[torch.cuda.Event] = None - - dbg_send_time: Optional[float] = None - - def __post_init__(self) -> None: - self.creation_time = time.time() - - @_lmcache_nvtx_annotate - def update_states(self) -> None: - """Update the states of the send task. - """ - # Check the cuda event - if not self.state.sender_ready and self.cuda_event is not None \ - and self.cuda_event.query(): - self.state.sender_ready = True - - curr_time = time.time() - if curr_time - self.creation_time > 0.5: - self.state.receiver_ready = True - - if self.dbg_send_time is not None and \ - curr_time - self.dbg_send_time > 1: - self.state.send_done = True - - def dbg_mark_sending(self) -> None: - """Mark the send task as sending. - """ - self.state.is_sending = True - self.dbg_send_time = time.time() - -class CPUKVSender(KVSenderInterface): - """CPUKVSender is an implementation of KVSenderInterface that provides a - ring buffer allocator for managing pin memory allocation and deallocation. 
- """ - - def __init__(self, buffer_size: int) -> None: - super().__init__() - self._buffer_size = buffer_size - self._allocator = RingBufferAllocator(self._buffer_size) - - def create_send_task( - self, - source_spec: SourceSpec, - destination_spec: DestinationSpec, - ) -> SendTask: - """Create a non-ready send task with a CPU buffer allocated. - - Args: - source_spec (SourceSpec): The source specification of the send - task. - destination_spec (DestinationSpec): The destination - specification of the send task. - """ - # Allocate a buffer for the send task - size = source_spec.get_size() - address, buffer = self._allocator.allocate(size) - while address == -1: - # If allocation fails, wait for a while to process - # and try again - time.sleep(0.001) - self.progress() - address, buffer = self._allocator.allocate(size) - assert buffer is not None, "Buffer allocation failed" - - # Create a send task with the allocated buffer - task = CPUSendTask( - buffer=buffer, - source_spec=source_spec, - destination_spec=destination_spec, - state=SendTaskState(), - buffer_addr=address, - ) - self.add_send_task(task) - return task - - def free_task(self, task: SendTask) -> None: - """Free the send task. - - Args: - task (SendTask): The send task to be freed. - """ - # Free the buffer in the ring buffer allocator - self._allocator.free(task.buffer_addr) - - def send_task(self, task: SendTask) -> None: - """Send the send task after it is ready. - - Args: - task (SendTask): The send task to be sent. - """ - # DEBUG IMPLEMENTATION - logger.error("CPUKVSender.send_task() not implemented, running a debug implementation!") - task.dbg_mark_sending() - - def pre_progress_hook(self) -> None: - for task in self.get_send_tasks(): - task.update_states() - - def post_progress_hook(self) -> None: - pass - - def _send(self, task: SendTask) -> None: - # NO IMPLEMENTATION YET - pass - ##################################################################### # Connector related code @@ -926,24 +455,22 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, assert self._cpu_kv_sender is not None for prefill_req in meta.prefill_meta: - # TODO: add skip leading/trailing tokens into source_spec - # or maybe recompute it at the receiver side based on the - # token_range + # Create a source spec with serializable types source_spec = SourceSpec( - request_id = prefill_req.req_id, - layer_id = self._get_layer_id(layer_name), - token_range = prefill_req.token_range, - shape = self._get_kv_shape( - len(prefill_req.blocks_to_save)), - dtype = kv_layer.dtype + request_id=prefill_req.req_id, + layer_id=self._get_layer_id(layer_name), + start=prefill_req.token_range.start, + stop=prefill_req.token_range.stop, + shape=tuple(self._get_kv_shape(len(prefill_req.blocks_to_save))), + dtype_str=str(kv_layer.dtype).split('.')[-1] # Convert torch.float32 -> "float32" ) # Create a destination spec # TODO: remove the hard-code here dest_spec = DestinationSpec( - rank = get_tensor_model_parallel_rank(), - host = "localhost", - base_port = "54321", + rank=get_tensor_model_parallel_rank(), + host="localhost", + base_port=54321, # Changed from string to int to match the class definition ) # Create the send task diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py new file mode 100644 index 000000000000..d9febea1eaeb --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -0,0 +1,333 @@ +# SPDX-License-Identifier: 
Apache-2.0 +import contextlib +import math +import threading +import time +import uuid +from abc import ABC, abstractmethod +from collections import defaultdict, OrderedDict +from collections.abc import Iterator +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional, Tuple + +import msgspec +import torch +import zmq + +from vllm import envs +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, + get_tp_group) +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + DestinationSpec, SourceSpec, RingBufferAllocator) +from vllm.logger import init_logger +from vllm.utils import make_zmq_path, make_zmq_socket, round_down, cdiv +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.request import RequestStatus +from vllm import _custom_ops as ops + +from lmcache.utils import _lmcache_nvtx_annotate + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.core.sched.output import CachedRequestData, NewRequestData + from vllm.v1.request import Request + +logger = init_logger(__name__) + + + +@dataclass +class SendTaskState: + """SendTaskState is used to track the state of a send task. + """ + sender_ready: bool = False + receiver_ready: bool = False + is_sending: bool = False + send_done: bool = False + + def __str__(self) -> str: + return (f"SendTaskState(sender_ready={self.sender_ready}, " + f"receiver_ready={self.receiver_ready}, " + f"is_sending={self.is_sending}, " + f"send_done={self.send_done})") + + def is_ready(self) -> bool: + """Check if the send task is ready to be sent. + + Returns: + bool: True if the send task is ready, False otherwise. + """ + return self.sender_ready and self.receiver_ready + + def is_done(self) -> bool: + """Check if the send task is done. + + Returns: + bool: True if the send task is done, False otherwise. + """ + return self.send_done + +@dataclass +class SendTask: + """Wraps a KV Cache sending task + """ + + # A flat buffer holding the tensor data + buffer: torch.Tensor + source_spec: SourceSpec + destination_spec: DestinationSpec + state: SendTaskState + + @property + def tensor(self) -> torch.Tensor: + """Get the tensor of the send task. + + Returns: + torch.Tensor: The tensor of the send task. + """ + num_elements = self.source_spec.tensor_shape.numel() + return self.buffer.view( + self.source_spec.dtype)[:num_elements].view( + self.source_spec.tensor_shape) + + def update_states(self) -> None: + """Update the states of the send task. This needs to be OVERWRITTEN in + subclasses to handle different types of send tasks. + + This function should be called periodically to ensure that the send + task is being processed. + """ + raise NotImplementedError + + def is_ready(self) -> bool: + """Check if the send task is ready to be sent. + + Returns: + bool: True if the send task is ready, False otherwise. + """ + return self.state.is_ready() + + def is_sending(self) -> bool: + """Check if the send task is currently sending. + + Returns: + bool: True if the send task is sending, False otherwise. + """ + return self.state.is_sending + + def is_done(self) -> bool: + """Check if the send task is done. + + Returns: + bool: True if the send task is done, False otherwise. 
+ """ + return self.state.is_done() + +class KVSenderInterface(ABC): + """KVSenderInterface is an interface for sending KV cache data. + """ + + def __init__(self) -> None: + self._send_tasks: list[SendTask] = [] + + + def add_send_task(self, task: SendTask) -> None: + """Add a send task to the list of send tasks. + + Args: + task (SendTask): The send task to be added. + """ + self._send_tasks.append(task) + + def get_send_tasks(self) -> list[SendTask]: + """Get the list of send tasks. + + Returns: + list[SendTask]: The list of send tasks. + """ + return self._send_tasks + + @_lmcache_nvtx_annotate + def progress(self) -> None: + """A fast, non-blocking function to check and update the states of all + send tasks. This function should be called periodically to ensure that + the send tasks are being processed. + """ + # Update before going through all send tasks + self.pre_progress_hook() + + new_task_list = [] + + for task in self._send_tasks: + should_add = True + + if task.is_ready() and not task.is_sending(): + self._send(task) + + if task.is_done(): + self.free_task(task) + should_add = False + + if should_add: + new_task_list.append(task) + + self._send_tasks = new_task_list + + # Update after going through all send tasks + self.post_progress_hook() + + ###################################################### + # Abstract methods (to be implemented by subclasses) # + ###################################################### + + @abstractmethod + def create_send_task( + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: + """Create a non-ready send task with a CPU buffer allocated. + + Args: + source_spec (SourceSpec): The source specification of the send + task. + destination_spec (DestinationSpec): The destination + specification of the send task. + """ + raise NotImplementedError("create_send_task() not implemented") + + @abstractmethod + def free_task(self, task: SendTask) -> None: + """Free the send task. + + Args: + task (SendTask): The send task to be freed. + """ + raise NotImplementedError("free_task() not implemented") + + @abstractmethod + def send_task(self, task: SendTask) -> None: + """Send the send task after it is ready. + + Args: + task (SendTask): The send task to be sent. + """ + raise NotImplementedError("send_task() not implemented") + + @abstractmethod + def pre_progress_hook(self, task: SendTask) -> None: + """Hook to be called before processing the send task. + + Args: + task (SendTask): The send task to be processed. + """ + raise NotImplementedError("pre_progress_hook() not implemented") + + @abstractmethod + def post_progress_hook(self, task: SendTask) -> None: + """Hook to be called after processing the send task. + + Args: + task (SendTask): The send task to be processed. + """ + raise NotImplementedError("post_progress_hook() not implemented") + + + +@dataclass +class CPUSendTask(SendTask): + """CPUSendTask is a send task that uses CPU memory for the buffer. + """ + buffer_addr: int + cuda_event: Optional[torch.cuda.Event] = None + + def __post_init__(self) -> None: + self.creation_time = time.time() + + @_lmcache_nvtx_annotate + def update_states(self) -> None: + """Update the states of the send task. 
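+
+        The CUDA event attached to this task (recorded by the caller,
+        e.g. after the device-to-host copy) is polled here; once it has
+        completed, the task is marked as sender-ready.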
+ """ + # Check the cuda event + if not self.state.sender_ready and self.cuda_event is not None \ + and self.cuda_event.query(): + self.state.sender_ready = True + +class CPUKVSender(KVSenderInterface): + """CPUKVSender is an implementation of KVSenderInterface that provides a + ring buffer allocator for managing pin memory allocation and deallocation. + """ + + def __init__(self, buffer_size: int) -> None: + super().__init__() + self._buffer_size = buffer_size + self._allocator = RingBufferAllocator(self._buffer_size) + + def create_send_task( + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: + """Create a non-ready send task with a CPU buffer allocated. + + Args: + source_spec (SourceSpec): The source specification of the send + task. + destination_spec (DestinationSpec): The destination + specification of the send task. + """ + # Allocate a buffer for the send task + size = source_spec.get_size() + address, buffer = self._allocator.allocate(size) + while address == -1: + # If allocation fails, wait for a while to process + # and try again + time.sleep(0.001) + self.progress() + address, buffer = self._allocator.allocate(size) + assert buffer is not None, "Buffer allocation failed" + + # Create a send task with the allocated buffer + task = CPUSendTask( + buffer=buffer, + source_spec=source_spec, + destination_spec=destination_spec, + state=SendTaskState(), + buffer_addr=address, + ) + self.add_send_task(task) + return task + + def free_task(self, task: SendTask) -> None: + """Free the send task. + + Args: + task (SendTask): The send task to be freed. + """ + # Free the buffer in the ring buffer allocator + self._allocator.free(task.buffer_addr) + + def send_task(self, task: SendTask) -> None: + """Send the send task after it is ready. + + Args: + task (SendTask): The send task to be sent. 
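+
+        Note: in this WIP revision this is a debug stub; it only marks
+        the task as sending and does not transfer any data yet.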
+ """ + # DEBUG IMPLEMENTATION + logger.error("CPUKVSender.send_task() not implemented, running a debug implementation!") + task.dbg_mark_sending() + + def pre_progress_hook(self) -> None: + for task in self.get_send_tasks(): + task.update_states() + + def post_progress_hook(self) -> None: + pass + + def _send(self, task: SendTask) -> None: + # NO IMPLEMENTATION YET + pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py new file mode 100644 index 000000000000..dc26f2f554ea --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -0,0 +1,721 @@ +# SPDX-License-Identifier: Apache-2.0 +import contextlib +import math +import threading +import time +import uuid +from collections import defaultdict, OrderedDict +from collections.abc import Iterator +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +import msgspec +import torch +import zmq + +from vllm import envs +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, + get_tp_group) +from vllm.logger import init_logger +from vllm.utils import make_zmq_path, make_zmq_socket, round_down +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.request import RequestStatus + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.request import Request + +logger = init_logger(__name__) + +try: + from nixl._api import nixl_agent as NixlWrapper + logger.info("NIXL is available") +except ImportError: + logger.warning("NIXL is not available") + NixlWrapper = None + +################################################################### +# Helper classes and functions +################################################################### + +def init_nixl_agent( + buffer_size: int, + buffer_ptr: int, + nixl_page_size: int = 4096, +) -> tuple[NixlWrapper, Any, Any]: + """Initialize the NIXL agent. + + Args: + buffer_size (int): The size of the buffer. + buffer_ptr (int): The pointer to the buffer. + nixl_page_size (int, optional): The page size of NIXL. Defaults to 4096. + + Returns: + NixlWrapper: The NIXL agent. + reg_dlist: the registered memory descriptor list. + xfer_dlist: the local transfer descriptor list. + """ + if NixlWrapper is None: + raise RuntimeError("NIXL is not available") + + # Create a NIXL agent + nixl_agent = NixlWrapper(str(uuid.uuid4())) + + # Register the memory + memory_desc = [(buffer_ptr, buffer_size, 0, "")] + reg_descs = nixl_agent.get_reg_descs(memory_desc, mem_type="DRAM") + nixl_agent.register_memory(reg_descs) + + # Create xfer handlers + xfer_desc = [] + for base_addr in range(buffer_ptr, + buffer_ptr + buffer_size, + nixl_page_size): + xfer_desc.append((base_addr, nixl_page_size, 0)) + + descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type="DRAM") + local_xfer_dlist = nixl_agent.prep_xfer_dlist( + "", descs, mem_type="DRAM") + + return nixl_agent, reg_descs, local_xfer_dlist + +@dataclass +class DestinationSpec: + """DestinationSpec is used to specify the destination of kv sending task. + + Attributes: + rank (int): The rank of the destination. + host (str): The path of the destination. 
+ base_port (int): The base port of the destination. + """ + rank: int + host: str + base_port: int + + def __str__(self) -> str: + return f"DestinationSpec(rank={self.rank}, host={self.host}, base_port={self.base_port})" + + def get_id(self) -> str: + """Get the id of the destination spec. + + Returns: + str: The id of the destination spec. + """ + return f"{self.rank}_{self.host}_{self.base_port}" + +class SourceSpec(msgspec.Struct): + """SourceSpec is used to specify the source of kv sending task. + """ + # The request id of the kv cache + request_id: str + + # The layer id of the kv cache + layer_id: int + + # The range of tokens to be offloaded + start: int # For token_range slice + stop: int # For token_range slice + + # The shape of the offloaded KV cache tensor as a tuple + shape: tuple[int, ...] + + # The dtype of the offloaded KV cache tensor as a string + dtype_str: str + + @property + def token_range(self) -> slice: + """Get the token range as a slice object.""" + return slice(self.start, self.stop) + + @property + def tensor_shape(self) -> torch.Size: + """Get the shape as a torch.Size object.""" + return torch.Size(self.shape) + + @property + def dtype(self) -> torch.dtype: + """Get the dtype as a torch.dtype object.""" + return getattr(torch, self.dtype_str) + + def get_size(self) -> int: + """Get the size in bytes of the cooresponding kv cache.""" + return math.prod(self.shape) * self.dtype.itemsize + + def __str__(self) -> str: + return (f"SourceSpec(request_id={self.request_id}, " + f"layer_id={self.layer_id}, " + f"token_range={self.token_range}, shape={self.tensor_shape})") + +class RingBufferAllocator: + """RingBufferAllocator is a simple ring buffer allocator for managing + memory allocation and deallocation. + """ + + def __init__(self, size: int, align_to: int = 256) -> None: + """Initialize the ring buffer allocator with the given size. + + Args: + size (int): The size of the ring buffer (in bytes). + align_to (int): The alignment size (in bytes). Default is 8. + """ + self._size = size + self._buffer = torch.empty(size, dtype=torch.uint8) + self._high_watermark = 0 + self._low_watermark = 0 + self._align_to = align_to + + self._allocated = OrderedDict() # Track allocated buffers + + # Register pin memory + cudart = torch.cuda.cudart() + cudart.cudaHostRegister(self._buffer.data_ptr(), size, 0) + + def _align_size(self, base: int) -> int: + """Align the given size to the nearest multiple of the alignment size. + + Args: + base (int): The size to be aligned. + + Returns: + int: The aligned size. + """ + return ((base - 1) // self._align_to + 1) * self._align_to + + def allocate(self, size: int) -> tuple[int, Optional[torch.Tensor]]: + """Allocate a buffer of the given size. + + Args: + size (int): The size of the buffer to be allocated. + + Returns: + Optional[tuple[int, torch.Tensor]]: A tuple containing the virtual + address of the allocated buffer and the buffer itself. If + allocation fails, returns None. 
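+                Note that in this implementation a failed allocation is
+                signalled by returning (-1, None) rather than a bare None.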
+ """ + # During allocation, we always make sure that high watermark and + # low watermark are aligned to the alignment size + aligned_size = self._align_size(size) # Align the requested size + turnaround_size = (self._high_watermark // self._size + 1) * self._size + + local_high = self._high_watermark % self._size + local_low = self._low_watermark % self._size + + if local_high >= local_low: + if local_high == local_low and \ + self._high_watermark > self._low_watermark: + # No space available + return -1, None + + # If high watermark + requested size is okay, directly allocate + if local_high + size < self._size: + address = self._high_watermark + self._allocated[address] = aligned_size + start = local_high + end = start + size + self._high_watermark += aligned_size + return address, self._buffer[start:end] + else: + # If high watermark + requested size is not okay, we need to + # wrap around and allocate again + self._high_watermark = turnaround_size + return self.allocate(size) + else: + # High watermark is below low watermark, check if we can allocate + if local_high + size < local_low: + address = self._high_watermark + self._allocated[address] = aligned_size + start = local_high + end = start + size + self._high_watermark += aligned_size + return address, self._buffer[start:end] + else: + # No space available + return -1, None + + def free(self, address: int) -> None: + """Free the buffer at the given address. + + Args: + address (int): The virtual address of the buffer to be freed, + which is returned by the allocate() method. + """ + assert address in self._allocated, \ + "Address not found in allocated buffers" + + # Pop the address from the allocated dict, and update the + # low watermark + self._allocated.pop(address) + + # If there is nothing allocated, set low_watermark to high watermark + new_low_watermark = self._high_watermark + + # Else, set the low_watermark to the first address in the allocated + # dict + for addr in self._allocated.keys(): + new_low_watermark = addr + break + self._low_watermark = new_low_watermark + + @property + def high_watermark(self) -> int: + return self._high_watermark + + @property + def low_watermark(self) -> int: + return self._low_watermark + + def virtual_to_physical(self, vaddr: int) -> torch.Tensor: + """Convert a virtual address to a physical address. + + Args: + vaddr (int): The virtual address to be converted. + + Returns: + torch.Tensor: The physical address of the buffer. + """ + return vaddr + self._size + + def get_size(self) -> int: + """Get the size of the ring buffer. + + Returns: + int: The size of the ring buffer. + """ + return self._size + + def get_buffer_ptr(self) -> int: + """Get the pointer to the buffer. + + Returns: + int: The pointer to the buffer. + """ + return self._buffer.data_ptr() + +################################################################### +# NIXL Related Classes +################################################################### + +class NixlProtocolMsg(msgspec.Struct): + msg_type: str + req_uuid: str + source_spec: Optional[SourceSpec] = None + receiver_addr: Optional[int] = None + + + +def make_send_req_msg( + source_spec: SourceSpec +) -> bytes: + """Make the send request message. + + Args: + source_spec (SourceSpec): The source spec. + + Returns: + bytes: The send request message. 
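+
+    REQMSG is the first step of the transfer protocol sketched in this
+    file: the sender announces a SourceSpec, the receiver allocates a
+    buffer and replies with READYMSG carrying the receiver address, and
+    the sender is expected to signal completion with FINISHMSG.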
+ """ + # Create the request message + msg_type = "REQMSG" + req_uuid = str(uuid.uuid4()) + receiver_addr = None + send_req_msg = NixlProtocolMsg( + msg_type=msg_type, + req_uuid=req_uuid, + source_spec=source_spec, + receiver_addr=receiver_addr + ) + # Encode the message + send_req_msg_bytes = msgspec.msgpack.encode(send_req_msg) + return send_req_msg_bytes + +def make_receive_ready_msg( + req_uuid: str, + receiver_addr: int, +) -> bytes: + """Make the receive ready message. + + Args: + req_uuid (str): The request uuid. + receiver_addr (int): The receiver address. + + Returns: + bytes: The receive ready message. + """ + # Create the request message + msg_type = "READYMSG" + source_spec = None + receive_ready_msg = NixlProtocolMsg( + msg_type=msg_type, + req_uuid=req_uuid, + source_spec=source_spec, + receiver_addr=receiver_addr + ) + # Encode the message + receive_ready_msg_bytes = msgspec.msgpack.encode(receive_ready_msg) + return receive_ready_msg_bytes + +def make_send_finish_msg( + req_uuid: str, +) -> bytes: + """Make the send finish message. + + Args: + req_uuid (str): The request uuid. + + Returns: + bytes: The send finish message. + """ + # Create the request message + msg_type = "FINISHMSG" + source_spec = None + receiver_addr = None + send_finish_msg = NixlProtocolMsg( + msg_type=msg_type, + req_uuid=req_uuid, + source_spec=source_spec, + receiver_addr=receiver_addr + ) + # Encode the message + send_finish_msg_bytes = msgspec.msgpack.encode(send_finish_msg) + return send_finish_msg_bytes + + +class NixlCPUSender: + def __init__( + self, + buffer_size: int, + buffer_ptr: int, + nixl_page_size: int = 4096, + ) -> None: + self._buffer_size = buffer_size + self._buffer_ptr = buffer_ptr + self._nixl_page_size = nixl_page_size + + # Destination spec id -> peer name + self._remote_agents: dict[str, str] = {} + + self._nixl_wrapper, self._reg_dlist, self._local_xfer_dlist = \ + init_nixl_agent(buffer_size, buffer_ptr, nixl_page_size) + + # Add ZMQ context for handshakes + self._zmq_ctx = zmq.Context() + + # Requests that are ready to send + # uuid -> remote agent name + self._ready_requests: dict[str, str] = {} + + # NOTE(ApostaC): we don't track the requests that are waiting for the + # receiver to be ready, and may want to add this in the future + + # Msg decoder + self._msg_decoder = msgspec.msgpack.Decoder(NixlProtocolMsg) + + def send( + self, + src_addr: int, + dst_addr: int, + data_size: int + ) -> None: + """Send data from src_addr to dst_addr using NIXL. + + Args: + src_addr (int): Source address. + dst_addr (int): Destination address. + data_size (int): Size of the data in bytes to be sent. + """ + pass + + def prepare_send( + self, + destination_spec: DestinationSpec, + source_spec: SourceSpec, + ) -> str: + """Prepare the send operation by allocation the receive buffer + on the destination side. + + Args: + destination_spec (DestinationSpec): The destination spec. + source_spec (SourceSpec): The source spec. + + Returns: + str: The uuid of the prepared send + """ + dest_id = destination_spec.get_id() + if dest_id not in self._remote_agents: + # Perform handshake with the destination + self._nixl_handshake(destination_spec) + + remote_agent_name = self._remote_agents[dest_id] + + # Create the request message + msg = make_send_req_msg(source_spec) + + # Send it to the remote agent + self._nixl_wrapper.send_notif(remote_agent_name, msg) + + def check_and_remove_prepared_send( + self, + send_uuid: str, + ) -> Optional[str]: + """Check if the prepared send is ready to be sent. 
+        If the send is ready, remove it from the ready requests.
+
+        Args:
+            send_uuid (str): The uuid of the prepared send.
+
+        Returns:
+            Optional[str]: The remote agent name if the send is ready,
+                None otherwise.
+        """
+        # Update the ready requests
+        notifs = self._nixl_wrapper.get_new_notifs()
+        for remote_agent_name in notifs:
+            for msg in notifs[remote_agent_name]:
+                # Decode the message
+                obj = self._msg_decoder.decode(msg)
+                if obj.msg_type == "READYMSG":
+                    # Add the request to the ready requests
+                    self._ready_requests[obj.req_uuid] = remote_agent_name
+                else:
+                    logger.error("Unexpected message type: %s", obj.msg_type)
+                    continue
+
+        # Check if the send uuid is in the ready requests
+        if send_uuid in self._ready_requests:
+            # Remove the request from the ready requests
+            remote_agent_name = self._ready_requests.pop(send_uuid)
+            return remote_agent_name
+        else:
+            return None
+
+    def _nixl_handshake(self, destination_spec: DestinationSpec) -> None:
+        """Perform handshake with a remote NIXL CPU instance.
+
+        Args:
+            destination_spec (DestinationSpec): The destination spec.
+        """
+        assert get_tensor_model_parallel_rank() == destination_spec.rank, \
+            "Got different rank in destination spec and current rank"
+
+        port = destination_spec.base_port + destination_spec.rank
+        path = make_zmq_path("tcp", destination_spec.host, port)
+
+        local_meta = self._nixl_wrapper.get_agent_metadata()
+        with zmq_ctx(zmq.REQ, path) as sock:
+            # Send query for metadata
+            logger.info("Sending handshake request to %s", destination_spec)
+            sock.send(local_meta)
+
+            logger.info("Waiting for handshake response from %s", destination_spec)
+            metadata_bytes = sock.recv()
+
+            # Get remote agent name and register it
+            remote_agent_name = self._nixl_wrapper.add_remote_agent(
+                metadata_bytes)
+
+            # Store remote agent info
+            self._remote_agents[destination_spec.get_id()] = remote_agent_name
+
+            logger.info("Successfully completed handshake with %s",
+                        destination_spec)
+
+
+class NixlCPUReceiver:
+    def __init__(
+        self,
+        allocator: Optional[RingBufferAllocator] = None,
+        nixl_page_size: int = 4096,
+    ) -> None:
+        assert allocator is not None, "Allocator is required"
+
+        self._buffer_size = allocator.get_size()
+        self._buffer_ptr = allocator.get_buffer_ptr()
+        self._nixl_page_size = nixl_page_size
+        self._allocator = allocator
+
+        # Requests that are pending for allocation
+        # uuid -> tuple[SourceSpec, peer name]
+        self._pending_allocation: dict[str, tuple[SourceSpec, str]] = {}
+
+        # Already allocated requests
+        # uuid -> SourceSpec and uuid -> virtual address
+        self._inflight_requests: dict[str, SourceSpec] = {}
+        self._inflight_request_vaddr: dict[str, int] = {}
+
+        # Finished requests
+        # uuid -> tuple[SourceSpec, virtual address]
+        self._finished_requests: dict[str, tuple[SourceSpec, int]] = {}
+
+        # source zmq id -> peer name
+        self._remote_agents: dict[str, str] = {}
+
+        self._nixl_wrapper, self._reg_dlist, self._local_xfer_dlist = \
+            init_nixl_agent(self._buffer_size, self._buffer_ptr,
+                            self._nixl_page_size)
+
+        # Decoder for incoming protocol messages
+        self._msg_decoder = msgspec.msgpack.Decoder(NixlProtocolMsg)
+
+        # Add handshake listener thread
+        self._handshake_listener_t: Optional[threading.Thread] = None
+        self._stop_listener = threading.Event()
+
+    def _process_msgs(self):
+        """Process the received messages from the NIXL agent."""
+        notifs = self._nixl_wrapper.get_new_notifs()
+        for remote_agent_name in notifs:
+            for msg in notifs[remote_agent_name]:
+                # Decode the message
+                obj = self._msg_decoder.decode(msg)
+                if obj.msg_type == "REQMSG":
+                    # Add the request to the pending allocation
+                    self._pending_allocation[obj.req_uuid] = 
(obj.source_spec, + remote_agent_name) + elif msg.msg_type == "FINISHMSG": + # Add the request to the finished requests + if obj.req_uuid in self._inflight_requests: + source_spec = self._inflight_requests.pop(obj.req_uuid) + vaddr = self._inflight_request_vaddr.pop(obj.req_uuid) + self._finished_requests[obj.req_uuid] = (source_spec, vaddr) + else: + logger.error("Request %s not found in inflight requests", + obj.req_uuid) + else: + logger.error("Unexpected message type: %s", msg.msg_type) + continue + + def _process_allocation_requests(self): + """Process the allocation requests and allocate the buffers.""" + allocated_requests = [] + for req_uuid, (source_spec, peer_name) in \ + self._pending_allocation.items(): + # Try to allocate the buffer + vaddr, buffer = self._allocator.allocate(source_spec.get_size()) + if vaddr == -1: + # No space available, skip all the requests + + # NOTE: an alternative is to try allocation for other requests + # and then come back to this one, but this may create + # starvation + break + + # Add the request to the inflight requests + self._inflight_requests[req_uuid] = source_spec + self._inflight_request_vaddr[req_uuid] = vaddr + + # Send back the ready message + paddr = self._allocator.virtual_to_physical(vaddr) + ready_msg = make_receive_ready_msg(req_uuid, paddr) + self._nixl_wrapper.send_notif(peer_name, ready_msg) + + # Add the request to the allocated requests + allocated_requests.append(req_uuid) + + # Remove the allocated requests from the pending allocation + for req_uuid in allocated_requests: + del self._pending_allocation[req_uuid] + + def progress(self) -> None: + """Process the received requests and the data + """ + self._process_msgs() + self._process_allocation_requests() + + def get_finished(self) -> list[tuple[SourceSpec, int]]: + """Get the requests that finishes receiving. + + Returns: + list[tuple[SourceSpec, int]]: A list of tuples containing the source + spec and the address. + """ + pass + + def start_handshake_listener(self, host: str, base_port: int) -> None: + """Start the background thread that listens for handshake requests. + + Args: + host (str): Host address to listen on + base_port (int): Base port number to listen on + """ + ready_event = threading.Event() + self._handshake_listener_t = threading.Thread( + target=self._nixl_handshake_listener, + args=(host, base_port, ready_event), + daemon=True, + name="nixl_cpu_handshake_listener" + ) + self._handshake_listener_t.start() + ready_event.wait() + + def _nixl_handshake_listener( + self, + host: str, + base_port: int, + ready_event: threading.Event + ) -> None: + """Background thread that listens for and responds to handshake requests. 
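+
+        Each handshake is a single request/reply exchange: the sender
+        connects with a REQ socket and sends its NIXL agent metadata;
+        this listener registers the peer via add_remote_agent() and
+        replies with the local agent metadata.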
+ + Args: + host (str): Host address to listen on + base_port (int): Base port number to listen on + ready_event (threading.Event): Event to signal when listener is ready + """ + # Prepare metadata + local_meta = self._nixl_wrapper.get_agent_metadata() + + # Setup ZMQ socket + port = base_port + get_tensor_model_parallel_rank() + path = make_zmq_path("tcp", host, port) + logger.info("Starting handshake listener on path: %s", path) + + with zmq_ctx(zmq.ROUTER, path) as sock: + ready_event.set() + logger.info("Handshake listener is ready") + + while not self._stop_listener.is_set(): + logger.info("Waiting for handshake request") + try: + identity, _, msg = sock.recv_multipart(flags=zmq.NOBLOCK) + remote_agent_name = self._nixl_wrapper.add_remote_agent( + msg) + self._remote_agents[identity] = remote_agent_name + logger.info("Successfully received handshake from %s", + identity) + # Send back the local metadata to the sender + sock.send_multipart([identity, b"", local_meta]) + logger.info("Sent local metadata back to %s", identity) + except zmq.error.Again: + # No message available + time.sleep(0.1) + except Exception as e: + logger.error("Error in handshake listener: %s", e) + break + logger.info("Stopping handshake listener") + + def stop_handshake_listener(self) -> None: + """Stop the handshake listener thread.""" + if self._handshake_listener_t is not None: + self._stop_listener.set() + self._handshake_listener_t.join() + self._handshake_listener_t = None + + +@contextlib.contextmanager +def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: + """Context manager for a ZMQ socket""" + + if socket_type not in (zmq.ROUTER, zmq.REQ): + raise ValueError(f"Unexpected socket type: {socket_type}") + + ctx: Optional[zmq.Context] = None + try: + ctx = zmq.Context() # type: ignore[attr-defined] + yield make_zmq_socket(ctx=ctx, + path=addr, + socket_type=socket_type, + bind=socket_type == zmq.ROUTER) + finally: + if ctx is not None: + ctx.destroy(linger=0) From 050cfe634ce51db241f98447996c5f512a9eadd8 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 20 May 2025 00:28:57 +0000 Subject: [PATCH 05/28] [Add] tests for nixl Signed-off-by: ApostaC --- .../cpu_kv_integration/test_nixl_cpu_utils.py | 341 ++++++++++++++++++ .../test_ring_buffer_allocator.py | 2 +- .../cpu_kv_integration/test_toy_example.py | 80 ++++ 3 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py create mode 100644 tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py new file mode 100644 index 000000000000..e5e6320d669f --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -0,0 +1,341 @@ +# SPDX-License-Identifier: Apache-2.0 +import multiprocessing as mp +import pytest +import torch +import threading +import time + +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlCPUSender, NixlCPUReceiver, SourceSpec, DestinationSpec, + RingBufferAllocator +) + +try: + from nixl._api import nixl_agent as NixlWrapper + NIXL_AVAILABLE = True +except ImportError: + NIXL_AVAILABLE = False + +def run_receiver(buffer_config, host, base_port, rank, ready_event, stop_event): + """Process function for running the receiver.""" + try: + # Mock tensor_model_parallel_rank for this process + import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as 
utils + utils.get_tensor_model_parallel_rank = lambda: rank + + # Create ring buffer allocator + allocator = utils.RingBufferAllocator( + size=buffer_config['buffer_size'], + align_to=buffer_config['nixl_page_size'] + ) + + # Create and start receiver + receiver = NixlCPUReceiver( + allocator=allocator, + nixl_page_size=buffer_config['nixl_page_size'] + ) + receiver.start_handshake_listener(host, base_port) + + # Signal receiver is ready + ready_event.set() + + # Wait for stop signal + stop_event.wait() + + # Cleanup + receiver.stop_handshake_listener() + + except Exception as e: + print(f"Receiver process error: {e}") + raise + +def run_sender(buffer_config, host, base_port, rank, receiver_ready_event): + """Process function for running the sender.""" + try: + # Mock tensor_model_parallel_rank for this process + import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils + utils.get_tensor_model_parallel_rank = lambda: rank + + # Wait for receiver to be ready + receiver_ready_event.wait() + + # Create sender and perform handshake + sender = NixlCPUSender( + buffer_size=buffer_config['buffer_size'], + buffer_ptr=buffer_config['buffer_ptr'], + nixl_page_size=buffer_config['nixl_page_size'] + ) + + dest_spec = DestinationSpec( + rank=rank, + host=host, + base_port=base_port + ) + sender._nixl_handshake(dest_spec) + + # Verify handshake results + assert dest_spec.get_id() in sender._remote_agents + assert sender._remote_agents[dest_spec.get_id()] is not None + + return True + except Exception as e: + print(f"Sender process error: {e}") + return False + +def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event, stop_event, progress_interval=0.001): + """Process function for running the receiver with progress loop.""" + try: + # Mock tensor_model_parallel_rank for this process + import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils + utils.get_tensor_model_parallel_rank = lambda: rank + + # Create ring buffer allocator + allocator = utils.RingBufferAllocator( + size=buffer_config['buffer_size'], + align_to=buffer_config['nixl_page_size'] + ) + + # Create and start receiver + receiver = NixlCPUReceiver( + allocator=allocator, + nixl_page_size=buffer_config['nixl_page_size'] + ) + receiver.start_handshake_listener(host, base_port) + + # Signal receiver is ready + ready_event.set() + + # Run progress loop until stop signal + while not stop_event.is_set(): + receiver.progress() + time.sleep(progress_interval) + + # Cleanup + receiver.stop_handshake_listener() + + except Exception as e: + print(f"Receiver process error: {e}") + raise + +def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_ready_event, success_event): + """Process function for running the sender with protocol communication.""" + try: + # Mock tensor_model_parallel_rank for this process + import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils + utils.get_tensor_model_parallel_rank = lambda: rank + + # Wait for receiver to be ready + receiver_ready_event.wait() + + # Create sender + sender = NixlCPUSender( + buffer_size=buffer_config['buffer_size'], + buffer_ptr=buffer_config['buffer_ptr'], + nixl_page_size=buffer_config['nixl_page_size'] + ) + + # Create destination spec and perform handshake + dest_spec = DestinationSpec( + rank=rank, + host=host, + base_port=base_port + ) + sender._nixl_handshake(dest_spec) + + # Create source spec and prepare send + source_spec = SourceSpec( + request_id="test_request", + layer_id=0, + start=0, 
+ stop=16, # Assuming we want to send 16 tokens + shape=(2, 1, 16, 8, 128), # Example shape + dtype_str="bfloat16" # Example dtype + ) + + # Prepare send and wait for completion + sender.prepare_send(source_spec, dest_spec) + + max_retries = 100 + retry_count = 0 + remote_agent = None + + while retry_count < max_retries: + remote_agent = sender.check_and_remove_prepared_send(source_spec, dest_spec) + if remote_agent is not None: + break + time.sleep(0.1) + retry_count += 1 + + if remote_agent is not None: + success_event.set() + + except Exception as e: + print(f"Sender process error: {e}") + raise + +@pytest.mark.skipif(not NIXL_AVAILABLE, reason="NIXL is not available") +class TestNixlCPUUtils: + """Test cases for NixlCPUSender and NixlCPUReceiver.""" + + @pytest.fixture + def buffer_config(self): + """Common buffer configuration for tests.""" + buffer_size = 1 << 20 # 1MB + torch_buffer = torch.zeros(buffer_size, dtype=torch.uint8, device='cpu') + + return { + 'buffer_size': buffer_size, + 'buffer_ptr': torch_buffer.data_ptr(), + 'nixl_page_size': 4096 # Standard page size + } + + def test_sender_creation(self, buffer_config): + """Test creation of NixlCPUSender.""" + sender = NixlCPUSender( + buffer_size=buffer_config['buffer_size'], + buffer_ptr=buffer_config['buffer_ptr'], + nixl_page_size=buffer_config['nixl_page_size'] + ) + + # Verify internal state + assert sender._buffer_size == buffer_config['buffer_size'] + assert sender._buffer_ptr == buffer_config['buffer_ptr'] + assert sender._nixl_page_size == buffer_config['nixl_page_size'] + assert isinstance(sender._remote_agents, dict) + + # Verify NIXL initialization + assert sender._nixl_wrapper is not None + assert sender._reg_dlist is not None + assert sender._local_xfer_dlist is not None + + def test_receiver_creation(self, buffer_config): + """Test creation of NixlCPUReceiver.""" + # Create ring buffer allocator + allocator = RingBufferAllocator( + size=buffer_config['buffer_size'], + align_to=buffer_config['nixl_page_size'] + ) + + receiver = NixlCPUReceiver( + allocator=allocator, + nixl_page_size=buffer_config['nixl_page_size'] + ) + + # Verify internal state + assert receiver._buffer_size == buffer_config['buffer_size'] + assert receiver._buffer_ptr == allocator.get_buffer_ptr() + assert receiver._nixl_page_size == buffer_config['nixl_page_size'] + assert isinstance(receiver._inflight_requests, dict) + assert isinstance(receiver._inflight_request_vaddr, dict) + assert receiver._allocator is allocator + + # Verify NIXL initialization + assert receiver._nixl_wrapper is not None + assert receiver._reg_dlist is not None + assert receiver._local_xfer_dlist is not None + + def test_creation_with_invalid_buffer_size(self, buffer_config): + """Test creation with invalid buffer size.""" + with pytest.raises(Exception): # Specific exception type depends on NIXL implementation + # Create allocator with invalid size + allocator = RingBufferAllocator( + size=0, # Invalid size + align_to=buffer_config['nixl_page_size'] + ) + + NixlCPUReceiver( + allocator=allocator, + nixl_page_size=buffer_config['nixl_page_size'] + ) + + def test_nixl_handshake_multiprocess(self, buffer_config): + """Test NIXL handshake between sender and receiver in separate processes.""" + # Setup test parameters + test_host = "127.0.0.1" + test_base_port = 50051 + test_rank = 0 + + # Create events for process synchronization + receiver_ready = mp.Event() + stop_receiver = mp.Event() + + # Start receiver process + receiver_process = mp.Process( + target=run_receiver, + 
args=(buffer_config, test_host, test_base_port, + test_rank, receiver_ready, stop_receiver) + ) + receiver_process.start() + + # Start sender process + sender_process = mp.Process( + target=run_sender, + args=(buffer_config, test_host, test_base_port, + test_rank, receiver_ready) + ) + sender_process.start() + + try: + # Wait for processes to complete + sender_process.join(timeout = 20) + assert sender_process.exitcode == 0, "Sender process failed" + + finally: + # Cleanup + stop_receiver.set() + receiver_process.join(timeout=5) + + # Force terminate if processes haven't exited + if receiver_process.is_alive(): + receiver_process.terminate() + if sender_process.is_alive(): + sender_process.terminate() + + def test_nixl_protocol_communication(self, buffer_config): + """Test the full protocol communication between sender and receiver.""" + # Setup test parameters + test_host = "127.0.0.1" + test_base_port = 50052 + test_rank = 0 + + # Create events for process synchronization + receiver_ready = mp.Event() + stop_receiver = mp.Event() + protocol_success = mp.Event() + + # Start receiver process with progress loop + receiver_process = mp.Process( + target=run_receiver_with_progress, + args=(buffer_config, test_host, test_base_port, + test_rank, receiver_ready, stop_receiver) + ) + receiver_process.start() + + # Start sender process with protocol communication + sender_process = mp.Process( + target=run_sender_with_protocol, + args=(buffer_config, test_host, test_base_port, + test_rank, receiver_ready, protocol_success) + ) + sender_process.start() + + try: + # Wait for protocol communication to complete + protocol_complete = protocol_success.wait(timeout=20) + assert protocol_complete, "Protocol communication failed or timed out" + + # Wait for sender process to complete + sender_process.join(timeout=5) + assert sender_process.exitcode == 0, "Sender process failed" + + finally: + # Cleanup + stop_receiver.set() + receiver_process.join(timeout=5) + + # Force terminate if processes haven't exited + if receiver_process.is_alive(): + receiver_process.terminate() + if sender_process.is_alive(): + sender_process.terminate() + diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py index 5e67c5972f0a..a183f76308ce 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py @@ -2,7 +2,7 @@ import pytest import torch -from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector import RingBufferAllocator +from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import RingBufferAllocator def test_basic_allocation(): """Test basic allocation and deallocation behavior.""" diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py new file mode 100644 index 000000000000..1d8e914c9d43 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import pytest +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +@pytest.fixture +def env_setup(): + """Set up required environment variables""" + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +@pytest.fixture +def input_prompts(): + """Create test prompts""" + context = "Hi " * 10 # Reduced 
size for testing + context2 = "Hey " * 10 + context3 = "Hello " * 10 + context4 = "How " * 10 + return [ + context + "Hello, my name is", + context2 + "The capital of France is", + context3 + "Your name is", + context4 + "The capital of China is", + ] + +@pytest.fixture +def llm_instance(): + """Create LLM instance with test configuration""" + return LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector="CPUConnector", + kv_role="kv_producer", + kv_connector_extra_config={}, + ), + load_format="dummy", + max_model_len=2048, + max_num_batched_tokens=2048, + block_size=64, + ) + +def test_llm_generation(env_setup, input_prompts, llm_instance, tmp_path): + """Test LLM generation and output saving""" + # Configure sampling parameters + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + # Generate outputs + outputs = llm_instance.generate(input_prompts, sampling_params) + + # Verify outputs + assert len(outputs) == len(input_prompts), "Number of outputs should match number of prompts" + + # Process outputs + new_prompts = [] + for output in outputs: + assert hasattr(output, 'prompt'), "Output should have prompt attribute" + assert hasattr(output, 'outputs'), "Output should have outputs attribute" + assert len(output.outputs) > 0, "Output should have generated text" + + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + + # Test file writing + output_file = tmp_path / "output.txt" + with open(output_file, "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") + + # Verify file contents + assert output_file.exists(), "Output file should be created" + with open(output_file, "r") as f: + lines = f.readlines() + assert len(lines) == len(input_prompts), "File should contain all prompts" + for line in lines: + assert line.strip(), "Lines should not be empty" From e5034b037168404dedd63ec18b65cc1782bda6d2 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 20 May 2025 04:43:39 +0000 Subject: [PATCH 06/28] Passed the nixl protocol unit tests Signed-off-by: ApostaC --- .../cpu_kv_integration/test_nixl_cpu_utils.py | 38 +++++++++++++-- .../kv_connector/v1/nixl_cpu_utils.py | 46 ++++++++++--------- 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py index e5e6320d669f..3188cec38e2c 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import multiprocessing as mp +import torch.multiprocessing as mp import pytest import torch import threading @@ -56,13 +56,19 @@ def run_sender(buffer_config, host, base_port, rank, receiver_ready_event): import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: rank + # Create ring buffer allocator + allocator = utils.RingBufferAllocator( + size=buffer_config['buffer_size'], + align_to=buffer_config['nixl_page_size'] + ) + # Wait for receiver to be ready receiver_ready_event.wait() # Create sender and perform handshake sender = NixlCPUSender( buffer_size=buffer_config['buffer_size'], - buffer_ptr=buffer_config['buffer_ptr'], + buffer_ptr=allocator.get_buffer_ptr(), nixl_page_size=buffer_config['nixl_page_size'] ) @@ -124,13 +130,19 
@@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: rank + # Create ring buffer allocator + allocator = utils.RingBufferAllocator( + size=buffer_config['buffer_size'], + align_to=buffer_config['nixl_page_size'] + ) + # Wait for receiver to be ready receiver_ready_event.wait() # Create sender sender = NixlCPUSender( buffer_size=buffer_config['buffer_size'], - buffer_ptr=buffer_config['buffer_ptr'], + buffer_ptr=allocator.get_buffer_ptr(), nixl_page_size=buffer_config['nixl_page_size'] ) @@ -153,14 +165,14 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read ) # Prepare send and wait for completion - sender.prepare_send(source_spec, dest_spec) + uid = sender.prepare_send(source_spec, dest_spec) max_retries = 100 retry_count = 0 remote_agent = None while retry_count < max_retries: - remote_agent = sender.check_and_remove_prepared_send(source_spec, dest_spec) + remote_agent = sender.check_and_remove_prepared_send(uid) if remote_agent is not None: break time.sleep(0.1) @@ -177,6 +189,11 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read class TestNixlCPUUtils: """Test cases for NixlCPUSender and NixlCPUReceiver.""" + @classmethod + def setup_class(cls): + """Set up the test class.""" + pass + @pytest.fixture def buffer_config(self): """Common buffer configuration for tests.""" @@ -255,6 +272,9 @@ def test_nixl_handshake_multiprocess(self, buffer_config): test_base_port = 50051 test_rank = 0 + old_start_method = mp.get_start_method(allow_none=True) + mp.set_start_method("spawn", force=True) + # Create events for process synchronization receiver_ready = mp.Event() stop_receiver = mp.Event() @@ -291,6 +311,8 @@ def test_nixl_handshake_multiprocess(self, buffer_config): if sender_process.is_alive(): sender_process.terminate() + mp.set_start_method(old_start_method, force=True) + def test_nixl_protocol_communication(self, buffer_config): """Test the full protocol communication between sender and receiver.""" # Setup test parameters @@ -298,6 +320,10 @@ def test_nixl_protocol_communication(self, buffer_config): test_base_port = 50052 test_rank = 0 + # Set multiprocessing start method + old_start_method = mp.get_start_method(allow_none=True) + mp.set_start_method("spawn", force=True) + # Create events for process synchronization receiver_ready = mp.Event() stop_receiver = mp.Event() @@ -338,4 +364,6 @@ def test_nixl_protocol_communication(self, buffer_config): receiver_process.terminate() if sender_process.is_alive(): sender_process.terminate() + + mp.set_start_method(old_start_method, force=True) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index dc26f2f554ea..ec69e026aeaa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -310,7 +310,8 @@ class NixlProtocolMsg(msgspec.Struct): def make_send_req_msg( - source_spec: SourceSpec + source_spec: SourceSpec, + req_uuid: str ) -> bytes: """Make the send request message. 
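The point of threading req_uuid through make_send_req_msg is that the caller (prepare_send) now owns the identifier, so the READYMSG coming back from the receiver can be matched against the uuid the sender handed out. A minimal round-trip sketch, assuming a SourceSpec built as in the tests above and the NixlProtocolMsg helpers defined in this file:

    import uuid
    import msgspec

    # Sender side: mint the uuid once and reuse it for the whole exchange.
    req_uuid = str(uuid.uuid4())
    req_bytes = make_send_req_msg(source_spec, req_uuid)       # "REQMSG"

    # Receiver side: decode, allocate space, and answer with the same uuid
    # (0 stands in here for the allocated receiver buffer address).
    decoder = msgspec.msgpack.Decoder(NixlProtocolMsg)
    req = decoder.decode(req_bytes)
    assert req.msg_type == "REQMSG" and req.req_uuid == req_uuid
    ready_bytes = make_receive_ready_msg(req.req_uuid, 0)      # "READYMSG"

    # Sender side: the uuid carried by the READYMSG is the key used to mark
    # the prepared send as ready.
    ready = decoder.decode(ready_bytes)
    assert ready.req_uuid == req_uuid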
@@ -322,7 +323,6 @@ def make_send_req_msg( """ # Create the request message msg_type = "REQMSG" - req_uuid = str(uuid.uuid4()) receiver_addr = None send_req_msg = NixlProtocolMsg( msg_type=msg_type, @@ -433,15 +433,15 @@ def send( def prepare_send( self, - destination_spec: DestinationSpec, source_spec: SourceSpec, + destination_spec: DestinationSpec, ) -> str: """Prepare the send operation by allocation the receive buffer on the destination side. Args: - destination_spec (DestinationSpec): The destination spec. source_spec (SourceSpec): The source spec. + destination_spec (DestinationSpec): The destination spec. Returns: str: The uuid of the prepared send @@ -454,11 +454,14 @@ def prepare_send( remote_agent_name = self._remote_agents[dest_id] # Create the request message - msg = make_send_req_msg(source_spec) + req_uuid = str(uuid.uuid4()) + msg = make_send_req_msg(source_spec, req_uuid) # Send it to the remote agent self._nixl_wrapper.send_notif(remote_agent_name, msg) + return req_uuid + def check_and_remove_prepared_send( self, send_uuid: str, @@ -479,11 +482,11 @@ def check_and_remove_prepared_send( for msg in notifs[remote_agent_name]: # Decode the message obj = self._msg_decoder.decode(msg) - if msg.msg_type == "READYMSG": + if obj.msg_type == "READYMSG": # Add the request to the ready requests self._ready_requests[obj.req_uuid] = remote_agent_name else: - logger.error("Unexpected message type: %s", msg.msg_type) + logger.error("Unexpected message type: %s", obj.msg_type) continue # Check if the send uuid is in the ready requests @@ -509,10 +512,9 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: local_meta = self._nixl_wrapper.get_agent_metadata() with zmq_ctx(zmq.REQ, path) as sock: # Send query for metadata - logger.info("Sending handshake request to %s", destination_spec) + logger.debug("Sending handshake request to %s", destination_spec) sock.send(local_meta) - logger.info("Waiting for handshake response from %s", destination_spec) metadata_bytes = sock.recv() # Get remote agent name and register it @@ -522,7 +524,7 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: # Store remote agent info self._remote_agents[destination_spec.get_id()] = remote_agent_name - logger.info("Successfully completed handshake with %s", + logger.debug("Successfully completed handshake with %s", destination_spec) @@ -556,24 +558,28 @@ def __init__( self._remote_agents: dict[str, str] = {} self._nixl_wrapper, self._reg_dlist, self._local_xfer_dlist = \ - init_nixl_agent(buffer_size, buffer_ptr, nixl_page_size) + init_nixl_agent(self._buffer_size, self._buffer_ptr, + nixl_page_size) # Add handshake listener thread self._handshake_listener_t: Optional[threading.Thread] = None self._stop_listener = threading.Event() + # Msg decoder + self._msg_decoder = msgspec.msgpack.Decoder(NixlProtocolMsg) + def _process_msgs(self): """Process the received messages from the NIXL agent.""" notifs = self._nixl_wrapper.get_new_notifs() for remote_agent_name in notifs: for msg in notifs[remote_agent_name]: - # Decode the message + # Decode the messag obj = self._msg_decoder.decode(msg) - if msg.msg_type == "REQMSG": + if obj.msg_type == "REQMSG": # Add the request to the pending allocation self._pending_allocation[obj.req_uuid] = (obj.source_spec, remote_agent_name) - elif msg.msg_type == "FINISHMSG": + elif obj.msg_type == "FINISHMSG": # Add the request to the finished requests if obj.req_uuid in self._inflight_requests: source_spec = self._inflight_requests.pop(obj.req_uuid) 
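A detail worth calling out in the surrounding hunks: `msg` is the raw notification payload (bytes) while `obj` is the decoded NixlProtocolMsg, so the old `msg.msg_type` checks would raise AttributeError on the bytes object; the branches now dispatch on `obj.msg_type`. The intended shape, as a small sketch (the handler names are placeholders, not from this patch):

    def dispatch(decoder: msgspec.msgpack.Decoder, raw_msg: bytes) -> None:
        obj = decoder.decode(raw_msg)          # a NixlProtocolMsg instance
        if obj.msg_type == "REQMSG":
            handle_request(obj.req_uuid, obj.source_spec)    # placeholder
        elif obj.msg_type == "FINISHMSG":
            handle_finished(obj.req_uuid)                    # placeholder
        else:
            logger.error("Unexpected message type: %s", obj.msg_type)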
@@ -583,7 +589,7 @@ def _process_msgs(self): logger.error("Request %s not found in inflight requests", obj.req_uuid) else: - logger.error("Unexpected message type: %s", msg.msg_type) + logger.error("Unexpected message type: %s", obj.msg_type) continue def _process_allocation_requests(self): @@ -668,31 +674,29 @@ def _nixl_handshake_listener( # Setup ZMQ socket port = base_port + get_tensor_model_parallel_rank() path = make_zmq_path("tcp", host, port) - logger.info("Starting handshake listener on path: %s", path) + logger.debug("Starting handshake listener on path: %s", path) with zmq_ctx(zmq.ROUTER, path) as sock: ready_event.set() - logger.info("Handshake listener is ready") while not self._stop_listener.is_set(): - logger.info("Waiting for handshake request") try: identity, _, msg = sock.recv_multipart(flags=zmq.NOBLOCK) remote_agent_name = self._nixl_wrapper.add_remote_agent( msg) self._remote_agents[identity] = remote_agent_name - logger.info("Successfully received handshake from %s", + logger.debug("Successfully received handshake from %s", identity) # Send back the local metadata to the sender sock.send_multipart([identity, b"", local_meta]) - logger.info("Sent local metadata back to %s", identity) + logger.debug("Sent local metadata back to %s", identity) except zmq.error.Again: # No message available time.sleep(0.1) except Exception as e: logger.error("Error in handshake listener: %s", e) break - logger.info("Stopping handshake listener") + logger.debug("Stopping handshake listener") def stop_handshake_listener(self) -> None: """Stop the handshake listener thread.""" From 81e31b895f8ce4f2bc187e2741f9ac78597f0790 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 20 May 2025 06:09:08 +0000 Subject: [PATCH 07/28] [Add] correct nixl data plane functionality Signed-off-by: ApostaC --- .../cpu_kv_integration/test_nixl_cpu_utils.py | 47 ++++- .../kv_connector/v1/nixl_cpu_utils.py | 190 ++++++++++++++---- 2 files changed, 194 insertions(+), 43 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py index 3188cec38e2c..79031277dda4 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -82,11 +82,13 @@ def run_sender(buffer_config, host, base_port, rank, receiver_ready_event): # Verify handshake results assert dest_spec.get_id() in sender._remote_agents assert sender._remote_agents[dest_spec.get_id()] is not None + peer_name = sender._remote_agents[dest_spec.get_id()] + assert sender._remote_xfer_handlers[peer_name] is not None return True except Exception as e: print(f"Sender process error: {e}") - return False + raise def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event, stop_event, progress_interval=0.001): """Process function for running the receiver with progress loop.""" @@ -100,6 +102,7 @@ def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event size=buffer_config['buffer_size'], align_to=buffer_config['nixl_page_size'] ) + allocator._buffer.fill_(0) # Create and start receiver receiver = NixlCPUReceiver( @@ -112,6 +115,26 @@ def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event ready_event.set() # Run progress loop until stop signal + while not receiver.get_finished(): + receiver.progress() + time.sleep(progress_interval) + + finished = receiver.get_finished(clear = True) + assert len(finished) == 1 + source_spec, 
vaddr = finished[0] + paddr = allocator.virtual_to_physical(vaddr) + + # Check if the numbers are all correct (should be uint8 all 1) + num_elements = source_spec.get_size() + should_1 = allocator._buffer[paddr : paddr + num_elements] + should_0_a = allocator._buffer[:paddr] + should_0_b = allocator._buffer[paddr + num_elements:] + assert (should_1 == 1).all(), "Buffer data mismatch" + if len(should_0_a) > 0: + assert (should_0_a == 0).all(), "Buffer data mismatch" + if len(should_0_b) > 0: + assert (should_0_b == 0).all(), "Buffer data mismatch" + while not stop_event.is_set(): receiver.progress() time.sleep(progress_interval) @@ -172,12 +195,30 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read remote_agent = None while retry_count < max_retries: - remote_agent = sender.check_and_remove_prepared_send(uid) + remote_agent, receiver_paddr = \ + sender.check_and_remove_prepared_send(uid) if remote_agent is not None: break time.sleep(0.1) retry_count += 1 - + + assert remote_agent is not None, "Failed to get remote agent" + assert receiver_paddr != -1, "Failed to get receiver virtual address" + + # Test the real send + vaddr, buffer = allocator.allocate(source_spec.get_size()) + paddr = allocator.virtual_to_physical(vaddr) + + buffer.fill_(1) # Fill with dummy data + + handle = sender.send( + paddr, receiver_paddr, source_spec.get_size(), + uid, dest_spec) + + while not sender.is_send_finished(handle): + time.sleep(0.1) + print("Send completed successfully") + if remote_agent is not None: success_event.set() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index ec69e026aeaa..50c8ac447958 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -35,10 +35,12 @@ try: from nixl._api import nixl_agent as NixlWrapper + from nixl._api import nixl_xfer_handle logger.info("NIXL is available") except ImportError: logger.warning("NIXL is not available") NixlWrapper = None + nixl_xfer_handle = int ################################################################### # Helper classes and functions @@ -48,7 +50,7 @@ def init_nixl_agent( buffer_size: int, buffer_ptr: int, nixl_page_size: int = 4096, -) -> tuple[NixlWrapper, Any, Any]: +) -> tuple[NixlWrapper, Any, Any, Any]: """Initialize the NIXL agent. Args: @@ -60,6 +62,7 @@ def init_nixl_agent( NixlWrapper: The NIXL agent. reg_dlist: the registered memory descriptor list. xfer_dlist: the local transfer descriptor list. + prepped_xfer_handler: the prepped transfer handler. """ if NixlWrapper is None: raise RuntimeError("NIXL is not available") @@ -79,11 +82,11 @@ def init_nixl_agent( nixl_page_size): xfer_desc.append((base_addr, nixl_page_size, 0)) - descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type="DRAM") - local_xfer_dlist = nixl_agent.prep_xfer_dlist( - "", descs, mem_type="DRAM") + xfer_descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type="DRAM") + xfer_handler = nixl_agent.prep_xfer_dlist( + "", xfer_descs, mem_type="DRAM") - return nixl_agent, reg_descs, local_xfer_dlist + return nixl_agent, reg_descs, xfer_descs, xfer_handler @dataclass class DestinationSpec: @@ -279,7 +282,7 @@ def virtual_to_physical(self, vaddr: int) -> torch.Tensor: Returns: torch.Tensor: The physical address of the buffer. """ - return vaddr + self._size + return vaddr % self._size def get_size(self) -> int: """Get the size of the ring buffer. 
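The corrected virtual_to_physical reflects the ring-buffer design: allocate() hands out monotonically increasing virtual addresses while the backing pinned buffer wraps, so the physical offset is simply the virtual address modulo the buffer size. (The annotation still says the method returns a torch.Tensor, but the value returned and used throughout is an int offset.) A small illustration, assuming a RingBufferAllocator constructed the same way as in the tests; the later virtual address is hypothetical:

    allocator = RingBufferAllocator(size=1 << 20, align_to=4096)

    # Virtual addresses keep growing as the ring wraps; the physical offset
    # into the pinned buffer is the value modulo the buffer size.
    later_vaddr = (1 << 20) + 4096              # hypothetical virtual address
    assert allocator.virtual_to_physical(later_vaddr) == 4096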
@@ -305,7 +308,7 @@ class NixlProtocolMsg(msgspec.Struct): msg_type: str req_uuid: str source_spec: Optional[SourceSpec] = None - receiver_addr: Optional[int] = None + receiver_paddr: Optional[int] = None @@ -323,12 +326,12 @@ def make_send_req_msg( """ # Create the request message msg_type = "REQMSG" - receiver_addr = None + receiver_paddr = None send_req_msg = NixlProtocolMsg( msg_type=msg_type, req_uuid=req_uuid, source_spec=source_spec, - receiver_addr=receiver_addr + receiver_paddr=receiver_paddr ) # Encode the message send_req_msg_bytes = msgspec.msgpack.encode(send_req_msg) @@ -336,13 +339,13 @@ def make_send_req_msg( def make_receive_ready_msg( req_uuid: str, - receiver_addr: int, + receiver_paddr: int, ) -> bytes: """Make the receive ready message. Args: req_uuid (str): The request uuid. - receiver_addr (int): The receiver address. + receiver_paddr (int): The receiver's physical address. Returns: bytes: The receive ready message. @@ -354,7 +357,7 @@ def make_receive_ready_msg( msg_type=msg_type, req_uuid=req_uuid, source_spec=source_spec, - receiver_addr=receiver_addr + receiver_paddr=receiver_paddr ) # Encode the message receive_ready_msg_bytes = msgspec.msgpack.encode(receive_ready_msg) @@ -374,12 +377,12 @@ def make_send_finish_msg( # Create the request message msg_type = "FINISHMSG" source_spec = None - receiver_addr = None + receiver_paddr = None send_finish_msg = NixlProtocolMsg( msg_type=msg_type, req_uuid=req_uuid, source_spec=source_spec, - receiver_addr=receiver_addr + receiver_paddr=receiver_paddr ) # Encode the message send_finish_msg_bytes = msgspec.msgpack.encode(send_finish_msg) @@ -400,15 +403,21 @@ def __init__( # Destination spec id -> peer name self._remote_agents: dict[str, str] = {} - self._nixl_wrapper, self._reg_dlist, self._local_xfer_dlist = \ + self._nixl_wrapper, \ + self._reg_dlist, \ + self._local_xfer_dlist, \ + self._local_xfer_handlers = \ init_nixl_agent(buffer_size, buffer_ptr, nixl_page_size) + # Remote xfer dlists, peer name -> prepped xfer handlers + self._remote_xfer_handlers: dict[str, Any] = {} + # Add ZMQ context for handshakes self._zmq_ctx = zmq.Context() # Requests that are ready to send - # uuid -> remote agent name - self._ready_requests: dict[str, str] = {} + # uuid -> (remote agent name, receiver paddr) + self._ready_requests: dict[str, tuple[str, int]] = {} # NOTE(ApostaC): we don't track the requests that are waiting for the # receiver to be ready, and may want to add this in the future @@ -416,20 +425,83 @@ def __init__( # Msg decoder self._msg_decoder = msgspec.msgpack.Decoder(NixlProtocolMsg) + + def _get_desc_idxs(self, paddr: int, size: int) -> list[int]: + """Get the sender descriptor indexes for the given physical address + and size. + + Args: + paddr (int): The physical address. + size (int): The size of the data. + + Returns: + list[int]: The list of sender descriptor indexes. + """ + # Get the sender descriptor indexes + assert paddr % self._nixl_page_size == 0, \ + "Physical address is not aligned to the page size" + start_idx = paddr // self._nixl_page_size + end_idx = (paddr + size) // self._nixl_page_size + return [i for i in range(start_idx, end_idx)] + def send( self, - src_addr: int, - dst_addr: int, - data_size: int - ) -> None: + src_paddr: int, + dst_paddr: int, + data_size: int, + req_uuid: int, + destination_spec: DestinationSpec, + ) -> nixl_xfer_handle: """Send data from src_addr to dst_addr using NIXL. Args: - src_addr (int): Source address. - dst_addr (int): Destination address. 
+ src_paddr (int): Source physical address. + dst_paddr (int): Destination physical address. data_size (int): Size of the data in bytes to be sent. + req_uuid (int): The request uuid. + destination_spec (DestinationSpec): The destination spec. + + Returns: + nixl_xfer_handle: The handle of the transfer. + """ + # Get the sender descriptor indexes + desc_idxs = self._get_desc_idxs(src_paddr, data_size) + # Get the receiver descriptor indexes + r_desc_idxs = self._get_desc_idxs(dst_paddr, data_size) + # Get the remote agent name + remote_agent_name = self._remote_agents[destination_spec.get_id()] + # Get the remote xfer dlist + remote_xfer_handlers = self._remote_xfer_handlers[remote_agent_name] + # Notif msg + notif_msg = make_send_finish_msg(req_uuid) + # Transfer + handle = self._nixl_wrapper.make_prepped_xfer( + "WRITE", + self._local_xfer_handlers, + desc_idxs, + remote_xfer_handlers, + r_desc_idxs, + notif_msg + ) + + self._nixl_wrapper.transfer(handle) + + return handle + + def is_send_finished(self, handle: "nixl_xfer_handle") -> bool: + """Check if the send operation is finished. + + Args: + handle (nixl_xfer_handle): The handle of the transfer. + + Returns: + bool: True if the send operation is finished, False otherwise. """ - pass + status = self._nixl_wrapper.check_xfer_state(handle) + if status == "ERR": + logger.error("Error in send operation") + return False + return status == "DONE" def prepare_send( self, @@ -465,7 +537,7 @@ def prepare_send( def check_and_remove_prepared_send( self, send_uuid: str, - ) -> Optional[str]: + ) -> tuple[Optional[str], int]: """Check if the prepared send is ready to be sent. If the send is ready, remove it from the ready requests. @@ -475,6 +547,8 @@ def check_and_remove_prepared_send( Returns: Optional[str]: The remote agent name if the send is ready, None otherwise. + int: The virtual address of the receiver if the send is ready, + -1 otherwise. """ # Update the ready requests notifs = self._nixl_wrapper.get_new_notifs() @@ -482,9 +556,13 @@ def check_and_remove_prepared_send( for msg in notifs[remote_agent_name]: # Decode the message obj = self._msg_decoder.decode(msg) + if obj.msg_type == "READYMSG": # Add the request to the ready requests - self._ready_requests[obj.req_uuid] = remote_agent_name + assert obj.receiver_paddr is not None, \ + "Receiver address is None in READYMSG" + self._ready_requests[obj.req_uuid] = (remote_agent_name, + obj.receiver_paddr) else: logger.error("Unexpected message type: %s", obj.msg_type) continue @@ -492,10 +570,10 @@ def check_and_remove_prepared_send( # Check if the send uuid is in the ready requests if send_uuid in self._ready_requests: # Remove the request from the ready requests - remote_agent_name = self._ready_requests.pop(send_uuid) - return remote_agent_name + remote_agent_name, vaddr = self._ready_requests.pop(send_uuid) + return remote_agent_name, vaddr else: - return None + return None, -1 def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: """Perform handshake with a remote NIXL CPU instance. 
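Two pieces make the data plane work: because the registered region was carved into fixed nixl_page_size descriptors, a page-aligned (physical address, size) range maps onto descriptor indexes by integer division, and send() turns those index lists into a prepped WRITE transfer whose completion the caller polls through is_send_finished(). A hedged caller-side sketch (sender, the addresses, req_uuid, and dest_spec are assumed to come from surrounding code such as the tests above):

    import time

    page = 4096
    paddr, size = 8192, 16384                   # example, page-aligned values
    idxs = list(range(paddr // page, (paddr + size) // page))
    assert idxs == [2, 3, 4, 5]                 # four 4 KiB descriptors

    handle = sender.send(src_paddr, dst_paddr, size, req_uuid, dest_spec)
    while not sender.is_send_finished(handle):  # polls NIXL's transfer state
        time.sleep(0.001)                       # arbitrary polling interval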
@@ -523,6 +601,18 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: # Store remote agent info self._remote_agents[destination_spec.get_id()] = remote_agent_name + + sock.send(b"get_xfer_descs") + # Receive the remote xfer descs + s_remote_xfer_descs = sock.recv() + remote_xfer_dlist = self._nixl_wrapper.deserialize_descs( + s_remote_xfer_descs) + + + remote_xfer_handlers = self._nixl_wrapper.prep_xfer_dlist( + remote_agent_name, remote_xfer_dlist, mem_type="DRAM") + + self._remote_xfer_handlers[remote_agent_name] = remote_xfer_handlers logger.debug("Successfully completed handshake with %s", destination_spec) @@ -557,7 +647,10 @@ def __init__( # source zmq id -> peer name self._remote_agents: dict[str, str] = {} - self._nixl_wrapper, self._reg_dlist, self._local_xfer_dlist = \ + self._nixl_wrapper, \ + self._reg_dlist, \ + self._local_xfer_dlist, \ + self._local_xfer_handlers = \ init_nixl_agent(self._buffer_size, self._buffer_ptr, nixl_page_size) @@ -629,14 +722,21 @@ def progress(self) -> None: self._process_msgs() self._process_allocation_requests() - def get_finished(self) -> list[tuple[SourceSpec, int]]: + def get_finished(self, clear = False) -> list[tuple[SourceSpec, int]]: """Get the requests that finishes receiving. + Args: + clear (bool): Whether to clear the finished requests or not. + Returns: list[tuple[SourceSpec, int]]: A list of tuples containing the source spec and the address. """ - pass + ret = [(source_spec, vaddr) for source_spec, vaddr in + self._finished_requests.values()] + if clear: + self._finished_requests.clear() + return ret def start_handshake_listener(self, host: str, base_port: int) -> None: """Start the background thread that listens for handshake requests. @@ -682,14 +782,24 @@ def _nixl_handshake_listener( while not self._stop_listener.is_set(): try: identity, _, msg = sock.recv_multipart(flags=zmq.NOBLOCK) - remote_agent_name = self._nixl_wrapper.add_remote_agent( - msg) - self._remote_agents[identity] = remote_agent_name - logger.debug("Successfully received handshake from %s", - identity) - # Send back the local metadata to the sender - sock.send_multipart([identity, b"", local_meta]) - logger.debug("Sent local metadata back to %s", identity) + + if msg == b"get_xfer_descs": + # Send back the local xfer descs + s_local_xfer_descs = self._nixl_wrapper.get_serialized_descs( + self._local_xfer_dlist) + sock.send_multipart([identity, b"", s_local_xfer_descs]) + logger.debug("Sent back the local xfer descs to %s", identity) + else: + # Send the agent metadata + remote_agent_name = self._nixl_wrapper.add_remote_agent( + msg) + self._remote_agents[identity] = remote_agent_name + logger.debug("Successfully received handshake from %s", + identity) + # Send back the local metadata to the sender + sock.send_multipart([identity, b"", local_meta]) + logger.debug("Sent local metadata back to %s", identity) + except zmq.error.Again: # No message available time.sleep(0.1) From a6ffb260777bbc32d76f37258485dd3435255c39 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 21 May 2025 01:15:55 +0000 Subject: [PATCH 08/28] [Add] sender to receiver data plane finished Signed-off-by: ApostaC --- .../test_ring_buffer_allocator.py | 2 +- .../cpu_kv_integration/toy_receiver.py | 76 ++++++ .../kv_connector/v1/cpu_connector.py | 21 +- .../kv_connector/v1/cpu_connector_utils.py | 173 ++++++------- .../kv_connector/v1/nixl_cpu_utils.py | 242 +++++++++++++----- 5 files changed, 339 insertions(+), 175 deletions(-) create mode 100644 
tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py index a183f76308ce..ead0aae2f921 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py @@ -2,7 +2,7 @@ import pytest import torch -from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import RingBufferAllocator +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import RingBufferAllocator def test_basic_allocation(): """Test basic allocation and deallocation behavior.""" diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py new file mode 100644 index 000000000000..e6d66f2cc093 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch.multiprocessing as mp +import time + +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlCPUReceiver, RingBufferAllocator +) + +def main(): + """Main function to run the receiver.""" + # Setup test parameters + test_host = "127.0.0.1" + test_base_port = 54321 + test_rank = 0 + + # Buffer configuration + buffer_size = 1 << 30 # 1GB + nixl_page_size = 4096 # Standard page size + + try: + # Mock tensor_model_parallel_rank for this process + import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils + utils.get_tensor_model_parallel_rank = lambda: test_rank + + # Create ring buffer allocator + allocator = RingBufferAllocator( + size=buffer_size, + align_to=nixl_page_size + ) + allocator._buffer.fill_(0) + + # Create and start receiver + receiver = NixlCPUReceiver( + allocator=allocator, + nixl_page_size=nixl_page_size + ) + receiver.start_handshake_listener(test_host, test_base_port) + + print(f"Receiver started on {test_host}:{test_base_port}") + + # Run progress loop until interrupted + try: + while True: + receiver.progress() + + # Check for finished requests + finished = receiver.get_finished(clear=True) + if finished: + for source_spec, vaddr in finished: + print(f"Received data from request {source_spec.request_id}") + paddr = allocator.virtual_to_physical(vaddr) + + # Verify received data + num_elements = source_spec.get_size() + received_data = allocator._buffer[paddr : paddr + num_elements] + print(f"Received {num_elements} elements") + print(f"First few values: {received_data[:10]}") + + time.sleep(0.001) # Small sleep to prevent busy waiting + + except KeyboardInterrupt: + print("\nShutting down receiver...") + + # Cleanup + receiver.stop_handshake_listener() + print("Receiver stopped") + + except Exception as e: + print(f"Receiver error: {e}") + raise + +if __name__ == "__main__": + # Set multiprocessing start method + mp.set_start_method("spawn", force=True) + main() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index c9a20a292f11..3679e86940fb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -19,7 +19,10 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( - 
CPUSendTask, CPUKVSender, SourceSpec, DestinationSpec) + SourceSpec, DestinationSpec) +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlSendTask, NixlKVSender) + from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) @@ -271,7 +274,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: pass elif role == KVConnectorRole.WORKER: # Prefiller side sender - self._cpu_kv_sender = CPUKVSender(1024 * 1024 * 1024) # 1GB for debug + self._kv_sender = NixlKVSender(1024 * 1024 * 1024) # 1GB for debug # request_id -> prefill request trackers self._prefill_reqs: dict[str, PrefillRequestTracker] = {} @@ -286,7 +289,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: self._cuda_stream = torch.cuda.Stream() # prefill offload tasks - self._inflight_copy_tasks: list[CPUSendTask] = [] + self._inflight_copy_tasks: list[NixlSendTask] = [] ############################################################ @@ -452,7 +455,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, assert isinstance(meta, CPUConnectorMetadata), \ "Connector metadata is not of type CPUConnectorMetadata" - assert self._cpu_kv_sender is not None + assert self._kv_sender is not None for prefill_req in meta.prefill_meta: # Create a source spec with serializable types @@ -474,12 +477,12 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, ) # Create the send task - task = self._cpu_kv_sender.create_send_task( + task = self._kv_sender.create_send_task( source_spec=source_spec, destination_spec=dest_spec, ) - assert isinstance(task, CPUSendTask), \ - "Send task is not of type CPUSendTask" + assert isinstance(task, NixlSendTask), \ + "Send task is not of type NixlSendTask" # Start copying the data to the CPU buffer buffer = task.tensor @@ -499,7 +502,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, self._inflight_copy_tasks.append(task) # Check the task states and send the tasks - self._cpu_kv_sender.progress() + self._kv_sender.progress() @_lmcache_nvtx_annotate @@ -516,6 +519,8 @@ def wait_for_save(self): task.cuda_event.synchronize() self._inflight_copy_tasks.clear() + self._kv_sender.wait_for_all_tasks() + def get_finished( self, finished_req_ids: set[str] ) -> tuple[Optional[set[str]], Optional[set[str]]]: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index d9febea1eaeb..4b11b32c2295 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -19,8 +19,6 @@ from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) -from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - DestinationSpec, SourceSpec, RingBufferAllocator) from vllm.logger import init_logger from vllm.utils import make_zmq_path, make_zmq_socket, round_down, cdiv from vllm.v1.core.sched.output import SchedulerOutput @@ -39,6 +37,74 @@ logger = init_logger(__name__) +@dataclass +class DestinationSpec: + """DestinationSpec is used to specify the destination of kv sending task. + + Attributes: + rank (int): The rank of the destination. + host (str): The path of the destination. + base_port (int): The base port of the destination. 
+ """ + rank: int + host: str + base_port: int + + def __str__(self) -> str: + return f"DestinationSpec(rank={self.rank}, host={self.host}, base_port={self.base_port})" + + def get_id(self) -> str: + """Get the id of the destination spec. + + Returns: + str: The id of the destination spec. + """ + return f"{self.rank}_{self.host}_{self.base_port}" + +class SourceSpec(msgspec.Struct): + """SourceSpec is used to specify the source of kv sending task. + """ + # The request id of the kv cache + request_id: str + + # The layer id of the kv cache + layer_id: int + + # The range of tokens to be offloaded + start: int # For token_range slice + stop: int # For token_range slice + + # The shape of the offloaded KV cache tensor as a tuple + shape: tuple[int, ...] + + # The dtype of the offloaded KV cache tensor as a string + dtype_str: str + + @property + def token_range(self) -> slice: + """Get the token range as a slice object.""" + return slice(self.start, self.stop) + + @property + def tensor_shape(self) -> torch.Size: + """Get the shape as a torch.Size object.""" + return torch.Size(self.shape) + + @property + def dtype(self) -> torch.dtype: + """Get the dtype as a torch.dtype object.""" + return getattr(torch, self.dtype_str) + + def get_size(self) -> int: + """Get the size in bytes of the cooresponding kv cache.""" + return math.prod(self.shape) * self.dtype.itemsize + + def __str__(self) -> str: + return (f"SourceSpec(request_id={self.request_id}, " + f"layer_id={self.layer_id}, " + f"token_range={self.token_range}, shape={self.tensor_shape})") + + @dataclass class SendTaskState: @@ -127,6 +193,11 @@ def is_done(self) -> bool: """ return self.state.is_done() + def mark_sending(self) -> None: + """Mark the send task as sending. + """ + self.state.is_sending = True + class KVSenderInterface(ABC): """KVSenderInterface is an interface for sending KV cache data. """ @@ -166,7 +237,7 @@ def progress(self) -> None: should_add = True if task.is_ready() and not task.is_sending(): - self._send(task) + self.send_task(task) if task.is_done(): self.free_task(task) @@ -203,6 +274,7 @@ def create_send_task( @abstractmethod def free_task(self, task: SendTask) -> None: """Free the send task. + Will be called in the pre-implemented progress() method. Args: task (SendTask): The send task to be freed. @@ -212,6 +284,7 @@ def free_task(self, task: SendTask) -> None: @abstractmethod def send_task(self, task: SendTask) -> None: """Send the send task after it is ready. + Will be called in the pre-implemented progress() method. Args: task (SendTask): The send task to be sent. @@ -237,97 +310,3 @@ def post_progress_hook(self, task: SendTask) -> None: raise NotImplementedError("post_progress_hook() not implemented") - -@dataclass -class CPUSendTask(SendTask): - """CPUSendTask is a send task that uses CPU memory for the buffer. - """ - buffer_addr: int - cuda_event: Optional[torch.cuda.Event] = None - - def __post_init__(self) -> None: - self.creation_time = time.time() - - @_lmcache_nvtx_annotate - def update_states(self) -> None: - """Update the states of the send task. - """ - # Check the cuda event - if not self.state.sender_ready and self.cuda_event is not None \ - and self.cuda_event.query(): - self.state.sender_ready = True - -class CPUKVSender(KVSenderInterface): - """CPUKVSender is an implementation of KVSenderInterface that provides a - ring buffer allocator for managing pin memory allocation and deallocation. 
- """ - - def __init__(self, buffer_size: int) -> None: - super().__init__() - self._buffer_size = buffer_size - self._allocator = RingBufferAllocator(self._buffer_size) - - def create_send_task( - self, - source_spec: SourceSpec, - destination_spec: DestinationSpec, - ) -> SendTask: - """Create a non-ready send task with a CPU buffer allocated. - - Args: - source_spec (SourceSpec): The source specification of the send - task. - destination_spec (DestinationSpec): The destination - specification of the send task. - """ - # Allocate a buffer for the send task - size = source_spec.get_size() - address, buffer = self._allocator.allocate(size) - while address == -1: - # If allocation fails, wait for a while to process - # and try again - time.sleep(0.001) - self.progress() - address, buffer = self._allocator.allocate(size) - assert buffer is not None, "Buffer allocation failed" - - # Create a send task with the allocated buffer - task = CPUSendTask( - buffer=buffer, - source_spec=source_spec, - destination_spec=destination_spec, - state=SendTaskState(), - buffer_addr=address, - ) - self.add_send_task(task) - return task - - def free_task(self, task: SendTask) -> None: - """Free the send task. - - Args: - task (SendTask): The send task to be freed. - """ - # Free the buffer in the ring buffer allocator - self._allocator.free(task.buffer_addr) - - def send_task(self, task: SendTask) -> None: - """Send the send task after it is ready. - - Args: - task (SendTask): The send task to be sent. - """ - # DEBUG IMPLEMENTATION - logger.error("CPUKVSender.send_task() not implemented, running a debug implementation!") - task.dbg_mark_sending() - - def pre_progress_hook(self) -> None: - for task in self.get_send_tasks(): - task.update_states() - - def post_progress_hook(self) -> None: - pass - - def _send(self, task: SendTask) -> None: - # NO IMPLEMENTATION YET - pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 50c8ac447958..5ce9aaf3865c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -13,10 +13,15 @@ import torch import zmq +from lmcache.utils import _lmcache_nvtx_annotate + from vllm import envs from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( + SendTask, KVSenderInterface, SourceSpec, DestinationSpec, + SendTaskState) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) @@ -88,73 +93,6 @@ def init_nixl_agent( return nixl_agent, reg_descs, xfer_descs, xfer_handler -@dataclass -class DestinationSpec: - """DestinationSpec is used to specify the destination of kv sending task. - - Attributes: - rank (int): The rank of the destination. - host (str): The path of the destination. - base_port (int): The base port of the destination. - """ - rank: int - host: str - base_port: int - - def __str__(self) -> str: - return f"DestinationSpec(rank={self.rank}, host={self.host}, base_port={self.base_port})" - - def get_id(self) -> str: - """Get the id of the destination spec. - - Returns: - str: The id of the destination spec. 
- """ - return f"{self.rank}_{self.host}_{self.base_port}" - -class SourceSpec(msgspec.Struct): - """SourceSpec is used to specify the source of kv sending task. - """ - # The request id of the kv cache - request_id: str - - # The layer id of the kv cache - layer_id: int - - # The range of tokens to be offloaded - start: int # For token_range slice - stop: int # For token_range slice - - # The shape of the offloaded KV cache tensor as a tuple - shape: tuple[int, ...] - - # The dtype of the offloaded KV cache tensor as a string - dtype_str: str - - @property - def token_range(self) -> slice: - """Get the token range as a slice object.""" - return slice(self.start, self.stop) - - @property - def tensor_shape(self) -> torch.Size: - """Get the shape as a torch.Size object.""" - return torch.Size(self.shape) - - @property - def dtype(self) -> torch.dtype: - """Get the dtype as a torch.dtype object.""" - return getattr(torch, self.dtype_str) - - def get_size(self) -> int: - """Get the size in bytes of the cooresponding kv cache.""" - return math.prod(self.shape) * self.dtype.itemsize - - def __str__(self) -> str: - return (f"SourceSpec(request_id={self.request_id}, " - f"layer_id={self.layer_id}, " - f"token_range={self.token_range}, shape={self.tensor_shape})") - class RingBufferAllocator: """RingBufferAllocator is a simple ring buffer allocator for managing memory allocation and deallocation. @@ -485,6 +423,7 @@ def send( ) self._nixl_wrapper.transfer(handle) + logger.info("Start trasnfer of the request %s", req_uuid) return handle @@ -547,7 +486,7 @@ def check_and_remove_prepared_send( Returns: Optional[str]: The remote agent name if the send is ready, None otherwise. - int: The virtual address of the receiver if the send is ready, + int: The physical address of the receiver if the send is ready, -1 otherwise. """ # Update the ready requests @@ -668,6 +607,8 @@ def _process_msgs(self): for msg in notifs[remote_agent_name]: # Decode the messag obj = self._msg_decoder.decode(msg) + logger.info("Received message from %s: %s %s", + remote_agent_name, obj.msg_type, obj.req_uuid) if obj.msg_type == "REQMSG": # Add the request to the pending allocation self._pending_allocation[obj.req_uuid] = (obj.source_spec, @@ -691,8 +632,16 @@ def _process_allocation_requests(self): for req_uuid, (source_spec, peer_name) in \ self._pending_allocation.items(): # Try to allocate the buffer - vaddr, buffer = self._allocator.allocate(source_spec.get_size()) + requested_size = source_spec.get_size() + if requested_size > self._buffer_size: + raise RuntimeError( + f"Requested size {requested_size} is larger than the " + f"nixl receiver buffer size {self._buffer_size}" + ) + + vaddr, buffer = self._allocator.allocate(requested_size) if vaddr == -1: + logger.info("No space available for request %s", req_uuid) # No space available, skip all the requests # NOTE: an alternative is to try allocation for other requests @@ -703,6 +652,7 @@ def _process_allocation_requests(self): # Add the request to the inflight requests self._inflight_requests[req_uuid] = source_spec self._inflight_request_vaddr[req_uuid] = vaddr + logger.info("Adding %s to inflight requests", req_uuid) # Send back the ready message paddr = self._allocator.virtual_to_physical(vaddr) @@ -833,3 +783,157 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: finally: if ctx is not None: ctx.destroy(linger=0) + + +@dataclass +class NixlSendTask(SendTask): + """NixlSendTask is a send task that uses CPU memory for the buffer and + Nixl for sending. 
+ """ + # Required fields + # virtual address of the src buffer + buffer_vaddr: int + # Parent nixl sender + parent_sender: NixlCPUSender + # nixl request uuid + request_uuid: str + + # Optional fields that will be updated later + # Cuda event for h2d copy + cuda_event: Optional[torch.cuda.Event] = None + # Destination physical address + receiver_paddr: Optional[int] = None + # nixl transfer handle + transfer_handle: Optional[nixl_xfer_handle] = None + + + def __post_init__(self) -> None: + self.creation_time = time.time() + + @_lmcache_nvtx_annotate + def update_states(self) -> None: + """Update the states of the send task. + """ + # Check the cuda event + if not self.state.sender_ready and self.cuda_event is not None \ + and self.cuda_event.query(): + self.state.sender_ready = True + + # check if the send is ready + if not self.state.receiver_ready and self.receiver_paddr is None: + rname, rpaddr = self.parent_sender.check_and_remove_prepared_send( + self.request_uuid) + if rname is not None: + assert rpaddr != -1 + self.receiver_paddr = rpaddr + self.state.receiver_ready = True + + if not self.is_done() and self.transfer_handle is not None: + # Check if the transfer is finished + if self.parent_sender.is_send_finished(self.transfer_handle): + self.state.send_done = True + + +class NixlKVSender(KVSenderInterface): + """NixlSendTask is an implementation of KVSenderInterface that provides a + ring buffer allocator for managing pin memory allocation and deallocation, + with NIXL for sending data. + """ + + def __init__(self, buffer_size: int) -> None: + super().__init__() + nixl_page_size = 4096 + self._buffer_size = buffer_size + self._allocator = RingBufferAllocator(self._buffer_size, + nixl_page_size) + self._nixl_sender = NixlCPUSender( + buffer_size, self._allocator.get_buffer_ptr(), + nixl_page_size) + + def create_send_task( + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: + """Create a non-ready send task with a CPU buffer allocated. + + Args: + source_spec (SourceSpec): The source specification of the send + task. + destination_spec (DestinationSpec): The destination + specification of the send task. + """ + # Allocate a buffer for the send task + size = source_spec.get_size() + address, buffer = self._allocator.allocate(size) + while address == -1: + # If allocation fails, wait for a while to process + # and try again + time.sleep(0.001) + self.progress() + address, buffer = self._allocator.allocate(size) + assert buffer is not None, "Buffer allocation failed" + + # Prepare the send request in NixlSender + req_uuid = self._nixl_sender.prepare_send( + source_spec, destination_spec) + + # Create a send task with the allocated buffer + task = NixlSendTask( + buffer=buffer, + source_spec=source_spec, + destination_spec=destination_spec, + state=SendTaskState(), + buffer_vaddr=address, + parent_sender=self._nixl_sender, + request_uuid=req_uuid + ) + self.add_send_task(task) + return task + + def free_task(self, task: SendTask) -> None: + """Free the send task. + Will be called in the pre-implemented progress() method. + + Args: + task (SendTask): The send task to be freed. + """ + # Free the buffer in the ring buffer allocator + self._allocator.free(task.buffer_vaddr) + + def send_task(self, task: SendTask) -> None: + """Send the send task after it is ready. + Will be called in the pre-implemented progress() method. + + Args: + task (SendTask): The send task to be sent. 
+ """ + assert isinstance(task, NixlSendTask), \ + "Task is not a NixlSendTask" + handle = self._nixl_sender.send( + self._allocator.virtual_to_physical(task.buffer_vaddr), + task.receiver_paddr, + task.source_spec.get_size(), + task.request_uuid, + task.destination_spec) + task.transfer_handle = handle + task.mark_sending() + return + + def pre_progress_hook(self) -> None: + for task in self.get_send_tasks(): + task.update_states() + + def post_progress_hook(self) -> None: + pass + + def wait_for_all_tasks(self) -> None: + """Wait for all tasks to finish. + """ + # Wait for all tasks to finish + tasks = self.get_send_tasks() + while tasks: + self.progress() + time.sleep(1) + tasks = self.get_send_tasks() + logger.info("Still waiting for %d tasks to finish", len(tasks)) From 9b0c66b8de38cb556af413054e1b66f13ed4655d Mon Sep 17 00:00:00 2001 From: ApostaC Date: Thu, 22 May 2025 04:21:03 +0000 Subject: [PATCH 09/28] [add] NixlPrefillManager and NixlDecodeManager and test example Signed-off-by: ApostaC --- .../cpu_kv_integration/run_nsys.sh | 5 +- .../cpu_kv_integration/temptest.py | 7 + .../cpu_kv_integration/test_nixl_cpu_utils.py | 3 +- .../cpu_kv_integration/toy_decode.py | 71 +++++ .../cpu_kv_integration/toy_decoder_manager.py | 73 ++++++ .../cpu_kv_integration/toy_example.py | 98 ++++--- .../cpu_kv_integration/toy_receiver.py | 17 +- .../kv_connector/v1/cpu_connector.py | 72 ++++- .../kv_connector/v1/cpu_connector_utils.py | 18 ++ .../kv_connector/v1/nixl_cpu_utils.py | 248 +++++++++++++++++- 10 files changed, 542 insertions(+), 70 deletions(-) create mode 100644 tests/v1/kv_connector/cpu_kv_integration/temptest.py create mode 100644 tests/v1/kv_connector/cpu_kv_integration/toy_decode.py create mode 100644 tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh index 57e28a58a577..025b780c6f1d 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh +++ b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh @@ -1,5 +1,8 @@ CUDA_VISIBLE_DEVICES=7 nsys profile \ - --trace=cuda,nvtx,osrt \ + --trace=cuda,nvtx,osrt,ucx \ + --gpu-metrics-devices=cuda-visible \ + --python-sampling=true \ + --trace-fork-before-exec=true \ --output=prefiller \ --force-overwrite=true \ python3 toy_example.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/temptest.py b/tests/v1/kv_connector/cpu_kv_integration/temptest.py new file mode 100644 index 000000000000..08d0ebfcc7fa --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/temptest.py @@ -0,0 +1,7 @@ + +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlCPUReceiver, NixlKVSender, RingBufferAllocator) + +sender = NixlKVSender(1024 * 1024 * 1024) + +sender.close() diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py index 79031277dda4..6ec039d54ddf 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -184,7 +184,8 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read start=0, stop=16, # Assuming we want to send 16 tokens shape=(2, 1, 16, 8, 128), # Example shape - dtype_str="bfloat16" # Example dtype + dtype_str="bfloat16", # Example dtype + num_all_tokens=16, ) # Prepare send and wait for completion diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py 
b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py new file mode 100644 index 000000000000..1c1f16553cfc --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os + +# VLLM_ENABLE_V1_MULTIPROCESSING=0 +# VLLM_WORKER_MULTIPROC_METHOD=spawn +os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +import time +import torch +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlCPUReceiver, RingBufferAllocator) + + +if __name__ == "__main__": + + context = "Hi " * 1000 + context2 = "Hi" * 1000 + context3 = "Hello " * 1000 + context4 = "How " * 1000 + prompts = [ + context + "Hello, my name is", + context2+ "The capital of France is", + context3 + "Your name is", + context4 + "The capital of China is", + ] + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector = "CPUConnector", + kv_role = "kv_consumer", + kv_connector_extra_config = {}, + ), + load_format="dummy", + max_model_len=2048, + max_num_batched_tokens=2048, + block_size=128, + ) + + # 1ST generation (prefill instance) + outputs = llm.generate( + prompts, + sampling_params, + ) + + new_prompts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Write new_prompts to output.txt + with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") + print(f"Saved {len(new_prompts)} prompts to output.txt") + + # HACK: for offline single-process inference only + # Wait for all send finishes + from vllm.distributed.kv_transfer import get_kv_transfer_group + cpu_connector = get_kv_transfer_group() + cpu_connector.close() diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py new file mode 100644 index 000000000000..11c5867cc9dc --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch.multiprocessing as mp +import time + +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlCPUReceiver, RingBufferAllocator, NixlDecodeManager +) + +def main(): + """Main function to run the receiver.""" + # Setup test parameters + test_host = "127.0.0.1" + test_base_port = 54321 + test_rank = 0 + expected_layers = 32 + + # Buffer configuration + buffer_size = 1 << 30 # 1GB + nixl_page_size = 4096 # Standard page size + + try: + # Mock tensor_model_parallel_rank for this process + import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils + utils.get_tensor_model_parallel_rank = lambda: test_rank + utils.get_tensor_model_parallel_world_size = lambda: 1 + utils.get_tp_group = lambda: None + + decoder_manager = NixlDecodeManager(buffer_size, + test_host, + test_base_port) + + + print(f"Receiver started on {test_host}:{test_base_port}") + + # Run progress loop until interrupted + try: + while True: + decoder_manager.progress() + finished = decoder_manager.get_finished(expected_layers) + print(f"Got {len(finished)} finished 
requests") + + for req_id in finished: + print(f"Processing finished request {req_id}") + for i in range(expected_layers): + decode_specs = decoder_manager.get_kv_specs(req_id, i) + for spec in decode_specs: + print(f"Received layer {i} tokens " + f"{spec.start} - {spec.stop} request {req_id}. " + f"The shape is {spec.buffer.shape}. " + f"The digest is {spec.buffer.mean()}.") + + decoder_manager.free_request(req_id) + + allocator = decoder_manager._allocator + print("Allocator high/low watermark:", allocator.high_watermark, + allocator.low_watermark) + time.sleep(1) # Small sleep to prevent busy waiting + + except KeyboardInterrupt: + decoder_manager.close() + print("\nShutting down receiver...") + + print("Receiver stopped") + + except Exception as e: + print(f"Receiver error: {e}") + raise + +if __name__ == "__main__": + # Set multiprocessing start method + mp.set_start_method("spawn", force=True) + main() diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index 6528187a4df7..006de422c2cd 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -7,51 +7,65 @@ os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +import time +import torch from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -context = "Hi " * 1000 -context2 = "Hey " * 1000 -context3 = "Hello " * 1000 -context4 = "How " * 1000 -prompts = [ - context + "Hello, my name is", - context2+ "The capital of France is", - context3 + "Your name is", - context4 + "The capital of China is", -] +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + NixlCPUReceiver, RingBufferAllocator) -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig( - kv_connector = "CPUConnector", - kv_role = "kv_producer", - kv_connector_extra_config = {}, - ), - load_format="dummy", - max_model_len=2048, - max_num_batched_tokens=2048, - block_size=64, - ) - -# 1ST generation (prefill instance) -outputs = llm.generate( - prompts, - sampling_params, -) +if __name__ == "__main__": -new_prompts = [] -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) - #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -# Write new_prompts to output.txt -with open("output.txt", "w") as f: - for prompt in new_prompts: - f.write(prompt + "\n") -print(f"Saved {len(new_prompts)} prompts to output.txt") + context = "Hi " * 1000 + context2 = "Hey " * 1000 + context3 = "Hello " * 1000 + context4 = "How " * 1000 + prompts = [ + context + "Hello, my name is", + context2+ "The capital of France is", + context3 + "Your name is", + context4 + "The capital of China is", + ] + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector = "CPUConnector", + kv_role = "kv_producer", + kv_connector_extra_config = {}, + ), + load_format="dummy", + max_model_len=2048, + max_num_batched_tokens=2048, + block_size=128, + ) + + # 1ST generation (prefill instance) + outputs = llm.generate( + prompts, + sampling_params, + ) 
+ + new_prompts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Write new_prompts to output.txt + with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") + print(f"Saved {len(new_prompts)} prompts to output.txt") + + # HACK: for offline single-process inference only + # Wait for all send finishes + from vllm.distributed.kv_transfer import get_kv_transfer_group + cpu_connector = get_kv_transfer_group() + cpu_connector.close() diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py index e6d66f2cc093..62e518ebbe28 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py @@ -53,11 +53,20 @@ def main(): # Verify received data num_elements = source_spec.get_size() - received_data = allocator._buffer[paddr : paddr + num_elements] - print(f"Received {num_elements} elements") - print(f"First few values: {received_data[:10]}") + received_data = allocator._buffer[paddr : paddr + num_elements]\ + .view(source_spec.dtype)\ + .reshape(source_spec.tensor_shape) + print(f"Received layer {source_spec.layer_id} tokens " + f"{source_spec.start} - {source_spec.stop} of request " + f"{source_spec.request_id}") + print(f"The shape is {received_data.shape}") + print(f"The digest is {received_data.mean()}") + allocator.free(vaddr) - time.sleep(0.001) # Small sleep to prevent busy waiting + + print("Allocator high/low watermark:", allocator.high_watermark, + allocator.low_watermark) + time.sleep(1) # Small sleep to prevent busy waiting except KeyboardInterrupt: print("\nShutting down receiver...") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 3679e86940fb..8b2e28c1cb4c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -21,7 +21,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( SourceSpec, DestinationSpec) from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlSendTask, NixlKVSender) + NixlSendTask, NixlPrefillManager, NixlDecodeManager) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -107,7 +107,10 @@ class PrefillRequestTracker: # Request id req_id: str - # Total number of tokens that are in this request + # Total number of tokens in the "full request" + num_all_tokens: int = 0 + + # Total number of tokens that are already seen until this step num_total_tokens: int = 0 # Number of tokens that are already saved @@ -135,6 +138,7 @@ def from_new_request( return PrefillRequestTracker( req_id=new_request.req_id, + num_all_tokens=len(new_request.prompt_token_ids), num_total_tokens = num_tokens_to_compute, num_saved_tokens=0, allocated_block_ids=unfolded_block_ids, @@ -172,6 +176,8 @@ class PrefillReqMeta: skip_leading_tokens: int # Skip last N tokens skip_trailing_tokens: int + # The number of tokens in the "full request" + num_all_tokens: int @staticmethod def from_request_tracker( @@ -222,12 +228,18 @@ def from_request_tracker( token_range=token_range, skip_leading_tokens=skip_leading_tokens, skip_trailing_tokens=skip_trailing_tokens, + 
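+            # num_all_tokens is the length of the full prompt (used by the
+            # decode side to tell when every token has arrived), whereas
+            # token_range above covers only the chunk saved in this step.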
num_all_tokens=request_tracker.num_all_tokens, ) @dataclass class DecodeReqMeta: - pass + # Request id + req_id: str + # Allocated block ids + block_ids: list[int] + # Skip the first N tokens + skip_leading_tokens: int @dataclass class CPUConnectorMetadata(KVConnectorMetadata): @@ -271,10 +283,14 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: self._block_size = vllm_config.cache_config.block_size if role == KVConnectorRole.SCHEDULER: - pass + self._kv_receiver = NixlDecodeManager( + 1024 * 1024 * 1024, # 1GB for debug + "localhost", + 54321, # Changed from string to int to match the class definition + ) elif role == KVConnectorRole.WORKER: # Prefiller side sender - self._kv_sender = NixlKVSender(1024 * 1024 * 1024) # 1GB for debug + self._kv_sender = NixlPrefillManager(1024 * 1024 * 1024) # 1GB for debug # request_id -> prefill request trackers self._prefill_reqs: dict[str, PrefillRequestTracker] = {} @@ -291,6 +307,10 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: # prefill offload tasks self._inflight_copy_tasks: list[NixlSendTask] = [] + # Decode request id to prefill request id mapping + self._decode_req_id_to_prefill_req_id: dict[str, str] = {} + + ############################################################ # Scheduler Side Methods @@ -346,7 +366,21 @@ def build_decode_meta( def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int) -> tuple[int, bool]: - return 0, False + kv_transfer_params = request.kv_transfer_params + num_tokens = len(request.prompt_token_ids) + request_id = request.request_id + if "prefill_request_id" not in kv_transfer_params: + logger.warning("Request %s does not have prefill_request_id", request.req_id) + #return 0, False + + logger.warning("NOW DEBUGGING SET THE REQUEST TO HAVE A PREFILL ID") + # Set the prefill_request_id to the request id + # This is a temporary fix to make the code work + self._decode_req_id_to_prefill_req_id[request_id] = request.request_id + return num_tokens, True + prefill_request_id = kv_transfer_params["prefill_request_id"] + self._decode_req_id_to_prefill_req_id[request_id] = prefill_request_id + return num_tokens, True def update_state_after_alloc( self, @@ -354,6 +388,7 @@ def update_state_after_alloc( blocks: "KVCacheBlocks", num_external_tokens: int) -> None: print("In update_state_after_alloc") + breakpoint() pass def build_connector_meta( @@ -454,7 +489,6 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, meta = self._get_connector_metadata() assert isinstance(meta, CPUConnectorMetadata), \ "Connector metadata is not of type CPUConnectorMetadata" - assert self._kv_sender is not None for prefill_req in meta.prefill_meta: @@ -465,7 +499,8 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, start=prefill_req.token_range.start, stop=prefill_req.token_range.stop, shape=tuple(self._get_kv_shape(len(prefill_req.blocks_to_save))), - dtype_str=str(kv_layer.dtype).split('.')[-1] # Convert torch.float32 -> "float32" + dtype_str=str(kv_layer.dtype).split('.')[-1], # Convert torch.float32 -> "float32" + num_all_tokens=prefill_req.num_all_tokens, ) # Create a destination spec @@ -501,8 +536,10 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, self._inflight_copy_tasks.append(task) - # Check the task states and send the tasks - self._kv_sender.progress() + # TODO(ApostaC): Potential optimizations + # 1. coalesce the d2h page copy to a single call + # 2. 
use a single cuda event instead of a list of cuda events + @_lmcache_nvtx_annotate @@ -514,13 +551,13 @@ def wait_for_save(self): This prevents overwrites of paged KV buffer before saving done. """ + # Check the task states and send the tasks for task in self._inflight_copy_tasks: if task.cuda_event is not None: task.cuda_event.synchronize() + self._kv_sender.progress() self._inflight_copy_tasks.clear() - self._kv_sender.wait_for_all_tasks() - def get_finished( self, finished_req_ids: set[str] ) -> tuple[Optional[set[str]], Optional[set[str]]]: @@ -535,3 +572,14 @@ def get_finished( call to this method (this call or a prior one). """ return None, None + + def close(self): + """ + Block until all the transfers are done. This is called + as the forward context exits to ensure that the async saving + from save_kv_layer is complete before finishing the forward. + + This prevents overwrites of paged KV buffer before saving done. + """ + if hasattr(self, "_kv_sender") and self._kv_sender is not None: + self._kv_sender.close() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index 4b11b32c2295..0a6923a60985 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -80,6 +80,9 @@ class SourceSpec(msgspec.Struct): # The dtype of the offloaded KV cache tensor as a string dtype_str: str + # The total number of tokens in the "full request" + num_all_tokens: int + @property def token_range(self) -> slice: """Get the token range as a slice object.""" @@ -104,6 +107,14 @@ def __str__(self) -> str: f"layer_id={self.layer_id}, " f"token_range={self.token_range}, shape={self.tensor_shape})") +@dataclass +class DecoderKVSpec: + # Start index of the KV cache (inclusive) + start: int + # Stop index of the KV cache (exclusive) + stop: int + # The shape of the KV cache + buffer: torch.Tensor @dataclass @@ -233,15 +244,20 @@ def progress(self) -> None: new_task_list = [] + num_sent = 0 + num_freed = 0 for task in self._send_tasks: should_add = True if task.is_ready() and not task.is_sending(): self.send_task(task) + task.mark_sending() + num_sent += 1 if task.is_done(): self.free_task(task) should_add = False + num_freed += 1 if should_add: new_task_list.append(task) @@ -251,6 +267,8 @@ def progress(self) -> None: # Update after going through all send tasks self.post_progress_hook() + logger.info("KVSender progress: sent %d, freed %d", num_sent, num_freed) + ###################################################### # Abstract methods (to be implemented by subclasses) # ###################################################### diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 5ce9aaf3865c..f17620e78991 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -20,7 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( - SendTask, KVSenderInterface, SourceSpec, DestinationSpec, + SendTask, KVSenderInterface, SourceSpec, DestinationSpec, DecoderKVSpec, SendTaskState) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ 
-51,6 +51,8 @@ # Helper classes and functions ################################################################### +DEFAULT_NIXL_PAGE_SIZE = 4096 + def init_nixl_agent( buffer_size: int, buffer_ptr: int, @@ -179,6 +181,29 @@ def allocate(self, size: int) -> tuple[int, Optional[torch.Tensor]]: # No space available return -1, None + def view_as_tensor(self, vaddr: int, + dtype: torch.dtype, shape: torch.Size) -> torch.Tensor: + """View the buffer as a tensor. + Args: + vaddr (int): The virtual address of the buffer. + dtype (torch.dtype): The data type of the tensor. + shape (torch.Size): The shape of the tensor. + Returns: + torch.Tensor: The tensor view of the buffer. + """ + assert vaddr % self._align_to == 0, \ + "Virtual address is not aligned to the alignment size" + + paddr = self.virtual_to_physical(vaddr) + size = shape.numel() * dtype.itemsize + assert paddr + size <= self._size, \ + "Physical address is out of bounds" + + # Get the tensor + return self._buffer[paddr:paddr + size].view(dtype).view(shape) + + + def free(self, address: int) -> None: """Free the buffer at the given address. @@ -187,7 +212,7 @@ def free(self, address: int) -> None: which is returned by the allocate() method. """ assert address in self._allocated, \ - "Address not found in allocated buffers" + f"Address {address} not found in allocated buffers" # Pop the address from the allocated dict, and update the # low watermark @@ -423,7 +448,6 @@ def send( ) self._nixl_wrapper.transfer(handle) - logger.info("Start trasnfer of the request %s", req_uuid) return handle @@ -556,6 +580,16 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: logger.debug("Successfully completed handshake with %s", destination_spec) + def close(self) -> None: + if self._reg_dlist is not None: + self._nixl_wrapper.deregister_memory(self._reg_dlist) + for agent in self._remote_agents.values(): + self._nixl_wrapper.remove_remote_agent(agent) + if self._local_xfer_handlers is not None: + self._nixl_wrapper.release_dlist_handle(self._local_xfer_handlers) + for remote_xfer_handler in self._remote_xfer_handlers.values(): + self._nixl_wrapper.release_dlist_handle(remote_xfer_handler) + del self._nixl_wrapper class NixlCPUReceiver: def __init__( @@ -607,8 +641,6 @@ def _process_msgs(self): for msg in notifs[remote_agent_name]: # Decode the messag obj = self._msg_decoder.decode(msg) - logger.info("Received message from %s: %s %s", - remote_agent_name, obj.msg_type, obj.req_uuid) if obj.msg_type == "REQMSG": # Add the request to the pending allocation self._pending_allocation[obj.req_uuid] = (obj.source_spec, @@ -641,7 +673,7 @@ def _process_allocation_requests(self): vaddr, buffer = self._allocator.allocate(requested_size) if vaddr == -1: - logger.info("No space available for request %s", req_uuid) + #logger.debug("No space available for request %s", req_uuid) # No space available, skip all the requests # NOTE: an alternative is to try allocation for other requests @@ -652,7 +684,6 @@ def _process_allocation_requests(self): # Add the request to the inflight requests self._inflight_requests[req_uuid] = source_spec self._inflight_request_vaddr[req_uuid] = vaddr - logger.info("Adding %s to inflight requests", req_uuid) # Send back the ready message paddr = self._allocator.virtual_to_physical(vaddr) @@ -765,6 +796,11 @@ def stop_handshake_listener(self) -> None: self._handshake_listener_t.join() self._handshake_listener_t = None + def close(self): + self.stop_handshake_listener() + 
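+        # Teardown note: the handshake listener thread is stopped before the
+        # NIXL memory region is deregistered below; the intent (an
+        # assumption, not enforced elsewhere) is that no late handshake can
+        # touch descriptors that are being released.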
self._nixl_wrapper.deregister_memory(self._reg_dlist) + del self._nixl_wrapper + @contextlib.contextmanager def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: @@ -834,7 +870,7 @@ def update_states(self) -> None: self.state.send_done = True -class NixlKVSender(KVSenderInterface): +class NixlPrefillManager(KVSenderInterface): """NixlSendTask is an implementation of KVSenderInterface that provides a ring buffer allocator for managing pin memory allocation and deallocation, with NIXL for sending data. @@ -842,7 +878,7 @@ class NixlKVSender(KVSenderInterface): def __init__(self, buffer_size: int) -> None: super().__init__() - nixl_page_size = 4096 + nixl_page_size = DEFAULT_NIXL_PAGE_SIZE self._buffer_size = buffer_size self._allocator = RingBufferAllocator(self._buffer_size, nixl_page_size) @@ -928,7 +964,8 @@ def post_progress_hook(self) -> None: pass def wait_for_all_tasks(self) -> None: - """Wait for all tasks to finish. + """Wait for all tasks to finish. Mainly for debug, test, + and offline inferences. """ # Wait for all tasks to finish tasks = self.get_send_tasks() @@ -937,3 +974,194 @@ def wait_for_all_tasks(self) -> None: time.sleep(1) tasks = self.get_send_tasks() logger.info("Still waiting for %d tasks to finish", len(tasks)) + + def close(self): + self.wait_for_all_tasks() + self._nixl_sender.close() + +class NixlDecodeManager: + def __init__(self, + buffer_size: int, + host: str, + port: int) -> None: + self.nixl_page_size = DEFAULT_NIXL_PAGE_SIZE + self._buffer_size = buffer_size + self._allocator = RingBufferAllocator(self._buffer_size, + self.nixl_page_size) + self._nixl_receiver = NixlCPUReceiver(self._allocator, + self.nixl_page_size) + self._nixl_receiver.start_handshake_listener(host, port) + + + # How many tokens are received for each request, each layer + # (p_request_id, layer_id) -> num_tokens + self._received_tokens: dict[str, dict[int, int]] = {} + + # How many tokens are expected for each request + # p_request_id -> num_tokens + self._expected_tokens: dict[str, int] = {} + + # The detailed specs of the requests + # (p_request_id, layer_id) -> (SourceSpec, vaddr) + self._request_specs: dict[tuple(str, int), + list[tuple(SourceSpec, int)]] = {} + + # Metadata + self.rank = get_tensor_model_parallel_rank() + self.world_size = get_tensor_model_parallel_world_size() + self.tp_group = get_tp_group() + + # Multi process receiving check + # p_request_id -> number of ready workers + self._done_receiving_count: defaultdict[str, int] = defaultdict(lambda: 0) + + def _check_receive_and_update(self): + """Checks the KV cache receiving status and update the internal + states + """ + finished_list = self._nixl_receiver.get_finished(clear = True) + for source_spec, vaddr in finished_list: + # Get the request id and layer id + p_request_id = source_spec.request_id + layer_id = source_spec.layer_id + num_received_tokens = source_spec.stop - source_spec.start + + if p_request_id not in self._expected_tokens: + self._expected_tokens[p_request_id] = source_spec.num_all_tokens + + # Update the received tokens + if p_request_id not in self._received_tokens: + self._received_tokens[p_request_id] = {} + if layer_id not in self._received_tokens[p_request_id]: + self._received_tokens[p_request_id][layer_id] = 0 + self._received_tokens[p_request_id][layer_id] += num_received_tokens + + # Update received specs + if (p_request_id, layer_id) not in self._request_specs: + self._request_specs[(p_request_id, layer_id)] = [] + self._request_specs[(p_request_id, layer_id)].append( + 
(source_spec, vaddr) + ) + + def progress(self) -> None: + """Process the received requests and the data. Updates the internal + status and respond to the allocation requests. + """ + self._nixl_receiver.progress() + + def get_finished(self, num_expected_layers: int) -> list[str]: + """Get the prefill node request_ids of the requests that finishes + receiving (which means the KV caches of all tokens and all layers + are in CPU memory) + + Returns: + list[str]: A list of prefill-side request ids. + """ + ready_requests = [] + self._check_receive_and_update() + for p_request_id in self._expected_tokens: + expected_tokens = self._expected_tokens[p_request_id] + assert p_request_id in self._received_tokens + # check if all the layers are there + if len(self._received_tokens[p_request_id]) != num_expected_layers: + continue + # check if all the tokens are there + ready = True + for layer_id in self._received_tokens[p_request_id]: + received_tokens = self._received_tokens[p_request_id][layer_id] + if received_tokens != expected_tokens: + ready = False + break + if ready: + ready_requests.append(p_request_id) + + if self.world_size == 1: + return ready_requests + + # For multi-process + if self.rank == 0: + for p_request_id in ready_requests: + self._done_receiving_count[p_request_id] += 1 + + other_ranks_finished_ids: list[str] = [] + for i in range(1, self.world_size): + other_ranks_finished_ids.extend( + self.tp_group.recv_object(src=i)) + for p_request_id in other_ranks_finished_ids: + self._done_receiving_count[p_request_id] += 1 + + all_done_recving: list[str] + for p_request_id in self._done_receiving_count: + if self._done_receiving_count[p_request_id] == \ + self.world_size: + all_done_recving.append(p_request_id) + self._done_receiving_count.pop(p_request_id) + return all_done_recving + else: + self.tp_group.send_object(ready_requests, dst=0) + return ready_requests + + def _create_decoder_kv_spec(self, + source_spec: SourceSpec, + vaddr: int) -> DecoderKVSpec: + """Create a DecoderKVSpec from the source spec and the virtual address. + """ + # Get the correct buffer + return DecoderKVSpec( + start = source_spec.start, + stop = source_spec.stop, + buffer = self._allocator.view_as_tensor( + vaddr, source_spec.dtype, source_spec.tensor_shape) + ) + + + def get_kv_specs(self, + p_request_id: str, + layer_id: int) -> list[DecoderKVSpec]: + """Get the KV specs for the given request id and layer id, which + will be used for connector to load the KV back to CPU + + Args: + p_request_id (str): The original request id from prefiller. + layer_id (int): The layer id of the request. + """ + ret = [] + if (p_request_id, layer_id) not in self._request_specs: + logger.warning("Request %s not found in request specs", + (p_request_id, layer_id)) + return ret + + for source_spec, vaddr in self._request_specs[(p_request_id, layer_id)]: + # Create the decoder kv spec + decoder_kv_spec = self._create_decoder_kv_spec(source_spec, vaddr) + ret.append(decoder_kv_spec) + + return ret + + def free_request(self, p_request_id): + """Free the request's memory with the given request id. + + Args: + p_request_id (str): The original request id from prefiller. 
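+
+        Note: the DecoderKVSpec.buffer tensors handed out by get_kv_specs()
+        are views into the ring-buffer region released here, so callers
+        should finish copying them out before freeing. Typical flow (a
+        sketch mirroring toy_decoder_manager.py; "manager" is a placeholder
+        name):
+
+            for req_id in manager.get_finished(num_layers):
+                for layer_id in range(num_layers):
+                    for spec in manager.get_kv_specs(req_id, layer_id):
+                        pass  # consume spec.buffer (tokens spec.start:spec.stop)
+                manager.free_request(req_id)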
+ """ + # Free the memory and clear the internal states + self._expected_tokens.pop(p_request_id, None) + rcv_tokens = self._received_tokens.pop(p_request_id, None) + if rcv_tokens is not None: + for layer_id in rcv_tokens: + assert (p_request_id, layer_id) in self._request_specs, \ + "Found received tokens but no request specs" + + # Free the memory + for src_spec, vaddr in self._request_specs[(p_request_id, layer_id)]: + self._allocator.free(vaddr) + + # Clear the request specs + self._request_specs.pop((p_request_id, layer_id), None) + + else: + logger.warning("Request %s not found in received tokens", + p_request_id) + + def close(self): + self._nixl_receiver.close() From 2647ce5862ae976eeb3ef47f622ad60009615e95 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 23 May 2025 01:40:57 +0000 Subject: [PATCH 10/28] [Add] unit tests for more functionalities Signed-off-by: ApostaC --- .../cpu_kv_integration/output.txt | 8 +- .../cpu_kv_integration/output_decode.txt | 4 + .../test_cpu_connector_kernels.py | 343 ++++++++++++++++++ .../cpu_kv_integration/toy_decode.py | 8 +- .../cpu_kv_integration/toy_example.py | 2 +- 5 files changed, 356 insertions(+), 9 deletions(-) create mode 100644 tests/v1/kv_connector/cpu_kv_integration/output_decode.txt create mode 100644 tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/output.txt b/tests/v1/kv_connector/cpu_kv_integration/output.txt index 24b680935413..09cf415402dc 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/output.txt +++ b/tests/v1/kv_connector/cpu_kv_integration/output.txt @@ -1,4 +1,4 @@ -Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi 
Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hello, my name isoplevel -Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey 
Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey The capital of France isoplevel -Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello 
Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello 
Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Your name isoplevel -How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How 
How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How The capital of China isoplevel +Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi 
Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hello, my name is [ +Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey 
Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey Hey The capital of France is Paris +Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello 
Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Your name is not +How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How 
How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How 
How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How The capital of China is Beijing diff --git a/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt b/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt new file mode 100644 index 000000000000..2384fe2ab883 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt @@ -0,0 +1,4 @@ +Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi 
Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hello, my name is [Your Name] and I am a [Your +Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi The capital of France is Paris. The capital of France is Paris. 
The +Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello 
Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Hello Your name is not in the list. 
Please check your email for +How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How 
How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How The capital of China is Beijing. Beijing is a city in northern China. diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py b/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py new file mode 100644 index 000000000000..3f1e6e294767 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py @@ -0,0 +1,343 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector import ( + d2h_page_copy, h2d_copy_leading_tokens, h2d_copy_trailing_tokens, h2d_page_copy +) + +@pytest.fixture +def device_tensors(): + """Create sample device tensors for testing.""" + # Create tensors with shape (2, num_blocks, page_size, head_size, hidden_size) + num_blocks = 4 + page_size = 16 + head_size = 8 + hidden_size = 128 + + # Initialize with unique values for each position + k_tensor = torch.arange(num_blocks * page_size * head_size * hidden_size, + dtype=torch.float32, device='cuda') + k_tensor = k_tensor.reshape(num_blocks, page_size, head_size, hidden_size) + + v_tensor = k_tensor + 1000 # Different values for v + + # Stack k and v tensors + kv_tensor = torch.stack([k_tensor, v_tensor], dim=0) + return kv_tensor + +@pytest.fixture +def host_buffer(): + """Create host buffer for testing.""" + # Create buffer with same dimensions as device tensor but fewer blocks + num_blocks = 2 # Smaller than device tensor + page_size = 16 + head_size = 8 + hidden_size = 128 + + k_buffer = torch.zeros(num_blocks * page_size * head_size * hidden_size, + dtype=torch.float32) + k_buffer = k_buffer.reshape(num_blocks, page_size, head_size, hidden_size) + + v_buffer = torch.zeros_like(k_buffer) + + # Stack k and v buffers + kv_buffer = torch.stack([k_buffer, v_buffer], dim=0) + return kv_buffer + +def test_d2h_page_copy(device_tensors, host_buffer): + """Test device to host copy operation.""" + # Copy blocks 1 and 3 from device to host + block_ids = [1, 3] + + d2h_page_copy(device_tensors, host_buffer, block_ids) + + # Verify copied data + for i, block_id in enumerate(block_ids): + # Check key tensor + assert torch.allclose( + host_buffer[0, i].cpu(), + device_tensors[0, block_id].cpu() + ) + # Check value tensor + assert torch.allclose( + host_buffer[1, i].cpu(), + device_tensors[1, block_id].cpu() + ) + +def test_h2d_copy_leading_tokens(): + """Test copying leading tokens from host to device.""" + # Create sample tensors + page_size = 16 + head_size = 8 + hidden_size = 128 + + src_buffer = torch.ones((2, 1, page_size, head_size, hidden_size), + dtype=torch.float32) + # Initialize destination with a known pattern + dst_layer = torch.full((2, 1, page_size, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32, device='cuda') + + # Copy first 8 tokens (half of page_size) + end_position = 8 + h2d_copy_leading_tokens( + src_buffer, dst_layer, + src_block_id=0, dst_block_id=0, + end_position_in_block=end_position + ) + + # Verify first 8 tokens were copied + assert torch.allclose( + dst_layer[0, 0, 
:end_position].cpu(), + src_buffer[0, 0, :end_position] + ) + assert torch.allclose( + dst_layer[1, 0, :end_position].cpu(), + src_buffer[1, 0, :end_position] + ) + + # Verify remaining tokens are unchanged (should still be 2.0) + expected_unchanged = torch.full((page_size - end_position, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32) + assert torch.allclose( + dst_layer[0, 0, end_position:].cpu(), + expected_unchanged + ) + assert torch.allclose( + dst_layer[1, 0, end_position:].cpu(), + expected_unchanged + ) + +def test_h2d_copy_trailing_tokens(): + """Test copying trailing tokens from host to device.""" + # Create sample tensors + page_size = 16 + head_size = 8 + hidden_size = 128 + + src_buffer = torch.ones((2, 1, page_size, head_size, hidden_size), + dtype=torch.float32) + # Initialize destination with a known pattern + dst_layer = torch.full((2, 1, page_size, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32, device='cuda') + + # Copy last 8 tokens (half of page_size) + start_position = 8 + h2d_copy_trailing_tokens( + src_buffer, dst_layer, + src_block_id=0, dst_block_id=0, + start_position_in_block=start_position + ) + + # Verify last 8 tokens were copied + assert torch.allclose( + dst_layer[0, 0, start_position:].cpu(), + src_buffer[0, 0, start_position:] + ) + assert torch.allclose( + dst_layer[1, 0, start_position:].cpu(), + src_buffer[1, 0, start_position:] + ) + + # Verify leading tokens are unchanged (should still be 2.0) + expected_unchanged = torch.full((start_position, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32) + assert torch.allclose( + dst_layer[0, 0, :start_position].cpu(), + expected_unchanged + ) + assert torch.allclose( + dst_layer[1, 0, :start_position].cpu(), + expected_unchanged + ) + +def test_h2d_page_copy(): + """Test host to device page copy operation.""" + # Create sample tensors + num_blocks = 4 + page_size = 16 + head_size = 8 + hidden_size = 128 + block_size = page_size + + src_buffer = torch.ones((2, num_blocks, page_size, head_size, hidden_size), + dtype=torch.float32) + # Initialize destination with a known pattern + dst_layer = torch.full((2, num_blocks, page_size, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32, device='cuda') + + # Test copying a range of tokens that spans multiple blocks + block_ids = [0, 1, 2, 3] + start_token_idx = 8 + stop_token_idx = 56 + + h2d_page_copy( + src_buffer, dst_layer, block_ids, + start_token_idx, stop_token_idx, block_size + ) + + # Calculate which blocks should be fully/partially copied + start_block = start_token_idx // block_size + end_block = (stop_token_idx + block_size - 1) // block_size + start_pos = start_token_idx % block_size + end_pos = stop_token_idx % block_size + + # Expected unchanged value + expected_unchanged = torch.full((page_size, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32) + + # Verify copied and unchanged data for each block + for i in range(num_blocks): + if i < start_block or i >= end_block: + # Blocks outside the copy range should be unchanged + assert torch.allclose( + dst_layer[:, block_ids[i]].cpu(), + expected_unchanged + ) + elif i == start_block: + # First block - verify both copied and unchanged parts + # Leading part should be unchanged + assert torch.allclose( + dst_layer[:, block_ids[i], :start_pos].cpu(), + expected_unchanged[:start_pos] + ) + # Trailing part should be copied + assert torch.allclose( + dst_layer[:, block_ids[i], start_pos:].cpu(), + src_buffer[:, i, start_pos:] + ) + elif i == 
end_block - 1: + # Last block - verify both copied and unchanged parts + # Leading part should be copied + assert torch.allclose( + dst_layer[:, block_ids[i], :end_pos].cpu(), + src_buffer[:, i, :end_pos] + ) + # Trailing part should be unchanged + assert torch.allclose( + dst_layer[:, block_ids[i], end_pos:].cpu(), + expected_unchanged[end_pos:] + ) + else: + # Middle blocks - verify full copy + assert torch.allclose( + dst_layer[:, block_ids[i]].cpu(), + src_buffer[:, i] + ) + +def test_h2d_page_copy_edge_cases(): + """Test edge cases for host to device page copy.""" + # Create sample tensors + num_blocks = 2 + page_size = 16 + head_size = 8 + hidden_size = 128 + block_size = page_size + + src_buffer = torch.ones((2, num_blocks, page_size, head_size, hidden_size), + dtype=torch.float32) + dst_layer = torch.zeros((2, num_blocks, page_size, head_size, hidden_size), + dtype=torch.float32, device='cuda') + + # Test case 1: Copy exactly one block + block_ids = [0, 1] + start_token_idx = 0 + stop_token_idx = block_size + + h2d_page_copy( + src_buffer, dst_layer, block_ids, + start_token_idx, stop_token_idx, block_size + ) + + assert torch.allclose( + dst_layer[:, block_ids[0]].cpu(), + src_buffer[:, 0] + ) + + # Test case 2: Copy partial block + dst_layer.zero_() + block_ids = [0, 1] + start_token_idx = block_size + 2 + stop_token_idx = block_size + 6 + + h2d_page_copy( + src_buffer, dst_layer, block_ids, + start_token_idx, stop_token_idx, block_size + ) + + start_pos = start_token_idx % block_size + end_pos = stop_token_idx % block_size + + assert torch.allclose( + dst_layer[:, block_ids[1], start_pos:end_pos].cpu(), + src_buffer[:, 1, start_pos:end_pos] + ) + +def test_h2d_page_copy_aligned(): + """Test host to device page copy operation with block-aligned boundaries.""" + # Create sample tensors + num_blocks = 4 + page_size = 16 + head_size = 8 + hidden_size = 128 + block_size = page_size + + src_buffer = torch.ones((2, num_blocks, page_size, head_size, hidden_size), + dtype=torch.float32) + # Initialize destination with a known pattern + dst_layer = torch.full((2, num_blocks, page_size, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32, device='cuda') + + # Test copying exactly 2 blocks (from block 1 to block 3) + block_ids = [0, 1, 2, 3] + start_token_idx = block_size # Start at beginning of block 1 + stop_token_idx = block_size * 3 # End at end of block 2 + + h2d_page_copy( + src_buffer, dst_layer, block_ids, + start_token_idx, stop_token_idx, block_size + ) + + # Expected unchanged value + expected_unchanged = torch.full((page_size, head_size, hidden_size), + fill_value=2.0, dtype=torch.float32) + + # Verify copied and unchanged data for each block + for i in range(num_blocks): + if i == 0 or i == 3: + # First and last blocks should be unchanged + assert torch.allclose( + dst_layer[:, block_ids[i]].cpu(), + expected_unchanged + ), f"Block {i} should be unchanged" + else: + # Middle blocks (1 and 2) should be fully copied + assert torch.allclose( + dst_layer[:, block_ids[i]].cpu(), + src_buffer[:, i] + ), f"Block {i} should be fully copied" + + # Test copying a single block-aligned region + dst_layer.fill_(2.0) # Reset destination + start_token_idx = block_size * 2 # Start at beginning of block 2 + stop_token_idx = block_size * 3 # End at end of block 2 + + h2d_page_copy( + src_buffer, dst_layer, block_ids, + start_token_idx, stop_token_idx, block_size + ) + + # Verify only block 2 was copied, others unchanged + for i in range(num_blocks): + if i == 2: + # Block 2 should be 
fully copied + assert torch.allclose( + dst_layer[:, block_ids[i]].cpu(), + src_buffer[:, i] + ), "Block 2 should be fully copied" + else: + # All other blocks should be unchanged + assert torch.allclose( + dst_layer[:, block_ids[i]].cpu(), + expected_unchanged + ), f"Block {i} should be unchanged" diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index 1c1f16553cfc..769075d8a258 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -19,7 +19,7 @@ if __name__ == "__main__": context = "Hi " * 1000 - context2 = "Hi" * 1000 + context2 = "Hi " * 1000 context3 = "Hello " * 1000 context4 = "How " * 1000 prompts = [ @@ -29,7 +29,7 @@ context4 + "The capital of China is", ] - sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True, @@ -39,7 +39,7 @@ kv_role = "kv_consumer", kv_connector_extra_config = {}, ), - load_format="dummy", + #load_format="dummy", max_model_len=2048, max_num_batched_tokens=2048, block_size=128, @@ -59,7 +59,7 @@ #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # Write new_prompts to output.txt - with open("output.txt", "w") as f: + with open("output_decode.txt", "w") as f: for prompt in new_prompts: f.write(prompt + "\n") print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index 006de422c2cd..8e42c7cfa666 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -39,7 +39,7 @@ kv_role = "kv_producer", kv_connector_extra_config = {}, ), - load_format="dummy", + #load_format="dummy", max_model_len=2048, max_num_batched_tokens=2048, block_size=128, From 96cb2b58e43d14f2af307908075baafd3e1b7a05 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 23 May 2025 01:41:25 +0000 Subject: [PATCH 11/28] [Ckpt] everything is functional Signed-off-by: ApostaC --- .../kv_connector/v1/cpu_connector.py | 363 ++++++++++++++++-- .../kv_connector/v1/nixl_cpu_utils.py | 2 + 2 files changed, 328 insertions(+), 37 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 8b2e28c1cb4c..a56bd55103c5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -55,23 +55,96 @@ def d2h_page_copy( (2, num_vllm_blocks, page_size, ...remaining dims...) dst_buffer (torch.Tensor): The destination buffer on host, shape is (2, len(block_ids), page_size, ...remaining dims...) + block_ids (list[int]): The list of vllm block ids to copy from. 
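        Example (an illustrative sketch mirroring test_d2h_page_copy in the
        integration tests; tensor shapes and block ids are assumed):

            # src_layer:  (2, 4, page_size, ...) on the GPU
            # dst_buffer: (2, 2, page_size, ...) on the host
            d2h_page_copy(src_layer, dst_buffer, block_ids=[1, 3])
            # GPU block 1 -> host slot 0, GPU block 3 -> host slot 1,
            # for both the K (index 0) and V (index 1) planes.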
""" - # debug copy: block_mapping = torch.stack([torch.tensor(block_ids, dtype = torch.long), torch.arange(len(block_ids), dtype = torch.long)], dim = 1) ops.swap_blocks(src_layer[0], dst_buffer[0], block_mapping) ops.swap_blocks(src_layer[1], dst_buffer[1], block_mapping) - #for dst_idx, block_id in enumerate(block_ids): - # src_k, src_v = src_layer[:, block_id, :, :] - # dst_k, dst_v = dst_buffer[:, dst_idx, :, :] - # # Copy the data from device to host - # dst_k.copy_(src_k, non_blocking=True) - # dst_v.copy_(src_v, non_blocking=True) + +def h2d_copy_leading_tokens( + src_buffer: torch.Tensor, + dst_layer: torch.Tensor, + src_block_id: int, + dst_block_id: int, + end_position_in_block: int) -> None: + """Copy the leading tokens in 1 block from host buffer to device layer. + + Args: + src_buffer (torch.Tensor): The source buffer on host, shape is + (2, len(block_ids), page_size, ...remaining dims...) + dst_layer (torch.Tensor): The destination layer on device, shape is + (2, num_vllm_blocks, page_size, ...remaining dims...) + src_block_id (int): The source block id to copy. + dst_block_id (int): The destination block id to copy. + end_position_in_block (int): The end position in the block to copy. + """ + dst_k = dst_layer[0][dst_block_id][:end_position_in_block] + src_k = src_buffer[0][src_block_id][:end_position_in_block] + dst_v = dst_layer[1][dst_block_id][:end_position_in_block] + src_v = src_buffer[1][src_block_id][:end_position_in_block] + dst_k.copy_(src_k, non_blocking=True) + dst_v.copy_(src_v, non_blocking=True) + + +def h2d_copy_trailing_tokens( + src_buffer: torch.Tensor, + dst_layer: torch.Tensor, + src_block_id: int, + dst_block_id: int, + start_position_in_block: int) -> None: + """Copy the trailing tokens in 1 block from host buffer to device layer. + + Args: + src_buffer (torch.Tensor): The source buffer on host, shape is + (2, len(block_ids), page_size, ...remaining dims...) + dst_layer (torch.Tensor): The destination layer on device, shape is + (2, num_vllm_blocks, page_size, ...remaining dims...) + src_block_id (int): The source block id to copy. + dst_block_id (int): The destination block id to copy. + start_position_in_block (int): The start position in the block to copy. + """ + dst_k = dst_layer[0][dst_block_id][start_position_in_block:] + src_k = src_buffer[0][src_block_id][start_position_in_block:] + dst_v = dst_layer[1][dst_block_id][start_position_in_block:] + src_v = src_buffer[1][src_block_id][start_position_in_block:] + dst_k.copy_(src_k, non_blocking=True) + dst_v.copy_(src_v, non_blocking=True) + +def h2d_copy_part_block( + src_buffer: torch.Tensor, + dst_layer: torch.Tensor, + src_block_id: int, + dst_block_id: int, + start_position_in_block: int, + end_position_in_block: int) -> None: + """Copy the part of a block from host buffer to device layer. + + Args: + src_buffer (torch.Tensor): The source buffer on host, shape is + (2, len(block_ids), page_size, ...remaining dims...) + dst_layer (torch.Tensor): The destination layer on device, shape is + (2, num_vllm_blocks, page_size, ...remaining dims...) + src_block_id (int): The source block id to copy. + dst_block_id (int): The destination block id to copy. + start_position_in_block (int): The start position in the block to copy. + end_position_in_block (int): The end position in the block to copy. 
+ """ + dst_k = dst_layer[0][dst_block_id][start_position_in_block:end_position_in_block] + src_k = src_buffer[0][src_block_id][start_position_in_block:end_position_in_block] + dst_v = dst_layer[1][dst_block_id][start_position_in_block:end_position_in_block] + src_v = src_buffer[1][src_block_id][start_position_in_block:end_position_in_block] + dst_k.copy_(src_k, non_blocking=True) + dst_v.copy_(src_v, non_blocking=True) + def h2d_page_copy( src_buffer: torch.Tensor, dst_layer: torch.Tensor, - block_ids: list[int] + block_ids: list[int], + start_token_idx: int, + stop_token_idx: int, + block_size: int ) -> None: """Copy data from host to device. @@ -80,13 +153,74 @@ def h2d_page_copy( (2, len(block_ids), page_size, ...remaining dims...) dst_layer (torch.Tensor): The destination layer on device, shape is (2, num_vllm_pages, page_size, ...remaining dims...) + block_ids (list[int]): The list of vllm block ids to copy to (for all + the tokens) + start_token_idx (int): The start token index in the request + stop_token_idx (int): The stop token index in the request + block_size (int): The block size in vLLM """ - for src_idx, block_id in enumerate(block_ids): - dst_k, dst_v = dst_layer[:, block_id, :, :] - src_k, src_v = src_buffer[:, src_idx, :, :] - # Copy the data from host to device - dst_k.copy_(src_k, non_blocking=True) - dst_v.copy_(src_v, non_blocking=True) + # Step 1: build the block mapping (src_block_id, dst_block_id) + separate_first_block = start_token_idx % block_size != 0 + separate_last_block = stop_token_idx % block_size != 0 + + start_block_id = start_token_idx // block_size # inclusive + end_block_id = stop_token_idx // block_size # exclusive + src_block_ids = torch.arange(start_block_id, end_block_id, + dtype = torch.long) + if separate_first_block: + src_block_ids = src_block_ids[1:] + # NOTE: we don't need to add the last block id here, because the + # end_block_id is exclusive + # E.g., start = 10, stop = 50, block_size = 16, then we have + # start_block_id = 0 , separate_first_block = True + # end_block_id = 3, separate_last_block = True + # src_block_ids = [1, 2] + # We will copy token 10-15 and 48-49 from the first and last block + # separately. 
+ + vllm_block_ids = torch.tensor(block_ids, dtype=torch.long) + dst_block_ids = vllm_block_ids[src_block_ids] + + # Step 2: copy the first and last block separately if needed + if start_block_id == end_block_id: + # Only one block to copy + start_position_in_block = start_token_idx % block_size + end_position_in_block = stop_token_idx % block_size + h2d_copy_part_block( + src_buffer, + dst_layer, + start_block_id, + vllm_block_ids[start_block_id], + start_position_in_block, + end_position_in_block) + return + + if separate_first_block: + first_block_id_src = start_block_id + first_block_id_dst = vllm_block_ids[first_block_id_src] + start_token_idx_in_block = start_token_idx % block_size + h2d_copy_trailing_tokens( + src_buffer, + dst_layer, + first_block_id_src, + first_block_id_dst, + start_token_idx_in_block) + + if separate_last_block: + last_block_id_src = end_block_id + last_block_id_dst = vllm_block_ids[last_block_id_src] + stop_token_idx_in_block = stop_token_idx % block_size + h2d_copy_leading_tokens( + src_buffer, + dst_layer, + last_block_id_src, + last_block_id_dst, + stop_token_idx_in_block) + + # Step 3: copy the middle blocks + block_mapping = torch.stack([src_block_ids, dst_block_ids], dim=1) + ops.swap_blocks(src_buffer[0], dst_layer[0], block_mapping) + ops.swap_blocks(src_buffer[1], dst_layer[1], block_mapping) ##################################################################### @@ -236,10 +370,14 @@ def from_request_tracker( class DecodeReqMeta: # Request id req_id: str + # Prefiller-side request id + prefill_req_id: str # Allocated block ids block_ids: list[int] # Skip the first N tokens skip_leading_tokens: int + # if it's ready or not + is_ready: bool = False @dataclass class CPUConnectorMetadata(KVConnectorMetadata): @@ -283,14 +421,20 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: self._block_size = vllm_config.cache_config.block_size if role == KVConnectorRole.SCHEDULER: - self._kv_receiver = NixlDecodeManager( - 1024 * 1024 * 1024, # 1GB for debug - "localhost", - 54321, # Changed from string to int to match the class definition - ) + self._should_be_ready_reqs: set[str] = set() + pass elif role == KVConnectorRole.WORKER: # Prefiller side sender - self._kv_sender = NixlPrefillManager(1024 * 1024 * 1024) # 1GB for debug + if self.kv_role == "kv_producer": + self._kv_sender = NixlPrefillManager(1024 * 1024 * 1024) # 1GB for debug + elif self.kv_role == "kv_consumer": + self._kv_receiver = NixlDecodeManager( + 1024 * 1024 * 1024, # 1GB for debug + "localhost", + 54321, # Changed from string to int to match the class definition + ) + else: + raise ValueError(f"Unknown kv_role: {self.kv_role}") # request_id -> prefill request trackers self._prefill_reqs: dict[str, PrefillRequestTracker] = {} @@ -309,9 +453,24 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: # Decode request id to prefill request id mapping self._decode_req_id_to_prefill_req_id: dict[str, str] = {} + self._prefill_req_id_to_decode_req_id: dict[str, str] = {} + + # Decode request metadata for scheduler connector + # decode request id -> DecodeReqMeta + self._decode_req_metas: dict[str, DecodeReqMeta] = {} + + # Decode h2d cuda events + self._decoder_cuda_events: list[torch.cuda.Event] = [] + + # In-progress kv load requests's prefill request ids + self._inflight_h2d_requests: set[str] = set() + def _connect_request_ids(self, p_reqid: str, d_reqid: str) -> None: + self._decode_req_id_to_prefill_req_id[d_reqid] = p_reqid + 
self._prefill_req_id_to_decode_req_id[p_reqid] = d_reqid + ############################################################ # Scheduler Side Methods ############################################################ @@ -359,37 +518,88 @@ def build_decode_meta( scheduler_output (SchedulerOutput): The scheduler output. output_meta (CPUConnectorMetadata): The output metadata. """ - logger.error("build_decode_meta() not implemented, running a debug implementation!") - pass - + updated_decode_req_metas = {} + for req_meta in self._decode_req_metas.values(): + if not req_meta.is_ready: + updated_decode_req_metas[req_meta.req_id] = req_meta + # NOTE (ApostaC): Even if the request is not ready, we still + # want the worker connector to know about it, so that it can + # connector the decode request id to the prefill request id + output_meta.add_decode(req_meta) + self._decode_req_metas = updated_decode_req_metas def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int) -> tuple[int, bool]: + # NOTE(ApostaC): For a single request, this function will be called + # two times if the first time we returned async_load flag as True. + # The second time will be the "real schedule" time + + if self.kv_role == "kv_producer": + return 0, False + kv_transfer_params = request.kv_transfer_params num_tokens = len(request.prompt_token_ids) request_id = request.request_id - if "prefill_request_id" not in kv_transfer_params: - logger.warning("Request %s does not have prefill_request_id", request.req_id) + logger.info("For request %s, num_computed_tokens is %d, " + "total_num_tokens is %d", request_id, num_computed_tokens, + num_tokens) + + if request.request_id in self._should_be_ready_reqs: + self._should_be_ready_reqs.remove(request.request_id) + return 0, False + + if kv_transfer_params is None or "prefill_request_id" not in kv_transfer_params: + logger.warning("Request %s does not have prefill_request_id", request.request_id) #return 0, False - logger.warning("NOW DEBUGGING SET THE REQUEST TO HAVE A PREFILL ID") - # Set the prefill_request_id to the request id + # DEBUG: Set the prefill_request_id to the request id # This is a temporary fix to make the code work - self._decode_req_id_to_prefill_req_id[request_id] = request.request_id - return num_tokens, True + self._should_be_ready_reqs.add(request_id) + self._connect_request_ids(request_id, request_id) + return num_tokens // self._block_size * self._block_size, True + prefill_request_id = kv_transfer_params["prefill_request_id"] - self._decode_req_id_to_prefill_req_id[request_id] = prefill_request_id - return num_tokens, True + self._connect_request_ids(prefill_request_id, request_id) + self._should_be_ready_reqs.add(request_id) + + # NOTE: because the scheduler wants here to return "full blocks" if + # the async flag is true (see _update_waiting_for_remote_kv in + # scheduler.py). We need to carefully deal with it when copying + # the KV cache at worker side + return num_tokens // self._block_size * self._block_size, True def update_state_after_alloc( self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int) -> None: - print("In update_state_after_alloc") - breakpoint() - pass + """Update the state of the request after allocation. + """ + # NOTE(ApostaC): This function is called twice for the same request + # when we are using async loading. The first time is we got all the + # external "hit" blocks in `blocks`, and the second time we will have + # the remaining "last" block as a newly allocated block. 
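        # Rough timeline of the async-load path handled below (two calls
        # for the same request):
        #   1st call: the externally cached blocks are allocated
        #             -> create DecodeReqMeta(..., is_ready=False)
        #   2nd call: the remaining "last" block is allocated
        #             -> mark the existing DecodeReqMeta as is_ready=True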
+ if self.kv_role == "kv_producer": + return + + if request.request_id in self._decode_req_metas: + # This is the second time we are called for the same request + # We need to mark the request as "ready" + self._decode_req_metas[request.request_id].is_ready = True + return + + p_req_id = self._decode_req_id_to_prefill_req_id[request.request_id] + block_ids = [] + for blks in blocks.get_block_ids(): + block_ids.extend(blks) + req_meta = DecodeReqMeta( + req_id = request.request_id, + prefill_req_id = p_req_id, + block_ids = block_ids, + skip_leading_tokens = 0, + is_ready = False) + self._decode_req_metas[request.request_id] = req_meta def build_connector_meta( self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata: @@ -455,8 +665,50 @@ def start_load_kv(self, forward_context: "ForwardContext", the same. """ - pass + if self.kv_role == "kv_producer": + # encoder side + return + + meta = self._get_connector_metadata() + assert isinstance(meta, CPUConnectorMetadata), \ + "Connector metadata is not of type CPUConnectorMetadata" + for decode_meta in meta.decode_meta: + self._connect_request_ids( + decode_meta.prefill_req_id, + decode_meta.req_id) + if not decode_meta.is_ready: + continue + + total_expected_tokens = len(decode_meta.block_ids) * \ + self._block_size + + self._inflight_h2d_requests.add(decode_meta.prefill_req_id) + for layer_id in range(len(self._gpu_kv_caches)): + decode_specs = self._kv_receiver.get_kv_specs( + decode_meta.prefill_req_id, + layer_id) + layer_name = self._layer_id_to_name[layer_id] + dst_layer = self._gpu_kv_caches[layer_name] + for decode_spec in decode_specs: + start = decode_spec.start + stop = min(decode_spec.stop, total_expected_tokens) + if start >= total_expected_tokens: + continue + src_buffer = decode_spec.buffer + block_ids = decode_meta.block_ids + + with torch.cuda.stream(self._cuda_stream): + h2d_page_copy( + src_buffer, + dst_layer, + block_ids, + start, + stop, + self._block_size) + event = torch.cuda.Event() + event.record(self._cuda_stream) + self._decoder_cuda_events.append(event) def wait_for_layer_load(self, layer_name: str) -> None: """ @@ -469,7 +721,18 @@ def wait_for_layer_load(self, layer_name: str) -> None: Args: layer_name: the name of that layer """ - pass + if self.kv_role == "kv_producer": + # encoder side + return + + layer_id = self._get_layer_id(layer_name) + self._decoder_cuda_events[layer_id].synchronize() + + if layer_id == len(self._gpu_kv_caches) - 1: + # Free the memory for the whole request + for p_req_id in self._inflight_h2d_requests: + logger.info("Freeing request %s", p_req_id) + self._kv_receiver.free_request(p_req_id) @_lmcache_nvtx_annotate def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, @@ -486,6 +749,10 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata (AttentionMetadata): the attention metadata. **kwargs: additional arguments for the save operation. """ + if self.kv_role == "kv_consumer": + # decoder side + return + meta = self._get_connector_metadata() assert isinstance(meta, CPUConnectorMetadata), \ "Connector metadata is not of type CPUConnectorMetadata" @@ -539,6 +806,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, # TODO(ApostaC): Potential optimizations # 1. coalesce the d2h page copy to a single call # 2. use a single cuda event instead of a list of cuda events + # 3. 
use a cuda event pool to prevent the creation overhead @@ -551,6 +819,10 @@ def wait_for_save(self): This prevents overwrites of paged KV buffer before saving done. """ + if self.kv_role == "kv_consumer": + # decoder side + return + # Check the task states and send the tasks for task in self._inflight_copy_tasks: if task.cuda_event is not None: @@ -571,7 +843,22 @@ def get_finished( The finished saves/sends req ids must belong to a set provided in a call to this method (this call or a prior one). """ - return None, None + if self.kv_role == "kv_consumer": + # decoder side + self._kv_receiver.progress() + p_ready_reqs = self._kv_receiver.get_finished(len(self._gpu_kv_caches)) + ret = set() + # TODO: Bug here: we need to send the prefill request id from scheduler + # connector to the worker connector in kv_params + for p_req_id in p_ready_reqs: + ret.add(self._prefill_req_id_to_decode_req_id[p_req_id]) + + if ret: + logger.info("Got finished requests: %s", ret) + + return None, ret + else: + return None, None def close(self): """ @@ -583,3 +870,5 @@ def close(self): """ if hasattr(self, "_kv_sender") and self._kv_sender is not None: self._kv_sender.close() + if hasattr(self, "_kv_receiver") and self._kv_receiver is not None: + self._kv_receiver.close() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index f17620e78991..329c4509e7fd 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -797,6 +797,8 @@ def stop_handshake_listener(self) -> None: self._handshake_listener_t = None def close(self): + logger.info("Watermark information before closing: (low: %d, high: %d)", + self._allocator.low_watermark, self._allocator.high_watermark) self.stop_handshake_listener() self._nixl_wrapper.deregister_memory(self._reg_dlist) del self._nixl_wrapper From 005d5c1a38df32624b32e399f957a0781f4a93ed Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 14:41:02 -0700 Subject: [PATCH 12/28] [Add] remove the hard-coded host and port Signed-off-by: ApostaC --- .../kv_connector/v1/cpu_connector.py | 33 ++- .../kv_connector/v1/request_tracker.py | 221 ------------------ 2 files changed, 30 insertions(+), 224 deletions(-) delete mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index a56bd55103c5..5af67ab87a9d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -407,6 +407,26 @@ def add_decode(self, decode_meta: DecodeReqMeta) -> None: """ self.decode_meta.append(decode_meta) +def validate_kv_transfer_config( + kv_transfer_config: Optional["KVTransferConfig"]) -> None: + """Validate the KV transfer configuration. + It expects the host and port configuration in the kv_connector_extra_config + + Args: + kv_transfer_config (Optional[KVTransferConfig]): The KV transfer + configuration to validate. + + Raises: + AssertionError: If the configuration is invalid. 
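    Example (an illustrative config; the host/port values are assumed):

        kv_connector_extra_config = {"host": "localhost", "port": "54321"}

    A string port is accepted; CPUConnector converts it to an int during
    initialization.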
+ """ + assert kv_transfer_config is not None, \ + "KV transfer config is not set in the vLLM config" + + extra_config = kv_transfer_config.kv_connector_extra_config + assert "host" in extra_config, \ + "CPUConnector: must have 'host' in kv_connector_extra_config" + assert "port" in extra_config, \ + "CPUConnector: must have 'port' in kv_connector_extra_config" class CPUConnector(KVConnectorBase_V1): """CPUKVConnector is an implementation of KVConnectorBase_V1 that @@ -416,6 +436,14 @@ class CPUConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: super().__init__(vllm_config, role) + validate_kv_transfer_config(vllm_config.kv_transfer_config) + extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config + self._host = extra_config["host"] + self._port = extra_config["port"] + if isinstance(self._port, str): + # Convert the port to an integer if it's a string + self._port = int(self._port) + self.kv_role = vllm_config.kv_transfer_config.kv_role self._block_size = vllm_config.cache_config.block_size @@ -771,11 +799,10 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, ) # Create a destination spec - # TODO: remove the hard-code here dest_spec = DestinationSpec( rank=get_tensor_model_parallel_rank(), - host="localhost", - base_port=54321, # Changed from string to int to match the class definition + host=self._host, + base_port=self._port, ) # Create the send task diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py b/vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py deleted file mode 100644 index 66f87100ce83..000000000000 --- a/vllm/distributed/kv_transfer/kv_connector/v1/request_tracker.py +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Adpoted from LMCache https://github.com/LMCache/LMCache - -import threading -from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional - -import torch -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) -from vllm.logger import init_logger -from vllm.utils import cdiv, make_zmq_socket -from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata - from vllm.forward_context import ForwardContext - from vllm.v1.core.kv_cache_manager import KVCacheManager - from vllm.v1.core.sched.output import CachedRequestData, NewRequestData - from vllm.v1.request import Request - -logger = init_logger(__name__) - - -@dataclass -class LoadSpec: - # Number of tokens cached in vLLM - vllm_cached_tokens: int - # Number of tokens that are cached in LMCache - external_cached_tokens: int - # Whether the scheduler allow us to load the tokens - can_load: bool - - -@dataclass -class SaveSpec: - # Skip already saved tokens - skip_leading_tokens: int - # Whether the scheduler allow us to save the tokens - can_save: bool - - -@dataclass -class RequestTracker: - # Request id - req_id: str - - # The token ids that has been scheduled so far - token_ids: list[int] - - # The block ids that has been allocated so far - # NOTE: allocated blocks could be more than the number of tokens - # FIXME: need to check whether the block ids will be changed after - # preemption - allocated_block_ids: list[int] - - # The number of tokens that has been savd - 
num_saved_tokens: int = 0 - - @staticmethod - def from_new_request( - new_request: "NewRequestData", - num_tokens_to_compute: int, - ) -> "RequestTracker": - """Create the request tracker from a new request. - - Args: - new_request (NewRequestData): the new request data. - num_tokens_to_compute (int): the number of tokens that will - be 'computed', including the `num_computed_tokens` (vLLM's - local cache hit) and new tokens that will be scheduled. - - """ - # vLLM 0.9.0 update: request.block_ids changed from list[int] to - # list[list[int]] - # Need to check the type of request.block_ids - - unfolded_block_ids = [] - - if not isinstance(new_request.block_ids[0], list): - unfolded_block_ids = new_request.block_ids.copy() - else: - # According to the vLLM code - # (https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/ - # sched/scheduler.py#L943), - # only one KVCacheGroup is supported in connector for now. - - # TODO: Please support multiple KVCacheGroup in connector. - # NOTE: Also, `update` method in RequestTracker should be - # updated accordingly. - unfolded_block_ids = new_request.block_ids[0].copy() - - return RequestTracker( - req_id=new_request.req_id, - token_ids=new_request.prompt_token_ids[:num_tokens_to_compute]. - copy(), - allocated_block_ids=unfolded_block_ids, - num_saved_tokens=0, - ) - - def update( - self, - cached_request: "CachedRequestData", - ) -> None: - """Update the request tracker when a running request is - scheduled again - """ - self.token_ids.extend(cached_request.new_token_ids) - new_block_ids: list[int] - if not isinstance(cached_request.new_block_ids[0], list): - new_block_ids = cached_request.new_block_ids - else: - new_block_ids = cached_request.new_block_ids[0] - self.allocated_block_ids.extend(new_block_ids) - - -@dataclass -class ReqMeta: - # Request id - req_id: str - # Request tokens - token_ids: torch.Tensor - # Block ids - block_ids: torch.Tensor - # Slot mapping - slot_mapping: torch.Tensor - # Skip save or not - save_spec: Optional[SaveSpec] = None - # load_spec - load_spec: Optional[LoadSpec] = None - - @staticmethod - def from_request_tracker( - tracker: RequestTracker, - block_size: int, - load_spec: Optional[LoadSpec] = None, - skip_save: bool = False, - ) -> Optional["ReqMeta"]: - """Create the request metadata from a request tracker. - - Args: - tracker (RequestTracker): the request tracker. - block_size (int): the block size in vLLM. - load_spec (Optional[LoadSpec]): the load spec for KV cache loading. - skip_save (bool): whether to skip the save operation. - - Returns: - the request metadata if we need to perform load/save - operations, None otherwise. - """ - input_token_ids = tracker.token_ids - input_token_len = len(input_token_ids) - - if skip_save and load_spec is None: - return None - - num_tokens_to_save = input_token_len - skip_leading_tokens = tracker.num_saved_tokens - - # If we need to save, update the number of saved tokens - if not skip_save: - tracker.num_saved_tokens = num_tokens_to_save - save_spec = SaveSpec(skip_leading_tokens, not skip_save) - - # Calculate the token ids and slot mappings for load and save - # OPTIMIZATION: pre-allocate the buffer for token ids and block - # ids - token_ids = torch.tensor(input_token_ids)[:num_tokens_to_save] - num_blocks = len(tracker.allocated_block_ids) - block_ids = torch.tensor(tracker.allocated_block_ids, dtype=torch.long) - - if len(token_ids) > num_blocks * block_size: - logger.error( - "The number of tokens is more than the number of blocks." 
- "Something might be wrong in scheduling logic!") - logger.error("Num tokens: %d, num blocks: %d, block size: %d", - len(token_ids), num_blocks, block_size) - - block_offsets = torch.arange(0, block_size, dtype=torch.long) - slot_mapping = block_offsets.reshape((1, block_size)) + \ - block_ids.reshape((num_blocks, 1)) * block_size - - slot_mapping = slot_mapping.flatten()[:len(token_ids)] - assert slot_mapping.dtype == torch.long # TODO: this could be removed - - # For load operation: check whether the request is scheduled to load - if load_spec is not None and load_spec.can_load: - logger.debug("Scheduled to load %d tokens for request %s", - load_spec.external_cached_tokens, tracker.req_id) - else: - # Do not load if not in `can_load` state - load_spec = None - - return ReqMeta( - req_id=tracker.req_id, - token_ids=token_ids, - slot_mapping=slot_mapping, - save_spec=save_spec, - load_spec=load_spec, - ) - - -@dataclass -class GeneralKVConnectorMetadata(KVConnectorMetadata): - requests: list[ReqMeta] - - def __init__(self): - self.requests = [] - - def add_request(self, req_meta: ReqMeta) -> None: - """Add a request to the metadata. - - Args: - req_meta (ReqMeta): the request metadata. - """ - self.requests.append(req_meta) - - From 44b36be36893e113ddee0c862129b9c6ab81fe65 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 16:35:58 -0700 Subject: [PATCH 13/28] [Add] precommit fixes Signed-off-by: ApostaC --- .../cpu_kv_integration/__init__.py | 3 +- .../cpu_kv_integration/run_nsys.sh | 2 +- .../cpu_kv_integration/temptest.py | 4 +- .../test_cpu_connector_kernels.py | 314 ++++++++---------- .../cpu_kv_integration/test_nixl_cpu_utils.py | 211 ++++++------ .../test_ring_buffer_allocator.py | 50 +-- .../cpu_kv_integration/test_toy_example.py | 29 +- .../cpu_kv_integration/toy_decode.py | 54 ++- .../cpu_kv_integration/toy_decoder_manager.py | 37 ++- .../cpu_kv_integration/toy_example.py | 54 ++- .../cpu_kv_integration/toy_receiver.py | 54 +-- .../kv_connector/v1/cpu_connector.py | 291 +++++++--------- .../kv_connector/v1/cpu_connector_utils.py | 52 +-- .../kv_connector/v1/nixl_cpu_utils.py | 287 +++++++--------- 14 files changed, 659 insertions(+), 783 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/__init__.py b/tests/v1/kv_connector/cpu_kv_integration/__init__.py index 50135644f7bc..f5bc53998307 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/__init__.py +++ b/tests/v1/kv_connector/cpu_kv_integration/__init__.py @@ -1 +1,2 @@ -# Empty init file to mark directory as Python package \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 +# Empty init file to mark directory as Python package diff --git a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh index 025b780c6f1d..3b77790da39a 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh +++ b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh @@ -5,4 +5,4 @@ CUDA_VISIBLE_DEVICES=7 nsys profile \ --trace-fork-before-exec=true \ --output=prefiller \ --force-overwrite=true \ - python3 toy_example.py + python3 toy_decode.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/temptest.py b/tests/v1/kv_connector/cpu_kv_integration/temptest.py index 08d0ebfcc7fa..8a133ae7d902 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/temptest.py +++ b/tests/v1/kv_connector/cpu_kv_integration/temptest.py @@ -1,6 +1,6 @@ - +# SPDX-License-Identifier: Apache-2.0 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - 
NixlCPUReceiver, NixlKVSender, RingBufferAllocator) + NixlKVSender) sender = NixlKVSender(1024 * 1024 * 1024) diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py b/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py index 3f1e6e294767..a19ba9188019 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py @@ -3,8 +3,9 @@ import torch from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector import ( - d2h_page_copy, h2d_copy_leading_tokens, h2d_copy_trailing_tokens, h2d_page_copy -) + d2h_page_copy, h2d_copy_leading_tokens, h2d_copy_trailing_tokens, + h2d_page_copy) + @pytest.fixture def device_tensors(): @@ -14,18 +15,20 @@ def device_tensors(): page_size = 16 head_size = 8 hidden_size = 128 - + # Initialize with unique values for each position - k_tensor = torch.arange(num_blocks * page_size * head_size * hidden_size, - dtype=torch.float32, device='cuda') + k_tensor = torch.arange(num_blocks * page_size * head_size * hidden_size, + dtype=torch.float32, + device='cuda') k_tensor = k_tensor.reshape(num_blocks, page_size, head_size, hidden_size) - + v_tensor = k_tensor + 1000 # Different values for v - + # Stack k and v tensors kv_tensor = torch.stack([k_tensor, v_tensor], dim=0) return kv_tensor + @pytest.fixture def host_buffer(): """Create host buffer for testing.""" @@ -34,36 +37,34 @@ def host_buffer(): page_size = 16 head_size = 8 hidden_size = 128 - + k_buffer = torch.zeros(num_blocks * page_size * head_size * hidden_size, - dtype=torch.float32) + dtype=torch.float32) k_buffer = k_buffer.reshape(num_blocks, page_size, head_size, hidden_size) - + v_buffer = torch.zeros_like(k_buffer) - + # Stack k and v buffers kv_buffer = torch.stack([k_buffer, v_buffer], dim=0) return kv_buffer + def test_d2h_page_copy(device_tensors, host_buffer): """Test device to host copy operation.""" # Copy blocks 1 and 3 from device to host block_ids = [1, 3] - + d2h_page_copy(device_tensors, host_buffer, block_ids) - + # Verify copied data for i, block_id in enumerate(block_ids): # Check key tensor - assert torch.allclose( - host_buffer[0, i].cpu(), - device_tensors[0, block_id].cpu() - ) - # Check value tensor - assert torch.allclose( - host_buffer[1, i].cpu(), - device_tensors[1, block_id].cpu() - ) + assert torch.allclose(host_buffer[0, i].cpu(), + device_tensors[0, block_id].cpu()) + # Check value tensor + assert torch.allclose(host_buffer[1, i].cpu(), + device_tensors[1, block_id].cpu()) + def test_h2d_copy_leading_tokens(): """Test copying leading tokens from host to device.""" @@ -71,42 +72,39 @@ def test_h2d_copy_leading_tokens(): page_size = 16 head_size = 8 hidden_size = 128 - - src_buffer = torch.ones((2, 1, page_size, head_size, hidden_size), - dtype=torch.float32) + + src_buffer = torch.ones((2, 1, page_size, head_size, hidden_size), + dtype=torch.float32) # Initialize destination with a known pattern - dst_layer = torch.full((2, 1, page_size, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32, device='cuda') - + dst_layer = torch.full((2, 1, page_size, head_size, hidden_size), + fill_value=2.0, + dtype=torch.float32, + device='cuda') + # Copy first 8 tokens (half of page_size) end_position = 8 - h2d_copy_leading_tokens( - src_buffer, dst_layer, - src_block_id=0, dst_block_id=0, - end_position_in_block=end_position - ) - + h2d_copy_leading_tokens(src_buffer, + dst_layer, + src_block_id=0, + dst_block_id=0, + 
end_position_in_block=end_position) + # Verify first 8 tokens were copied - assert torch.allclose( - dst_layer[0, 0, :end_position].cpu(), - src_buffer[0, 0, :end_position] - ) - assert torch.allclose( - dst_layer[1, 0, :end_position].cpu(), - src_buffer[1, 0, :end_position] - ) - + assert torch.allclose(dst_layer[0, 0, :end_position].cpu(), + src_buffer[0, 0, :end_position]) + assert torch.allclose(dst_layer[1, 0, :end_position].cpu(), + src_buffer[1, 0, :end_position]) + # Verify remaining tokens are unchanged (should still be 2.0) - expected_unchanged = torch.full((page_size - end_position, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32) - assert torch.allclose( - dst_layer[0, 0, end_position:].cpu(), - expected_unchanged - ) - assert torch.allclose( - dst_layer[1, 0, end_position:].cpu(), - expected_unchanged - ) + expected_unchanged = torch.full( + (page_size - end_position, head_size, hidden_size), + fill_value=2.0, + dtype=torch.float32) + assert torch.allclose(dst_layer[0, 0, end_position:].cpu(), + expected_unchanged) + assert torch.allclose(dst_layer[1, 0, end_position:].cpu(), + expected_unchanged) + def test_h2d_copy_trailing_tokens(): """Test copying trailing tokens from host to device.""" @@ -114,42 +112,38 @@ def test_h2d_copy_trailing_tokens(): page_size = 16 head_size = 8 hidden_size = 128 - + src_buffer = torch.ones((2, 1, page_size, head_size, hidden_size), - dtype=torch.float32) + dtype=torch.float32) # Initialize destination with a known pattern dst_layer = torch.full((2, 1, page_size, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32, device='cuda') - + fill_value=2.0, + dtype=torch.float32, + device='cuda') + # Copy last 8 tokens (half of page_size) start_position = 8 - h2d_copy_trailing_tokens( - src_buffer, dst_layer, - src_block_id=0, dst_block_id=0, - start_position_in_block=start_position - ) - + h2d_copy_trailing_tokens(src_buffer, + dst_layer, + src_block_id=0, + dst_block_id=0, + start_position_in_block=start_position) + # Verify last 8 tokens were copied - assert torch.allclose( - dst_layer[0, 0, start_position:].cpu(), - src_buffer[0, 0, start_position:] - ) - assert torch.allclose( - dst_layer[1, 0, start_position:].cpu(), - src_buffer[1, 0, start_position:] - ) - + assert torch.allclose(dst_layer[0, 0, start_position:].cpu(), + src_buffer[0, 0, start_position:]) + assert torch.allclose(dst_layer[1, 0, start_position:].cpu(), + src_buffer[1, 0, start_position:]) + # Verify leading tokens are unchanged (should still be 2.0) expected_unchanged = torch.full((start_position, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32) - assert torch.allclose( - dst_layer[0, 0, :start_position].cpu(), - expected_unchanged - ) - assert torch.allclose( - dst_layer[1, 0, :start_position].cpu(), - expected_unchanged - ) + fill_value=2.0, + dtype=torch.float32) + assert torch.allclose(dst_layer[0, 0, :start_position].cpu(), + expected_unchanged) + assert torch.allclose(dst_layer[1, 0, :start_position].cpu(), + expected_unchanged) + def test_h2d_page_copy(): """Test host to device page copy operation.""" @@ -159,71 +153,61 @@ def test_h2d_page_copy(): head_size = 8 hidden_size = 128 block_size = page_size - + src_buffer = torch.ones((2, num_blocks, page_size, head_size, hidden_size), - dtype=torch.float32) + dtype=torch.float32) # Initialize destination with a known pattern dst_layer = torch.full((2, num_blocks, page_size, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32, device='cuda') - + fill_value=2.0, + 
dtype=torch.float32, + device='cuda') + # Test copying a range of tokens that spans multiple blocks block_ids = [0, 1, 2, 3] start_token_idx = 8 stop_token_idx = 56 - - h2d_page_copy( - src_buffer, dst_layer, block_ids, - start_token_idx, stop_token_idx, block_size - ) - + + h2d_page_copy(src_buffer, dst_layer, block_ids, start_token_idx, + stop_token_idx, block_size) + # Calculate which blocks should be fully/partially copied start_block = start_token_idx // block_size end_block = (stop_token_idx + block_size - 1) // block_size start_pos = start_token_idx % block_size end_pos = stop_token_idx % block_size - + # Expected unchanged value expected_unchanged = torch.full((page_size, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32) - + fill_value=2.0, + dtype=torch.float32) + # Verify copied and unchanged data for each block for i in range(num_blocks): if i < start_block or i >= end_block: # Blocks outside the copy range should be unchanged - assert torch.allclose( - dst_layer[:, block_ids[i]].cpu(), - expected_unchanged - ) + assert torch.allclose(dst_layer[:, block_ids[i]].cpu(), + expected_unchanged) elif i == start_block: # First block - verify both copied and unchanged parts # Leading part should be unchanged - assert torch.allclose( - dst_layer[:, block_ids[i], :start_pos].cpu(), - expected_unchanged[:start_pos] - ) + assert torch.allclose(dst_layer[:, block_ids[i], :start_pos].cpu(), + expected_unchanged[:start_pos]) # Trailing part should be copied - assert torch.allclose( - dst_layer[:, block_ids[i], start_pos:].cpu(), - src_buffer[:, i, start_pos:] - ) + assert torch.allclose(dst_layer[:, block_ids[i], start_pos:].cpu(), + src_buffer[:, i, start_pos:]) elif i == end_block - 1: # Last block - verify both copied and unchanged parts # Leading part should be copied - assert torch.allclose( - dst_layer[:, block_ids[i], :end_pos].cpu(), - src_buffer[:, i, :end_pos] - ) + assert torch.allclose(dst_layer[:, block_ids[i], :end_pos].cpu(), + src_buffer[:, i, :end_pos]) # Trailing part should be unchanged - assert torch.allclose( - dst_layer[:, block_ids[i], end_pos:].cpu(), - expected_unchanged[end_pos:] - ) + assert torch.allclose(dst_layer[:, block_ids[i], end_pos:].cpu(), + expected_unchanged[end_pos:]) else: # Middle blocks - verify full copy - assert torch.allclose( - dst_layer[:, block_ids[i]].cpu(), - src_buffer[:, i] - ) + assert torch.allclose(dst_layer[:, block_ids[i]].cpu(), + src_buffer[:, i]) + def test_h2d_page_copy_edge_cases(): """Test edge cases for host to device page copy.""" @@ -233,45 +217,38 @@ def test_h2d_page_copy_edge_cases(): head_size = 8 hidden_size = 128 block_size = page_size - + src_buffer = torch.ones((2, num_blocks, page_size, head_size, hidden_size), - dtype=torch.float32) + dtype=torch.float32) dst_layer = torch.zeros((2, num_blocks, page_size, head_size, hidden_size), - dtype=torch.float32, device='cuda') - + dtype=torch.float32, + device='cuda') + # Test case 1: Copy exactly one block block_ids = [0, 1] start_token_idx = 0 stop_token_idx = block_size - - h2d_page_copy( - src_buffer, dst_layer, block_ids, - start_token_idx, stop_token_idx, block_size - ) - - assert torch.allclose( - dst_layer[:, block_ids[0]].cpu(), - src_buffer[:, 0] - ) - + + h2d_page_copy(src_buffer, dst_layer, block_ids, start_token_idx, + stop_token_idx, block_size) + + assert torch.allclose(dst_layer[:, block_ids[0]].cpu(), src_buffer[:, 0]) + # Test case 2: Copy partial block dst_layer.zero_() block_ids = [0, 1] start_token_idx = block_size + 2 stop_token_idx = 
block_size + 6 - - h2d_page_copy( - src_buffer, dst_layer, block_ids, - start_token_idx, stop_token_idx, block_size - ) - + + h2d_page_copy(src_buffer, dst_layer, block_ids, start_token_idx, + stop_token_idx, block_size) + start_pos = start_token_idx % block_size end_pos = stop_token_idx % block_size - - assert torch.allclose( - dst_layer[:, block_ids[1], start_pos:end_pos].cpu(), - src_buffer[:, 1, start_pos:end_pos] - ) + + assert torch.allclose(dst_layer[:, block_ids[1], start_pos:end_pos].cpu(), + src_buffer[:, 1, start_pos:end_pos]) + def test_h2d_page_copy_aligned(): """Test host to device page copy operation with block-aligned boundaries.""" @@ -281,63 +258,58 @@ def test_h2d_page_copy_aligned(): head_size = 8 hidden_size = 128 block_size = page_size - + src_buffer = torch.ones((2, num_blocks, page_size, head_size, hidden_size), - dtype=torch.float32) + dtype=torch.float32) # Initialize destination with a known pattern dst_layer = torch.full((2, num_blocks, page_size, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32, device='cuda') - + fill_value=2.0, + dtype=torch.float32, + device='cuda') + # Test copying exactly 2 blocks (from block 1 to block 3) block_ids = [0, 1, 2, 3] start_token_idx = block_size # Start at beginning of block 1 stop_token_idx = block_size * 3 # End at end of block 2 - - h2d_page_copy( - src_buffer, dst_layer, block_ids, - start_token_idx, stop_token_idx, block_size - ) - + + h2d_page_copy(src_buffer, dst_layer, block_ids, start_token_idx, + stop_token_idx, block_size) + # Expected unchanged value expected_unchanged = torch.full((page_size, head_size, hidden_size), - fill_value=2.0, dtype=torch.float32) - + fill_value=2.0, + dtype=torch.float32) + # Verify copied and unchanged data for each block for i in range(num_blocks): if i == 0 or i == 3: # First and last blocks should be unchanged assert torch.allclose( dst_layer[:, block_ids[i]].cpu(), - expected_unchanged - ), f"Block {i} should be unchanged" + expected_unchanged), f"Block {i} should be unchanged" else: # Middle blocks (1 and 2) should be fully copied assert torch.allclose( dst_layer[:, block_ids[i]].cpu(), - src_buffer[:, i] - ), f"Block {i} should be fully copied" - + src_buffer[:, i]), f"Block {i} should be fully copied" + # Test copying a single block-aligned region dst_layer.fill_(2.0) # Reset destination start_token_idx = block_size * 2 # Start at beginning of block 2 stop_token_idx = block_size * 3 # End at end of block 2 - - h2d_page_copy( - src_buffer, dst_layer, block_ids, - start_token_idx, stop_token_idx, block_size - ) - + + h2d_page_copy(src_buffer, dst_layer, block_ids, start_token_idx, + stop_token_idx, block_size) + # Verify only block 2 was copied, others unchanged for i in range(num_blocks): if i == 2: # Block 2 should be fully copied assert torch.allclose( dst_layer[:, block_ids[i]].cpu(), - src_buffer[:, i] - ), "Block 2 should be fully copied" + src_buffer[:, i]), "Block 2 should be fully copied" else: # All other blocks should be unchanged assert torch.allclose( dst_layer[:, block_ids[i]].cpu(), - expected_unchanged - ), f"Block {i} should be unchanged" + expected_unchanged), f"Block {i} should be unchanged" diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py index 6ec039d54ddf..157daab4ce13 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -1,14 +1,13 @@ # 
SPDX-License-Identifier: Apache-2.0 -import torch.multiprocessing as mp +import time + import pytest import torch -import threading -import time +import torch.multiprocessing as mp from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlCPUSender, NixlCPUReceiver, SourceSpec, DestinationSpec, - RingBufferAllocator -) + DestinationSpec, NixlCPUReceiver, NixlCPUSender, RingBufferAllocator, + SourceSpec) try: from nixl._api import nixl_agent as NixlWrapper @@ -16,7 +15,9 @@ except ImportError: NIXL_AVAILABLE = False -def run_receiver(buffer_config, host, base_port, rank, ready_event, stop_event): + +def run_receiver(buffer_config, host, base_port, rank, ready_event, + stop_event): """Process function for running the receiver.""" try: # Mock tensor_model_parallel_rank for this process @@ -26,29 +27,28 @@ def run_receiver(buffer_config, host, base_port, rank, ready_event, stop_event): # Create ring buffer allocator allocator = utils.RingBufferAllocator( size=buffer_config['buffer_size'], - align_to=buffer_config['nixl_page_size'] - ) + align_to=buffer_config['nixl_page_size']) # Create and start receiver receiver = NixlCPUReceiver( allocator=allocator, - nixl_page_size=buffer_config['nixl_page_size'] - ) + nixl_page_size=buffer_config['nixl_page_size']) receiver.start_handshake_listener(host, base_port) - + # Signal receiver is ready ready_event.set() - + # Wait for stop signal stop_event.wait() - + # Cleanup receiver.stop_handshake_listener() - + except Exception as e: print(f"Receiver process error: {e}") raise + def run_sender(buffer_config, host, base_port, rank, receiver_ready_event): """Process function for running the sender.""" try: @@ -59,38 +59,38 @@ def run_sender(buffer_config, host, base_port, rank, receiver_ready_event): # Create ring buffer allocator allocator = utils.RingBufferAllocator( size=buffer_config['buffer_size'], - align_to=buffer_config['nixl_page_size'] - ) + align_to=buffer_config['nixl_page_size']) # Wait for receiver to be ready receiver_ready_event.wait() # Create sender and perform handshake - sender = NixlCPUSender( - buffer_size=buffer_config['buffer_size'], - buffer_ptr=allocator.get_buffer_ptr(), - nixl_page_size=buffer_config['nixl_page_size'] - ) - - dest_spec = DestinationSpec( - rank=rank, - host=host, - base_port=base_port - ) + sender = NixlCPUSender(buffer_size=buffer_config['buffer_size'], + buffer_ptr=allocator.get_buffer_ptr(), + nixl_page_size=buffer_config['nixl_page_size']) + + dest_spec = DestinationSpec(rank=rank, host=host, base_port=base_port) sender._nixl_handshake(dest_spec) - + # Verify handshake results assert dest_spec.get_id() in sender._remote_agents assert sender._remote_agents[dest_spec.get_id()] is not None peer_name = sender._remote_agents[dest_spec.get_id()] assert sender._remote_xfer_handlers[peer_name] is not None - + return True except Exception as e: print(f"Sender process error: {e}") raise -def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event, stop_event, progress_interval=0.001): + +def run_receiver_with_progress(buffer_config, + host, + base_port, + rank, + ready_event, + stop_event, + progress_interval=0.001): """Process function for running the receiver with progress loop.""" try: # Mock tensor_model_parallel_rank for this process @@ -100,33 +100,31 @@ def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event # Create ring buffer allocator allocator = utils.RingBufferAllocator( size=buffer_config['buffer_size'], - 
align_to=buffer_config['nixl_page_size'] - ) + align_to=buffer_config['nixl_page_size']) allocator._buffer.fill_(0) # Create and start receiver receiver = NixlCPUReceiver( allocator=allocator, - nixl_page_size=buffer_config['nixl_page_size'] - ) + nixl_page_size=buffer_config['nixl_page_size']) receiver.start_handshake_listener(host, base_port) - + # Signal receiver is ready ready_event.set() - + # Run progress loop until stop signal while not receiver.get_finished(): receiver.progress() time.sleep(progress_interval) - finished = receiver.get_finished(clear = True) + finished = receiver.get_finished(clear=True) assert len(finished) == 1 source_spec, vaddr = finished[0] paddr = allocator.virtual_to_physical(vaddr) # Check if the numbers are all correct (should be uint8 all 1) num_elements = source_spec.get_size() - should_1 = allocator._buffer[paddr : paddr + num_elements] + should_1 = allocator._buffer[paddr:paddr + num_elements] should_0_a = allocator._buffer[:paddr] should_0_b = allocator._buffer[paddr + num_elements:] assert (should_1 == 1).all(), "Buffer data mismatch" @@ -138,15 +136,17 @@ def run_receiver_with_progress(buffer_config, host, base_port, rank, ready_event while not stop_event.is_set(): receiver.progress() time.sleep(progress_interval) - + # Cleanup receiver.stop_handshake_listener() - + except Exception as e: print(f"Receiver process error: {e}") raise -def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_ready_event, success_event): + +def run_sender_with_protocol(buffer_config, host, base_port, rank, + receiver_ready_event, success_event): """Process function for running the sender with protocol communication.""" try: # Mock tensor_model_parallel_rank for this process @@ -156,27 +156,20 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read # Create ring buffer allocator allocator = utils.RingBufferAllocator( size=buffer_config['buffer_size'], - align_to=buffer_config['nixl_page_size'] - ) + align_to=buffer_config['nixl_page_size']) # Wait for receiver to be ready receiver_ready_event.wait() # Create sender - sender = NixlCPUSender( - buffer_size=buffer_config['buffer_size'], - buffer_ptr=allocator.get_buffer_ptr(), - nixl_page_size=buffer_config['nixl_page_size'] - ) - + sender = NixlCPUSender(buffer_size=buffer_config['buffer_size'], + buffer_ptr=allocator.get_buffer_ptr(), + nixl_page_size=buffer_config['nixl_page_size']) + # Create destination spec and perform handshake - dest_spec = DestinationSpec( - rank=rank, - host=host, - base_port=base_port - ) + dest_spec = DestinationSpec(rank=rank, host=host, base_port=base_port) sender._nixl_handshake(dest_spec) - + # Create source spec and prepare send source_spec = SourceSpec( request_id="test_request", @@ -187,14 +180,14 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read dtype_str="bfloat16", # Example dtype num_all_tokens=16, ) - + # Prepare send and wait for completion uid = sender.prepare_send(source_spec, dest_spec) - + max_retries = 100 retry_count = 0 remote_agent = None - + while retry_count < max_retries: remote_agent, receiver_paddr = \ sender.check_and_remove_prepared_send(uid) @@ -212,9 +205,8 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read buffer.fill_(1) # Fill with dummy data - handle = sender.send( - paddr, receiver_paddr, source_spec.get_size(), - uid, dest_spec) + handle = sender.send(paddr, receiver_paddr, source_spec.get_size(), + uid, dest_spec) while not 
sender.is_send_finished(handle): time.sleep(0.1) @@ -222,11 +214,12 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, receiver_read if remote_agent is not None: success_event.set() - + except Exception as e: print(f"Sender process error: {e}") raise + @pytest.mark.skipif(not NIXL_AVAILABLE, reason="NIXL is not available") class TestNixlCPUUtils: """Test cases for NixlCPUSender and NixlCPUReceiver.""" @@ -240,28 +233,28 @@ def setup_class(cls): def buffer_config(self): """Common buffer configuration for tests.""" buffer_size = 1 << 20 # 1MB - torch_buffer = torch.zeros(buffer_size, dtype=torch.uint8, device='cpu') - + torch_buffer = torch.zeros(buffer_size, + dtype=torch.uint8, + device='cpu') + return { 'buffer_size': buffer_size, - 'buffer_ptr': torch_buffer.data_ptr(), + 'buffer_ptr': torch_buffer.data_ptr(), 'nixl_page_size': 4096 # Standard page size } def test_sender_creation(self, buffer_config): """Test creation of NixlCPUSender.""" - sender = NixlCPUSender( - buffer_size=buffer_config['buffer_size'], - buffer_ptr=buffer_config['buffer_ptr'], - nixl_page_size=buffer_config['nixl_page_size'] - ) - + sender = NixlCPUSender(buffer_size=buffer_config['buffer_size'], + buffer_ptr=buffer_config['buffer_ptr'], + nixl_page_size=buffer_config['nixl_page_size']) + # Verify internal state assert sender._buffer_size == buffer_config['buffer_size'] assert sender._buffer_ptr == buffer_config['buffer_ptr'] assert sender._nixl_page_size == buffer_config['nixl_page_size'] assert isinstance(sender._remote_agents, dict) - + # Verify NIXL initialization assert sender._nixl_wrapper is not None assert sender._reg_dlist is not None @@ -272,14 +265,12 @@ def test_receiver_creation(self, buffer_config): # Create ring buffer allocator allocator = RingBufferAllocator( size=buffer_config['buffer_size'], - align_to=buffer_config['nixl_page_size'] - ) + align_to=buffer_config['nixl_page_size']) receiver = NixlCPUReceiver( allocator=allocator, - nixl_page_size=buffer_config['nixl_page_size'] - ) - + nixl_page_size=buffer_config['nixl_page_size']) + # Verify internal state assert receiver._buffer_size == buffer_config['buffer_size'] assert receiver._buffer_ptr == allocator.get_buffer_ptr() @@ -287,7 +278,7 @@ def test_receiver_creation(self, buffer_config): assert isinstance(receiver._inflight_requests, dict) assert isinstance(receiver._inflight_request_vaddr, dict) assert receiver._allocator is allocator - + # Verify NIXL initialization assert receiver._nixl_wrapper is not None assert receiver._reg_dlist is not None @@ -295,17 +286,16 @@ def test_receiver_creation(self, buffer_config): def test_creation_with_invalid_buffer_size(self, buffer_config): """Test creation with invalid buffer size.""" - with pytest.raises(Exception): # Specific exception type depends on NIXL implementation + with pytest.raises( + Exception + ): # Specific exception type depends on NIXL implementation # Create allocator with invalid size allocator = RingBufferAllocator( size=0, # Invalid size - align_to=buffer_config['nixl_page_size'] - ) - - NixlCPUReceiver( - allocator=allocator, - nixl_page_size=buffer_config['nixl_page_size'] - ) + align_to=buffer_config['nixl_page_size']) + + NixlCPUReceiver(allocator=allocator, + nixl_page_size=buffer_config['nixl_page_size']) def test_nixl_handshake_multiprocess(self, buffer_config): """Test NIXL handshake between sender and receiver in separate processes.""" @@ -322,31 +312,29 @@ def test_nixl_handshake_multiprocess(self, buffer_config): stop_receiver = mp.Event() # Start 
receiver process - receiver_process = mp.Process( - target=run_receiver, - args=(buffer_config, test_host, test_base_port, - test_rank, receiver_ready, stop_receiver) - ) + receiver_process = mp.Process(target=run_receiver, + args=(buffer_config, test_host, + test_base_port, test_rank, + receiver_ready, stop_receiver)) receiver_process.start() # Start sender process - sender_process = mp.Process( - target=run_sender, - args=(buffer_config, test_host, test_base_port, - test_rank, receiver_ready) - ) + sender_process = mp.Process(target=run_sender, + args=(buffer_config, test_host, + test_base_port, test_rank, + receiver_ready)) sender_process.start() try: # Wait for processes to complete - sender_process.join(timeout = 20) + sender_process.join(timeout=20) assert sender_process.exitcode == 0, "Sender process failed" finally: # Cleanup stop_receiver.set() receiver_process.join(timeout=5) - + # Force terminate if processes haven't exited if receiver_process.is_alive(): receiver_process.terminate() @@ -372,26 +360,24 @@ def test_nixl_protocol_communication(self, buffer_config): protocol_success = mp.Event() # Start receiver process with progress loop - receiver_process = mp.Process( - target=run_receiver_with_progress, - args=(buffer_config, test_host, test_base_port, - test_rank, receiver_ready, stop_receiver) - ) + receiver_process = mp.Process(target=run_receiver_with_progress, + args=(buffer_config, test_host, + test_base_port, test_rank, + receiver_ready, stop_receiver)) receiver_process.start() # Start sender process with protocol communication - sender_process = mp.Process( - target=run_sender_with_protocol, - args=(buffer_config, test_host, test_base_port, - test_rank, receiver_ready, protocol_success) - ) + sender_process = mp.Process(target=run_sender_with_protocol, + args=(buffer_config, test_host, + test_base_port, test_rank, + receiver_ready, protocol_success)) sender_process.start() try: # Wait for protocol communication to complete protocol_complete = protocol_success.wait(timeout=20) assert protocol_complete, "Protocol communication failed or timed out" - + # Wait for sender process to complete sender_process.join(timeout=5) assert sender_process.exitcode == 0, "Sender process failed" @@ -400,12 +386,11 @@ def test_nixl_protocol_communication(self, buffer_config): # Cleanup stop_receiver.set() receiver_process.join(timeout=5) - + # Force terminate if processes haven't exited if receiver_process.is_alive(): receiver_process.terminate() if sender_process.is_alive(): sender_process.terminate() - - mp.set_start_method(old_start_method, force=True) + mp.set_start_method(old_start_method, force=True) diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py index ead0aae2f921..26051fdb3e00 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_ring_buffer_allocator.py @@ -1,14 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -import torch -from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import RingBufferAllocator +from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( + RingBufferAllocator) + def test_basic_allocation(): """Test basic allocation and deallocation behavior.""" # Create a buffer with 1024 bytes, aligned to 256 bytes allocator = RingBufferAllocator(size=1024, align_to=256) - + # Allocate 100 bytes - should be aligned to 256 addr1, buffer1 = 
allocator.allocate(100) assert addr1 >= 0 # Valid address @@ -16,26 +17,27 @@ def test_basic_allocation(): assert len(buffer1) == 100 assert allocator.high_watermark == 256 # Aligned to 256 assert allocator.low_watermark == 0 - + # Allocate another 100 bytes addr2, buffer2 = allocator.allocate(100) assert addr2 >= 0 # Valid address assert buffer2 is not None assert len(buffer2) == 100 assert allocator.high_watermark == 512 # Next aligned position - + # Verify buffers don't overlap assert buffer1.data_ptr() + len(buffer1) <= buffer2.data_ptr() + def test_alignment(): """Test that allocations are properly aligned.""" allocator = RingBufferAllocator(size=1024, align_to=256) - + # Allocate various sizes and verify alignment sizes = [10, 100, 200, 50] addresses = [] buffers = [] - + for size in sizes: addr, buf = allocator.allocate(size) assert addr >= 0 # Valid address @@ -45,24 +47,25 @@ def test_alignment(): # High watermark should always be aligned to 256 assert allocator.high_watermark % 256 == 0 + def test_wraparound(): """Test buffer wraparound behavior.""" allocator = RingBufferAllocator(size=1024, align_to=256) - + # Fill most of the buffer addr1, buffer1 = allocator.allocate(300) # Takes 512 bytes aligned addr2, buffer2 = allocator.allocate(300) # Takes 512 bytes aligned assert addr1 >= 0 and addr2 >= 0 # Valid addresses assert buffer1 is not None and buffer2 is not None - + # This allocation should fail as we don't have enough contiguous space addr3, buffer3 = allocator.allocate(300) assert addr3 == -1 # Invalid address assert buffer3 is None - + # Free the first buffer allocator.free(addr1) # Free first 512 bytes - + # Now we should be able to allocate again by wrapping around addr4, buffer4 = allocator.allocate(200) assert addr4 >= 0 # Valid address @@ -70,25 +73,26 @@ def test_wraparound(): assert allocator.high_watermark >= allocator._size # Wrapped around assert allocator.high_watermark % allocator._size < 512 # Using freed space + def test_fragmentation(): """Test handling of fragmentation.""" allocator = RingBufferAllocator(size=1024, align_to=256) - + # Allocate several buffers addr1, buffer1 = allocator.allocate(100) # 256 bytes aligned addr2, buffer2 = allocator.allocate(100) # 256 bytes aligned addr3, buffer3 = allocator.allocate(100) # 256 bytes aligned assert all(addr >= 0 for addr in [addr1, addr2, addr3]) # Valid addresses assert all(buf is not None for buf in [buffer1, buffer2, buffer3]) - + # Free buffer2, creating a gap allocator.free(addr2) # Free middle buffer - + # Try to allocate a buffer larger than the gap addr4, buffer4 = allocator.allocate(300) assert addr4 == -1 # Invalid address assert buffer4 is None # Should fail due to fragmentation - + # Allocate a buffer that fits in the gap # This should also fail as we don't track gaps in current implementation addr5, buffer5 = allocator.allocate(100) @@ -105,10 +109,11 @@ def test_fragmentation(): assert allocator.high_watermark >= allocator._size # Wrapped around assert allocator.high_watermark % allocator._size < 512 # Using freed space + def test_full_buffer(): """Test behavior when buffer is completely full.""" allocator = RingBufferAllocator(size=1024, align_to=256) - + # Fill the entire buffer addresses = [] buffers = [] @@ -118,30 +123,31 @@ def test_full_buffer(): break addresses.append(addr) buffers.append(buf) - + # Verify we can't allocate more addr, buf = allocator.allocate(10) assert addr == -1 assert buf is None - + # Free everything for addr in addresses: allocator.free(addr) - + # Should be able to 
allocate again addr, buffer = allocator.allocate(200) assert addr >= 0 # Valid address assert buffer is not None + def test_invalid_free(): """Test that freeing invalid addresses raises an error.""" allocator = RingBufferAllocator(size=1024, align_to=256) - + # Allocate a buffer addr, buffer = allocator.allocate(100) assert addr >= 0 # Valid address assert buffer is not None - + # Try to free an invalid address with pytest.raises(AssertionError): - allocator.free(100) # Invalid address + allocator.free(100) # Invalid address diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py index 1d8e914c9d43..824110e087af 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py @@ -1,16 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 import os + import pytest + from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig + @pytest.fixture def env_setup(): """Set up required environment variables""" os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + @pytest.fixture def input_prompts(): """Create test prompts""" @@ -25,6 +29,7 @@ def input_prompts(): context4 + "The capital of China is", ] + @pytest.fixture def llm_instance(): """Create LLM instance with test configuration""" @@ -43,38 +48,42 @@ def llm_instance(): block_size=64, ) + def test_llm_generation(env_setup, input_prompts, llm_instance, tmp_path): """Test LLM generation and output saving""" # Configure sampling parameters sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - + # Generate outputs outputs = llm_instance.generate(input_prompts, sampling_params) - + # Verify outputs - assert len(outputs) == len(input_prompts), "Number of outputs should match number of prompts" - + assert len(outputs) == len( + input_prompts), "Number of outputs should match number of prompts" + # Process outputs new_prompts = [] for output in outputs: assert hasattr(output, 'prompt'), "Output should have prompt attribute" - assert hasattr(output, 'outputs'), "Output should have outputs attribute" + assert hasattr(output, + 'outputs'), "Output should have outputs attribute" assert len(output.outputs) > 0, "Output should have generated text" - + prompt = output.prompt generated_text = output.outputs[0].text new_prompts.append(prompt + generated_text) - + # Test file writing output_file = tmp_path / "output.txt" with open(output_file, "w") as f: for prompt in new_prompts: f.write(prompt + "\n") - + # Verify file contents assert output_file.exists(), "Output file should be created" - with open(output_file, "r") as f: + with open(output_file) as f: lines = f.readlines() - assert len(lines) == len(input_prompts), "File should contain all prompts" + assert len(lines) == len( + input_prompts), "File should contain all prompts" for line in lines: assert line.strip(), "Lines should not be empty" diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index 769075d8a258..a5a9f7d63cd5 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -2,68 +2,66 @@ import os -# VLLM_ENABLE_V1_MULTIPROCESSING=0 +# VLLM_ENABLE_V1_MULTIPROCESSING=0 # VLLM_WORKER_MULTIPROC_METHOD=spawn os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" 
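# NOTE (illustrative sketch, not part of this patch): the ring-buffer tests
# above assert a simple alignment invariant -- every allocation advances the
# high watermark to the next multiple of ``align_to``. The helper below only
# captures that rule; the real RingBufferAllocator additionally tracks the
# low watermark, free(), and wraparound, which are omitted here.
def aligned_high_watermark(current: int, size: int, align_to: int = 256) -> int:
    # Round the requested size up to the alignment boundary, then advance.
    return current + -(-size // align_to) * align_to

# Matches test_basic_allocation: allocate(100) moves the watermark 0 -> 256,
# and a second allocate(100) moves it 256 -> 512.
assert aligned_high_watermark(0, 100) == 256
assert aligned_high_watermark(256, 100) == 512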
-import time -import torch from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlCPUReceiver, RingBufferAllocator) - - if __name__ == "__main__": context = "Hi " * 1000 context2 = "Hi " * 1000 - context3 = "Hello " * 1000 + context3 = "Hello " * 1000 context4 = "How " * 1000 prompts = [ context + "Hello, my name is", - context2+ "The capital of France is", + context2 + "The capital of France is", context3 + "Your name is", context4 + "The capital of China is", ] - + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - - llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig( - kv_connector = "CPUConnector", - kv_role = "kv_consumer", - kv_connector_extra_config = {}, - ), - #load_format="dummy", - max_model_len=2048, - max_num_batched_tokens=2048, - block_size=128, - ) - + + llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector="CPUConnector", + kv_role="kv_consumer", + kv_connector_extra_config={ + "host": "localhost", + "port": 54321, + }, + ), + #load_format="dummy", + max_model_len=2048, + max_num_batched_tokens=2048, + block_size=128, + ) + # 1ST generation (prefill instance) outputs = llm.generate( prompts, sampling_params, ) - + new_prompts = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text new_prompts.append(prompt + generated_text) #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - + # Write new_prompts to output.txt with open("output_decode.txt", "w") as f: for prompt in new_prompts: f.write(prompt + "\n") print(f"Saved {len(new_prompts)} prompts to output.txt") - + # HACK: for offline single-process inference only # Wait for all send finishes from vllm.distributed.kv_transfer import get_kv_transfer_group diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py index 11c5867cc9dc..bd7f73c8f3bb 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -import torch.multiprocessing as mp import time +import torch.multiprocessing as mp + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlCPUReceiver, RingBufferAllocator, NixlDecodeManager -) + NixlDecodeManager) + def main(): """Main function to run the receiver.""" @@ -14,7 +15,7 @@ def main(): test_base_port = 54321 test_rank = 0 expected_layers = 32 - + # Buffer configuration buffer_size = 1 << 30 # 1GB nixl_page_size = 4096 # Standard page size @@ -26,13 +27,11 @@ def main(): utils.get_tensor_model_parallel_world_size = lambda: 1 utils.get_tp_group = lambda: None - decoder_manager = NixlDecodeManager(buffer_size, - test_host, + decoder_manager = NixlDecodeManager(buffer_size, test_host, test_base_port) - print(f"Receiver started on {test_host}:{test_base_port}") - + # Run progress loop until interrupted try: while True: @@ -45,29 +44,31 @@ def main(): for i in range(expected_layers): decode_specs = decoder_manager.get_kv_specs(req_id, i) for spec in decode_specs: - print(f"Received layer {i} tokens " - f"{spec.start} - {spec.stop} request {req_id}. " - f"The shape is {spec.buffer.shape}. 
" - f"The digest is {spec.buffer.mean()}.") + print( + f"Received layer {i} tokens " + f"{spec.start} - {spec.stop} request {req_id}. " + f"The shape is {spec.buffer.shape}. " + f"The digest is {spec.buffer.mean()}.") decoder_manager.free_request(req_id) allocator = decoder_manager._allocator - print("Allocator high/low watermark:", allocator.high_watermark, - allocator.low_watermark) + print("Allocator high/low watermark:", + allocator.high_watermark, allocator.low_watermark) time.sleep(1) # Small sleep to prevent busy waiting - + except KeyboardInterrupt: decoder_manager.close() print("\nShutting down receiver...") - + print("Receiver stopped") - + except Exception as e: print(f"Receiver error: {e}") raise + if __name__ == "__main__": # Set multiprocessing start method mp.set_start_method("spawn", force=True) - main() + main() diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index 8e42c7cfa666..fd53d0a88ea1 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -2,68 +2,66 @@ import os -# VLLM_ENABLE_V1_MULTIPROCESSING=0 +# VLLM_ENABLE_V1_MULTIPROCESSING=0 # VLLM_WORKER_MULTIPROC_METHOD=spawn os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -import time -import torch from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlCPUReceiver, RingBufferAllocator) - - if __name__ == "__main__": context = "Hi " * 1000 context2 = "Hey " * 1000 - context3 = "Hello " * 1000 + context3 = "Hello " * 1000 context4 = "How " * 1000 prompts = [ context + "Hello, my name is", - context2+ "The capital of France is", + context2 + "The capital of France is", context3 + "Your name is", context4 + "The capital of China is", ] - + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - - llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig( - kv_connector = "CPUConnector", - kv_role = "kv_producer", - kv_connector_extra_config = {}, - ), - #load_format="dummy", - max_model_len=2048, - max_num_batched_tokens=2048, - block_size=128, - ) - + + llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector="CPUConnector", + kv_role="kv_producer", + kv_connector_extra_config={ + "host": "localhost", + "port": 54321, + }, + ), + #load_format="dummy", + max_model_len=2048, + max_num_batched_tokens=2048, + block_size=128, + ) + # 1ST generation (prefill instance) outputs = llm.generate( prompts, sampling_params, ) - + new_prompts = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text new_prompts.append(prompt + generated_text) #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - + # Write new_prompts to output.txt with open("output.txt", "w") as f: for prompt in new_prompts: f.write(prompt + "\n") print(f"Saved {len(new_prompts)} prompts to output.txt") - + # HACK: for offline single-process inference only # Wait for all send finishes from vllm.distributed.kv_transfer import get_kv_transfer_group diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py index 62e518ebbe28..71a3e7525a54 
100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -import torch.multiprocessing as mp import time +import torch.multiprocessing as mp + from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlCPUReceiver, RingBufferAllocator -) + NixlCPUReceiver, RingBufferAllocator) + def main(): """Main function to run the receiver.""" @@ -13,7 +14,7 @@ def main(): test_host = "127.0.0.1" test_base_port = 54321 test_rank = 0 - + # Buffer configuration buffer_size = 1 << 30 # 1GB nixl_page_size = 4096 # Standard page size @@ -24,62 +25,61 @@ def main(): utils.get_tensor_model_parallel_rank = lambda: test_rank # Create ring buffer allocator - allocator = RingBufferAllocator( - size=buffer_size, - align_to=nixl_page_size - ) + allocator = RingBufferAllocator(size=buffer_size, + align_to=nixl_page_size) allocator._buffer.fill_(0) # Create and start receiver - receiver = NixlCPUReceiver( - allocator=allocator, - nixl_page_size=nixl_page_size - ) + receiver = NixlCPUReceiver(allocator=allocator, + nixl_page_size=nixl_page_size) receiver.start_handshake_listener(test_host, test_base_port) - + print(f"Receiver started on {test_host}:{test_base_port}") - + # Run progress loop until interrupted try: while True: receiver.progress() - + # Check for finished requests finished = receiver.get_finished(clear=True) if finished: for source_spec, vaddr in finished: - print(f"Received data from request {source_spec.request_id}") + print( + f"Received data from request {source_spec.request_id}" + ) paddr = allocator.virtual_to_physical(vaddr) - + # Verify received data num_elements = source_spec.get_size() received_data = allocator._buffer[paddr : paddr + num_elements]\ .view(source_spec.dtype)\ .reshape(source_spec.tensor_shape) - print(f"Received layer {source_spec.layer_id} tokens " - f"{source_spec.start} - {source_spec.stop} of request " - f"{source_spec.request_id}") + print( + f"Received layer {source_spec.layer_id} tokens " + f"{source_spec.start} - {source_spec.stop} of request " + f"{source_spec.request_id}") print(f"The shape is {received_data.shape}") print(f"The digest is {received_data.mean()}") allocator.free(vaddr) - - print("Allocator high/low watermark:", allocator.high_watermark, - allocator.low_watermark) + print("Allocator high/low watermark:", + allocator.high_watermark, allocator.low_watermark) time.sleep(1) # Small sleep to prevent busy waiting - + except KeyboardInterrupt: print("\nShutting down receiver...") - + # Cleanup receiver.stop_handshake_listener() print("Receiver stopped") - + except Exception as e: print(f"Receiver error: {e}") raise + if __name__ == "__main__": # Set multiprocessing start method mp.set_start_method("spawn", force=True) - main() + main() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 5af67ab87a9d..e3ff0722451d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -1,38 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 -import contextlib -import math -import threading -import time -import uuid -from abc import ABC, abstractmethod -from collections import defaultdict, OrderedDict -from collections.abc import Iterator from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, Tuple +from typing import 
TYPE_CHECKING, Any, Optional -import msgspec import torch -import zmq +from lmcache.utils import _lmcache_nvtx_annotate -from vllm import envs +from vllm import _custom_ops as ops from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( - SourceSpec, DestinationSpec) + DestinationSpec, SourceSpec) from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( - NixlSendTask, NixlPrefillManager, NixlDecodeManager) - -from vllm.distributed.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - get_tp_group) + NixlDecodeManager, NixlPrefillManager, NixlSendTask) +from vllm.distributed.parallel_state import get_tensor_model_parallel_rank from vllm.logger import init_logger -from vllm.utils import make_zmq_path, make_zmq_socket, round_down, cdiv +from vllm.utils import cdiv from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.request import RequestStatus -from vllm import _custom_ops as ops - -from lmcache.utils import _lmcache_nvtx_annotate if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -43,11 +27,9 @@ logger = init_logger(__name__) -def d2h_page_copy( - src_layer: torch.Tensor, - dst_buffer: torch.Tensor, - block_ids: list[int] - ) -> None: + +def d2h_page_copy(src_layer: torch.Tensor, dst_buffer: torch.Tensor, + block_ids: list[int]) -> None: """Copy data from device to host. Args: @@ -57,17 +39,18 @@ def d2h_page_copy( (2, len(block_ids), page_size, ...remaining dims...) block_ids (list[int]): The list of vllm block ids to copy from. """ - block_mapping = torch.stack([torch.tensor(block_ids, dtype = torch.long), - torch.arange(len(block_ids), dtype = torch.long)], dim = 1) + block_mapping = torch.stack([ + torch.tensor(block_ids, dtype=torch.long), + torch.arange(len(block_ids), dtype=torch.long) + ], + dim=1) ops.swap_blocks(src_layer[0], dst_buffer[0], block_mapping) ops.swap_blocks(src_layer[1], dst_buffer[1], block_mapping) -def h2d_copy_leading_tokens( - src_buffer: torch.Tensor, - dst_layer: torch.Tensor, - src_block_id: int, - dst_block_id: int, - end_position_in_block: int) -> None: + +def h2d_copy_leading_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + src_block_id: int, dst_block_id: int, + end_position_in_block: int) -> None: """Copy the leading tokens in 1 block from host buffer to device layer. Args: @@ -87,12 +70,9 @@ def h2d_copy_leading_tokens( dst_v.copy_(src_v, non_blocking=True) -def h2d_copy_trailing_tokens( - src_buffer: torch.Tensor, - dst_layer: torch.Tensor, - src_block_id: int, - dst_block_id: int, - start_position_in_block: int) -> None: +def h2d_copy_trailing_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + src_block_id: int, dst_block_id: int, + start_position_in_block: int) -> None: """Copy the trailing tokens in 1 block from host buffer to device layer. 
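    For example (illustrative): with a page size of 16 and
    start_position_in_block=8, token positions 8 through 15 of the block are
    copied for both the key (index 0) and value (index 1) planes.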
Args: @@ -111,13 +91,11 @@ def h2d_copy_trailing_tokens( dst_k.copy_(src_k, non_blocking=True) dst_v.copy_(src_v, non_blocking=True) -def h2d_copy_part_block( - src_buffer: torch.Tensor, - dst_layer: torch.Tensor, - src_block_id: int, - dst_block_id: int, - start_position_in_block: int, - end_position_in_block: int) -> None: + +def h2d_copy_part_block(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + src_block_id: int, dst_block_id: int, + start_position_in_block: int, + end_position_in_block: int) -> None: """Copy the part of a block from host buffer to device layer. Args: @@ -130,22 +108,21 @@ def h2d_copy_part_block( start_position_in_block (int): The start position in the block to copy. end_position_in_block (int): The end position in the block to copy. """ - dst_k = dst_layer[0][dst_block_id][start_position_in_block:end_position_in_block] - src_k = src_buffer[0][src_block_id][start_position_in_block:end_position_in_block] - dst_v = dst_layer[1][dst_block_id][start_position_in_block:end_position_in_block] - src_v = src_buffer[1][src_block_id][start_position_in_block:end_position_in_block] + dst_k = dst_layer[0][dst_block_id][ + start_position_in_block:end_position_in_block] + src_k = src_buffer[0][src_block_id][ + start_position_in_block:end_position_in_block] + dst_v = dst_layer[1][dst_block_id][ + start_position_in_block:end_position_in_block] + src_v = src_buffer[1][src_block_id][ + start_position_in_block:end_position_in_block] dst_k.copy_(src_k, non_blocking=True) dst_v.copy_(src_v, non_blocking=True) - -def h2d_page_copy( - src_buffer: torch.Tensor, - dst_layer: torch.Tensor, - block_ids: list[int], - start_token_idx: int, - stop_token_idx: int, - block_size: int - ) -> None: + +def h2d_page_copy(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + block_ids: list[int], start_token_idx: int, + stop_token_idx: int, block_size: int) -> None: """Copy data from host to device. 
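    For example (illustrative): with block_size=16, start_token_idx=10 and
    stop_token_idx=50, the copy touches four vLLM blocks -- the trailing
    tokens 10-15 of the first block, all of the two middle blocks, and the
    leading tokens 0-1 of the last block.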
Args: @@ -163,13 +140,14 @@ def h2d_page_copy( separate_first_block = start_token_idx % block_size != 0 separate_last_block = stop_token_idx % block_size != 0 - start_block_id = start_token_idx // block_size # inclusive + start_block_id = start_token_idx // block_size # inclusive end_block_id = stop_token_idx // block_size # exclusive - src_block_ids = torch.arange(start_block_id, end_block_id, - dtype = torch.long) + src_block_ids = torch.arange(start_block_id, + end_block_id, + dtype=torch.long) if separate_first_block: src_block_ids = src_block_ids[1:] - # NOTE: we don't need to add the last block id here, because the + # NOTE: we don't need to add the last block id here, because the # end_block_id is exclusive # E.g., start = 10, stop = 50, block_size = 16, then we have # start_block_id = 0 , separate_first_block = True @@ -186,36 +164,24 @@ def h2d_page_copy( # Only one block to copy start_position_in_block = start_token_idx % block_size end_position_in_block = stop_token_idx % block_size - h2d_copy_part_block( - src_buffer, - dst_layer, - start_block_id, - vllm_block_ids[start_block_id], - start_position_in_block, - end_position_in_block) + h2d_copy_part_block(src_buffer, dst_layer, start_block_id, + vllm_block_ids[start_block_id], + start_position_in_block, end_position_in_block) return if separate_first_block: first_block_id_src = start_block_id first_block_id_dst = vllm_block_ids[first_block_id_src] start_token_idx_in_block = start_token_idx % block_size - h2d_copy_trailing_tokens( - src_buffer, - dst_layer, - first_block_id_src, - first_block_id_dst, - start_token_idx_in_block) + h2d_copy_trailing_tokens(src_buffer, dst_layer, first_block_id_src, + first_block_id_dst, start_token_idx_in_block) if separate_last_block: last_block_id_src = end_block_id last_block_id_dst = vllm_block_ids[last_block_id_src] stop_token_idx_in_block = stop_token_idx % block_size - h2d_copy_leading_tokens( - src_buffer, - dst_layer, - last_block_id_src, - last_block_id_dst, - stop_token_idx_in_block) + h2d_copy_leading_tokens(src_buffer, dst_layer, last_block_id_src, + last_block_id_dst, stop_token_idx_in_block) # Step 3: copy the middle blocks block_mapping = torch.stack([src_block_ids, dst_block_ids], dim=1) @@ -227,6 +193,7 @@ def h2d_page_copy( # Connector related code ##################################################################### + @dataclass class PrefillRequestTracker: """RequestTracker is used to track the state of a request. @@ -244,7 +211,7 @@ class PrefillRequestTracker: # Total number of tokens in the "full request" num_all_tokens: int = 0 - # Total number of tokens that are already seen until this step + # Total number of tokens that are already seen until this step num_total_tokens: int = 0 # Number of tokens that are already saved @@ -255,8 +222,8 @@ class PrefillRequestTracker: @staticmethod def from_new_request( - new_request: "NewRequestData", - num_tokens_to_compute: int, + new_request: "NewRequestData", + num_tokens_to_compute: int, ) -> "PrefillRequestTracker": """Create the request tracker from a new request. 
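To make the prefill-side bookkeeping concrete, here is a small illustrative
sketch (not part of the patch) of how a tracker's counters might advance
across scheduler steps; the field and method names follow the definitions
above, while the request id, token counts, and block ids are invented.

    # Illustrative only: all values below are made up for this sketch.
    tracker = PrefillRequestTracker(
        req_id="req-0",
        num_all_tokens=300,        # length of the full prompt
        num_total_tokens=128,      # tokens scheduled ("computed") so far
        num_saved_tokens=0,        # nothing offloaded to CPU yet
        allocated_block_ids=[7, 8, 9],
    )

    # After the KV for the first 128 tokens has been offloaded:
    tracker.update_num_saved_tokens(128)

    # When the request is scheduled again, update() (its body is not shown in
    # this hunk) refreshes the token/block bookkeeping before the next
    # PrefillReqMeta is built from the tracker.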
@@ -273,7 +240,7 @@ def from_new_request( return PrefillRequestTracker( req_id=new_request.req_id, num_all_tokens=len(new_request.prompt_token_ids), - num_total_tokens = num_tokens_to_compute, + num_total_tokens=num_tokens_to_compute, num_saved_tokens=0, allocated_block_ids=unfolded_block_ids, ) @@ -298,6 +265,7 @@ def update_num_saved_tokens(self, num_saved_tokens: int) -> None: """ self.num_saved_tokens = num_saved_tokens + @dataclass class PrefillReqMeta: # Request id @@ -335,7 +303,7 @@ def from_request_tracker( f"Request {req_id} has more tokens than allocated blocks" token_range = slice(request_tracker.num_saved_tokens, - request_tracker.num_total_tokens) + request_tracker.num_total_tokens) num_saved_full_blocks = request_tracker.num_saved_tokens // block_size num_active_blocks = cdiv(request_tracker.num_total_tokens, block_size) @@ -348,10 +316,9 @@ def from_request_tracker( logger.debug( "Request %s: num_saved_full_blocks=%d, num_active_blocks=%d, " "blocks_to_save=%s, skip_leading_tokens=%d, " - "skip_trailing_tokens=%d", - request_tracker.req_id, - num_saved_full_blocks, num_active_blocks, - blocks_to_save, skip_leading_tokens, skip_trailing_tokens) + "skip_trailing_tokens=%d", request_tracker.req_id, + num_saved_full_blocks, num_active_blocks, blocks_to_save, + skip_leading_tokens, skip_trailing_tokens) # Update the request tracker with the number of saved tokens request_tracker.update_num_saved_tokens( @@ -374,14 +341,15 @@ class DecodeReqMeta: prefill_req_id: str # Allocated block ids block_ids: list[int] - # Skip the first N tokens + # Skip the first N tokens skip_leading_tokens: int # if it's ready or not is_ready: bool = False + @dataclass class CPUConnectorMetadata(KVConnectorMetadata): - prefill_meta: list[PrefillReqMeta] + prefill_meta: list[PrefillReqMeta] decode_meta: list[DecodeReqMeta] def __init__(self) -> None: @@ -407,6 +375,7 @@ def add_decode(self, decode_meta: DecodeReqMeta) -> None: """ self.decode_meta.append(decode_meta) + def validate_kv_transfer_config( kv_transfer_config: Optional["KVTransferConfig"]) -> None: """Validate the KV transfer configuration. @@ -428,12 +397,14 @@ def validate_kv_transfer_config( assert "port" in extra_config, \ "CPUConnector: must have 'port' in kv_connector_extra_config" + class CPUConnector(KVConnectorBase_V1): """CPUKVConnector is an implementation of KVConnectorBase_V1 that provides a CPU-based KV cache sending mechanism. 
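
The saved/active block bookkeeping in PrefillReqMeta.from_request_tracker above can be summarized with a small sketch. This is hedged: the slicing of blocks_to_save and the two skip counts sit partly outside the visible hunk, so the exact expressions below are assumptions that merely match the fields named in the debug log.

# Assumption-labeled sketch of the prefill save plan (not the file's code).
def prefill_save_plan(num_saved: int, num_total: int,
                      allocated_block_ids: list[int], block_size: int):
    num_saved_full_blocks = num_saved // block_size
    num_active_blocks = (num_total + block_size - 1) // block_size   # cdiv
    blocks_to_save = allocated_block_ids[
        num_saved_full_blocks:num_active_blocks]
    skip_leading_tokens = num_saved - num_saved_full_blocks * block_size
    skip_trailing_tokens = num_active_blocks * block_size - num_total
    return blocks_to_save, skip_leading_tokens, skip_trailing_tokens

# 20 of 40 tokens already saved with 16-token blocks: block 0 is done,
# blocks 1-2 still need saving, skipping 4 already-saved leading tokens
# and 8 not-yet-computed trailing slots.
assert prefill_save_plan(20, 40, [0, 1, 2], 16) == ([1, 2], 4, 8)
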
""" - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: + def __init__(self, vllm_config: "VllmConfig", + role: KVConnectorRole) -> None: super().__init__(vllm_config, role) validate_kv_transfer_config(vllm_config.kv_transfer_config) @@ -454,13 +425,14 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: elif role == KVConnectorRole.WORKER: # Prefiller side sender if self.kv_role == "kv_producer": - self._kv_sender = NixlPrefillManager(1024 * 1024 * 1024) # 1GB for debug + self._kv_sender = NixlPrefillManager(1024 * 1024 * + 1024) # 1GB for debug elif self.kv_role == "kv_consumer": self._kv_receiver = NixlDecodeManager( - 1024 * 1024 * 1024, # 1GB for debug - "localhost", - 54321, # Changed from string to int to match the class definition - ) + 1024 * 1024 * 1024, # 1GB for debug + "localhost", + 54321, # Changed from string to int to match the class definition + ) else: raise ValueError(f"Unknown kv_role: {self.kv_role}") @@ -493,8 +465,6 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole) -> None: # In-progress kv load requests's prefill request ids self._inflight_h2d_requests: set[str] = set() - - def _connect_request_ids(self, p_reqid: str, d_reqid: str) -> None: self._decode_req_id_to_prefill_req_id[d_reqid] = p_reqid self._prefill_req_id_to_decode_req_id[p_reqid] = d_reqid @@ -502,10 +472,8 @@ def _connect_request_ids(self, p_reqid: str, d_reqid: str) -> None: ############################################################ # Scheduler Side Methods ############################################################ - def _build_prefiller_meta( - self, - scheduler_output: SchedulerOutput, - output_meta: CPUConnectorMetadata) -> None: + def _build_prefiller_meta(self, scheduler_output: SchedulerOutput, + output_meta: CPUConnectorMetadata) -> None: """Build the prefill request metadata from the scheduler output. Args: @@ -523,8 +491,7 @@ def _build_prefiller_meta( self._prefill_reqs[request.req_id] = request_tracker req_meta = PrefillReqMeta.from_request_tracker( - request_tracker, - self._block_size) + request_tracker, self._block_size) output_meta.add_prefill(req_meta) for request in scheduler_output.scheduled_cached_reqs: @@ -532,14 +499,11 @@ def _build_prefiller_meta( request_tracker.update(request) req_meta = PrefillReqMeta.from_request_tracker( - request_tracker, - self._block_size) + request_tracker, self._block_size) output_meta.add_prefill(req_meta) - def build_decode_meta( - self, - scheduler_output: SchedulerOutput, - output_meta: CPUConnectorMetadata) -> None: + def build_decode_meta(self, scheduler_output: SchedulerOutput, + output_meta: CPUConnectorMetadata) -> None: """Build the decode request metadata from the scheduler output. 
Args: @@ -551,7 +515,7 @@ def build_decode_meta( if not req_meta.is_ready: updated_decode_req_metas[req_meta.req_id] = req_meta # NOTE (ApostaC): Even if the request is not ready, we still - # want the worker connector to know about it, so that it can + # want the worker connector to know about it, so that it can # connector the decode request id to the prefill request id output_meta.add_decode(req_meta) self._decode_req_metas = updated_decode_req_metas @@ -559,7 +523,7 @@ def build_decode_meta( def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int) -> tuple[int, bool]: - # NOTE(ApostaC): For a single request, this function will be called + # NOTE(ApostaC): For a single request, this function will be called # two times if the first time we returned async_load flag as True. # The second time will be the "real schedule" time @@ -569,16 +533,18 @@ def get_num_new_matched_tokens( kv_transfer_params = request.kv_transfer_params num_tokens = len(request.prompt_token_ids) request_id = request.request_id - logger.info("For request %s, num_computed_tokens is %d, " - "total_num_tokens is %d", request_id, num_computed_tokens, - num_tokens) + logger.info( + "For request %s, num_computed_tokens is %d, " + "total_num_tokens is %d", request_id, num_computed_tokens, + num_tokens) if request.request_id in self._should_be_ready_reqs: self._should_be_ready_reqs.remove(request.request_id) return 0, False if kv_transfer_params is None or "prefill_request_id" not in kv_transfer_params: - logger.warning("Request %s does not have prefill_request_id", request.request_id) + logger.warning("Request %s does not have prefill_request_id", + request.request_id) #return 0, False # DEBUG: Set the prefill_request_id to the request id @@ -591,17 +557,15 @@ def get_num_new_matched_tokens( self._connect_request_ids(prefill_request_id, request_id) self._should_be_ready_reqs.add(request_id) - # NOTE: because the scheduler wants here to return "full blocks" if - # the async flag is true (see _update_waiting_for_remote_kv in - # scheduler.py). We need to carefully deal with it when copying + # NOTE: because the scheduler wants here to return "full blocks" if + # the async flag is true (see _update_waiting_for_remote_kv in + # scheduler.py). We need to carefully deal with it when copying # the KV cache at worker side return num_tokens // self._block_size * self._block_size, True - def update_state_after_alloc( - self, - request: "Request", - blocks: "KVCacheBlocks", - num_external_tokens: int) -> None: + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int) -> None: """Update the state of the request after allocation. 
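
The return at the end of get_num_new_matched_tokens above deliberately rounds the prompt length down to whole blocks because, per the NOTE, the scheduler treats the returned count as full externally-loaded blocks whenever the async flag is True. A tiny equivalent:

# Equivalent of the return expression `num_tokens // block_size * block_size`.
def full_block_tokens(num_tokens: int, block_size: int) -> int:
    return num_tokens // block_size * block_size

# A 1000-token prompt with 128-token blocks is reported as 896 tokens
# (7 full blocks); the trailing 104 tokens are not reported as
# externally available.
assert full_block_tokens(1000, 128) == 896
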
""" # NOTE(ApostaC): This function is called twice for the same request @@ -621,12 +585,11 @@ def update_state_after_alloc( block_ids = [] for blks in blocks.get_block_ids(): block_ids.extend(blks) - req_meta = DecodeReqMeta( - req_id = request.request_id, - prefill_req_id = p_req_id, - block_ids = block_ids, - skip_leading_tokens = 0, - is_ready = False) + req_meta = DecodeReqMeta(req_id=request.request_id, + prefill_req_id=p_req_id, + block_ids=block_ids, + skip_leading_tokens=0, + is_ready=False) self._decode_req_metas[request.request_id] = req_meta def build_connector_meta( @@ -639,13 +602,13 @@ def build_connector_meta( self.build_decode_meta(scheduler_output, meta) else: raise ValueError(f"Unknown kv_role: {self.kv_role}") - + return meta def request_finished( - self, - request: "Request", - block_ids: list[int], + self, + request: "Request", + block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: print("In request_finished") return False, None @@ -664,19 +627,21 @@ def _get_layer_name(self, layer_id: int) -> str: return self._layer_id_to_name[layer_id] def _get_kv_shape(self, num_blocks: int) -> torch.Size: - return torch.Size((2, num_blocks, ) + self._kv_page_shape) + return torch.Size(( + 2, + num_blocks, + ) + self._kv_page_shape) def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self._gpu_kv_caches = kv_caches idx = 0 - for layer_name in kv_caches.keys(): + for layer_name in kv_caches: self._layer_name_to_id[layer_name] = idx self._layer_id_to_name[idx] = layer_name idx += 1 self._kv_page_shape = kv_caches[list(kv_caches.keys())[0]].shape[2:] - def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: """ @@ -702,20 +667,18 @@ def start_load_kv(self, forward_context: "ForwardContext", "Connector metadata is not of type CPUConnectorMetadata" for decode_meta in meta.decode_meta: - self._connect_request_ids( - decode_meta.prefill_req_id, - decode_meta.req_id) + self._connect_request_ids(decode_meta.prefill_req_id, + decode_meta.req_id) if not decode_meta.is_ready: continue total_expected_tokens = len(decode_meta.block_ids) * \ - self._block_size + self._block_size self._inflight_h2d_requests.add(decode_meta.prefill_req_id) for layer_id in range(len(self._gpu_kv_caches)): decode_specs = self._kv_receiver.get_kv_specs( - decode_meta.prefill_req_id, - layer_id) + decode_meta.prefill_req_id, layer_id) layer_name = self._layer_id_to_name[layer_id] dst_layer = self._gpu_kv_caches[layer_name] for decode_spec in decode_specs: @@ -727,13 +690,8 @@ def start_load_kv(self, forward_context: "ForwardContext", block_ids = decode_meta.block_ids with torch.cuda.stream(self._cuda_stream): - h2d_page_copy( - src_buffer, - dst_layer, - block_ids, - start, - stop, - self._block_size) + h2d_page_copy(src_buffer, dst_layer, block_ids, start, + stop, self._block_size) event = torch.cuda.Event() event.record(self._cuda_stream) self._decoder_cuda_events.append(event) @@ -793,8 +751,10 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, layer_id=self._get_layer_id(layer_name), start=prefill_req.token_range.start, stop=prefill_req.token_range.stop, - shape=tuple(self._get_kv_shape(len(prefill_req.blocks_to_save))), - dtype_str=str(kv_layer.dtype).split('.')[-1], # Convert torch.float32 -> "float32" + shape=tuple(self._get_kv_shape(len( + prefill_req.blocks_to_save))), + dtype_str=str(kv_layer.dtype).split('.') + [-1], # Convert torch.float32 -> "float32" num_all_tokens=prefill_req.num_all_tokens, ) @@ -802,7 +762,7 @@ def save_kv_layer(self, 
layer_name: str, kv_layer: torch.Tensor, dest_spec = DestinationSpec( rank=get_tensor_model_parallel_rank(), host=self._host, - base_port=self._port, + base_port=self._port, ) # Create the send task @@ -818,11 +778,9 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, self._cuda_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self._cuda_stream): # Copy the data from the GPU to the CPU buffer page by page - d2h_page_copy( - src_layer=kv_layer, - dst_buffer=buffer, - block_ids=prefill_req.blocks_to_save - ) + d2h_page_copy(src_layer=kv_layer, + dst_buffer=buffer, + block_ids=prefill_req.blocks_to_save) # record the cuda stream task.cuda_event = torch.cuda.Event() @@ -835,8 +793,6 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, # 2. use a single cuda event instead of a list of cuda events # 3. use a cuda event pool to prevent the creation overhead - - @_lmcache_nvtx_annotate def wait_for_save(self): """ @@ -873,7 +829,8 @@ def get_finished( if self.kv_role == "kv_consumer": # decoder side self._kv_receiver.progress() - p_ready_reqs = self._kv_receiver.get_finished(len(self._gpu_kv_caches)) + p_ready_reqs = self._kv_receiver.get_finished( + len(self._gpu_kv_caches)) ret = set() # TODO: Bug here: we need to send the prefill request id from scheduler # connector to the worker connector in kv_params diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index 0a6923a60985..d38f2bbed684 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -1,38 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 -import contextlib import math -import threading -import time -import uuid from abc import ABC, abstractmethod -from collections import defaultdict, OrderedDict -from collections.abc import Iterator from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, Tuple +from typing import TYPE_CHECKING import msgspec import torch -import zmq +from lmcache.utils import _lmcache_nvtx_annotate -from vllm import envs -from vllm.config import VllmConfig -from vllm.distributed.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - get_tp_group) from vllm.logger import init_logger -from vllm.utils import make_zmq_path, make_zmq_socket, round_down, cdiv -from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.request import RequestStatus -from vllm import _custom_ops as ops - -from lmcache.utils import _lmcache_nvtx_annotate if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata - from vllm.forward_context import ForwardContext - from vllm.v1.core.kv_cache_manager import KVCacheBlocks - from vllm.v1.core.sched.output import CachedRequestData, NewRequestData - from vllm.v1.request import Request + pass logger = init_logger(__name__) @@ -61,6 +40,7 @@ def get_id(self) -> str: """ return f"{self.rank}_{self.host}_{self.base_port}" + class SourceSpec(msgspec.Struct): """SourceSpec is used to specify the source of kv sending task. """ @@ -72,7 +52,7 @@ class SourceSpec(msgspec.Struct): # The range of tokens to be offloaded start: int # For token_range slice - stop: int # For token_range slice + stop: int # For token_range slice # The shape of the offloaded KV cache tensor as a tuple shape: tuple[int, ...] 
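
The msgspec-based SourceSpec defined above is what travels inside the control messages, and its get_size() tells the receiver how many bytes of staging buffer to reserve. Here is a standalone mirror of the idea, re-declared so the snippet runs on its own (the real struct carries a couple more fields, such as num_all_tokens):

# Illustrative mirror; not an import of the in-flight module.
import math

import msgspec
import torch

class MiniSourceSpec(msgspec.Struct):
    request_id: str
    layer_id: int
    start: int       # token range start (inclusive)
    stop: int        # token range stop (exclusive)
    shape: tuple[int, ...]
    dtype_str: str   # e.g. "bfloat16", so no torch.dtype goes on the wire

    @property
    def dtype(self) -> torch.dtype:
        return getattr(torch, self.dtype_str)

    def get_size(self) -> int:
        return math.prod(self.shape) * self.dtype.itemsize

spec = MiniSourceSpec("req-0", 3, 0, 256, (2, 2, 128, 8, 128), "bfloat16")
wire = msgspec.msgpack.encode(spec)
back = msgspec.msgpack.Decoder(MiniSourceSpec).decode(wire)
assert back.get_size() == math.prod(spec.shape) * 2   # bf16: 2 bytes/element
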
@@ -107,6 +87,7 @@ def __str__(self) -> str: f"layer_id={self.layer_id}, " f"token_range={self.token_range}, shape={self.tensor_shape})") + @dataclass class DecoderKVSpec: # Start index of the KV cache (inclusive) @@ -148,6 +129,7 @@ def is_done(self) -> bool: """ return self.send_done + @dataclass class SendTask: """Wraps a KV Cache sending task @@ -167,9 +149,8 @@ def tensor(self) -> torch.Tensor: torch.Tensor: The tensor of the send task. """ num_elements = self.source_spec.tensor_shape.numel() - return self.buffer.view( - self.source_spec.dtype)[:num_elements].view( - self.source_spec.tensor_shape) + return self.buffer.view(self.source_spec.dtype)[:num_elements].view( + self.source_spec.tensor_shape) def update_states(self) -> None: """Update the states of the send task. This needs to be OVERWRITTEN in @@ -209,6 +190,7 @@ def mark_sending(self) -> None: """ self.state.is_sending = True + class KVSenderInterface(ABC): """KVSenderInterface is an interface for sending KV cache data. """ @@ -216,7 +198,6 @@ class KVSenderInterface(ABC): def __init__(self) -> None: self._send_tasks: list[SendTask] = [] - def add_send_task(self, task: SendTask) -> None: """Add a send task to the list of send tasks. @@ -267,7 +248,8 @@ def progress(self) -> None: # Update after going through all send tasks self.post_progress_hook() - logger.info("KVSender progress: sent %d, freed %d", num_sent, num_freed) + logger.info("KVSender progress: sent %d, freed %d", num_sent, + num_freed) ###################################################### # Abstract methods (to be implemented by subclasses) # @@ -275,10 +257,10 @@ def progress(self) -> None: @abstractmethod def create_send_task( - self, - source_spec: SourceSpec, - destination_spec: DestinationSpec, - ) -> SendTask: + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: """Create a non-ready send task with a CPU buffer allocated. Args: @@ -326,5 +308,3 @@ def post_progress_hook(self, task: SendTask) -> None: task (SendTask): The send task to be processed. 
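
The tensor property above is the key trick on both ends of the transfer: the staging slot is a flat buffer, and the typed KV view is obtained by reinterpreting its leading bytes rather than by copying. A minimal sketch with made-up sizes:

# Reinterpret the leading bytes of a flat staging buffer, as
# SendTask.tensor does above (byte-typed buffer assumed for the sketch).
import torch

buffer = torch.empty(4096, dtype=torch.uint8)     # flat staging slot
shape = torch.Size((2, 8, 16))                    # 256 fp16 elements
num_elements = shape.numel()

view = buffer.view(torch.float16)[:num_elements].view(shape)
view.fill_(1.0)                                   # writes land in `buffer`
assert view.data_ptr() == buffer.data_ptr()       # same storage, zero copy
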
""" raise NotImplementedError("post_progress_hook() not implemented") - - diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 329c4509e7fd..7abdf4f21127 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import contextlib -import math import threading import time import uuid -from collections import defaultdict, OrderedDict +from collections import OrderedDict, defaultdict from collections.abc import Iterator from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional @@ -12,29 +11,19 @@ import msgspec import torch import zmq - from lmcache.utils import _lmcache_nvtx_annotate -from vllm import envs -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( - SendTask, KVSenderInterface, SourceSpec, DestinationSpec, DecoderKVSpec, - SendTaskState) + DecoderKVSpec, DestinationSpec, KVSenderInterface, SendTask, SendTaskState, + SourceSpec) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) from vllm.logger import init_logger -from vllm.utils import make_zmq_path, make_zmq_socket, round_down -from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.request import RequestStatus +from vllm.utils import make_zmq_path, make_zmq_socket if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata - from vllm.forward_context import ForwardContext - from vllm.v1.core.kv_cache_manager import KVCacheBlocks - from vllm.v1.request import Request + pass logger = init_logger(__name__) @@ -53,6 +42,7 @@ DEFAULT_NIXL_PAGE_SIZE = 4096 + def init_nixl_agent( buffer_size: int, buffer_ptr: int, @@ -84,17 +74,16 @@ def init_nixl_agent( # Create xfer handlers xfer_desc = [] - for base_addr in range(buffer_ptr, - buffer_ptr + buffer_size, + for base_addr in range(buffer_ptr, buffer_ptr + buffer_size, nixl_page_size): xfer_desc.append((base_addr, nixl_page_size, 0)) xfer_descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type="DRAM") - xfer_handler = nixl_agent.prep_xfer_dlist( - "", xfer_descs, mem_type="DRAM") + xfer_handler = nixl_agent.prep_xfer_dlist("", xfer_descs, mem_type="DRAM") return nixl_agent, reg_descs, xfer_descs, xfer_handler + class RingBufferAllocator: """RingBufferAllocator is a simple ring buffer allocator for managing memory allocation and deallocation. @@ -143,7 +132,7 @@ def allocate(self, size: int) -> tuple[int, Optional[torch.Tensor]]: """ # During allocation, we always make sure that high watermark and # low watermark are aligned to the alignment size - aligned_size = self._align_size(size) # Align the requested size + aligned_size = self._align_size(size) # Align the requested size turnaround_size = (self._high_watermark // self._size + 1) * self._size local_high = self._high_watermark % self._size @@ -181,8 +170,8 @@ def allocate(self, size: int) -> tuple[int, Optional[torch.Tensor]]: # No space available return -1, None - def view_as_tensor(self, vaddr: int, - dtype: torch.dtype, shape: torch.Size) -> torch.Tensor: + def view_as_tensor(self, vaddr: int, dtype: torch.dtype, + shape: torch.Size) -> torch.Tensor: """View the buffer as a tensor. 
Args: vaddr (int): The virtual address of the buffer. @@ -202,8 +191,6 @@ def view_as_tensor(self, vaddr: int, # Get the tensor return self._buffer[paddr:paddr + size].view(dtype).view(shape) - - def free(self, address: int) -> None: """Free the buffer at the given address. @@ -214,7 +201,7 @@ def free(self, address: int) -> None: assert address in self._allocated, \ f"Address {address} not found in allocated buffers" - # Pop the address from the allocated dict, and update the + # Pop the address from the allocated dict, and update the # low watermark self._allocated.pop(address) @@ -263,10 +250,12 @@ def get_buffer_ptr(self) -> int: """ return self._buffer.data_ptr() + ################################################################### # NIXL Related Classes ################################################################### + class NixlProtocolMsg(msgspec.Struct): msg_type: str req_uuid: str @@ -274,11 +263,7 @@ class NixlProtocolMsg(msgspec.Struct): receiver_paddr: Optional[int] = None - -def make_send_req_msg( - source_spec: SourceSpec, - req_uuid: str -) -> bytes: +def make_send_req_msg(source_spec: SourceSpec, req_uuid: str) -> bytes: """Make the send request message. Args: @@ -290,19 +275,18 @@ def make_send_req_msg( # Create the request message msg_type = "REQMSG" receiver_paddr = None - send_req_msg = NixlProtocolMsg( - msg_type=msg_type, - req_uuid=req_uuid, - source_spec=source_spec, - receiver_paddr=receiver_paddr - ) + send_req_msg = NixlProtocolMsg(msg_type=msg_type, + req_uuid=req_uuid, + source_spec=source_spec, + receiver_paddr=receiver_paddr) # Encode the message send_req_msg_bytes = msgspec.msgpack.encode(send_req_msg) return send_req_msg_bytes + def make_receive_ready_msg( - req_uuid: str, - receiver_paddr: int, + req_uuid: str, + receiver_paddr: int, ) -> bytes: """Make the receive ready message. @@ -316,19 +300,16 @@ def make_receive_ready_msg( # Create the request message msg_type = "READYMSG" source_spec = None - receive_ready_msg = NixlProtocolMsg( - msg_type=msg_type, - req_uuid=req_uuid, - source_spec=source_spec, - receiver_paddr=receiver_paddr - ) + receive_ready_msg = NixlProtocolMsg(msg_type=msg_type, + req_uuid=req_uuid, + source_spec=source_spec, + receiver_paddr=receiver_paddr) # Encode the message receive_ready_msg_bytes = msgspec.msgpack.encode(receive_ready_msg) return receive_ready_msg_bytes -def make_send_finish_msg( - req_uuid: str, -) -> bytes: + +def make_send_finish_msg(req_uuid: str, ) -> bytes: """Make the send finish message. 
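
The three helpers above implement a small request/ready/finish protocol on top of msgspec. Below is a standalone mirror of that message flow; the real NixlProtocolMsg also carries an optional SourceSpec, which is omitted here so the snippet runs on its own:

# REQMSG -> READYMSG -> FINISHMSG, as in the helpers above (illustrative).
from typing import Optional

import msgspec

class MiniProtocolMsg(msgspec.Struct):
    msg_type: str
    req_uuid: str
    receiver_paddr: Optional[int] = None

decoder = msgspec.msgpack.Decoder(MiniProtocolMsg)

req = msgspec.msgpack.encode(MiniProtocolMsg("REQMSG", "uuid-1"))
ready = msgspec.msgpack.encode(MiniProtocolMsg("READYMSG", "uuid-1", 4096))
finish = msgspec.msgpack.encode(MiniProtocolMsg("FINISHMSG", "uuid-1"))

assert decoder.decode(ready).receiver_paddr == 4096   # where to WRITE via NIXL
assert decoder.decode(finish).msg_type == "FINISHMSG"
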
Args: @@ -341,18 +322,17 @@ def make_send_finish_msg( msg_type = "FINISHMSG" source_spec = None receiver_paddr = None - send_finish_msg = NixlProtocolMsg( - msg_type=msg_type, - req_uuid=req_uuid, - source_spec=source_spec, - receiver_paddr=receiver_paddr - ) + send_finish_msg = NixlProtocolMsg(msg_type=msg_type, + req_uuid=req_uuid, + source_spec=source_spec, + receiver_paddr=receiver_paddr) # Encode the message send_finish_msg_bytes = msgspec.msgpack.encode(send_finish_msg) return send_finish_msg_bytes class NixlCPUSender: + def __init__( self, buffer_size: int, @@ -372,7 +352,7 @@ def __init__( self._local_xfer_handlers = \ init_nixl_agent(buffer_size, buffer_ptr, nixl_page_size) - # Remote xfer dlists, peer name -> prepped xfer handlers + # Remote xfer dlists, peer name -> prepped xfer handlers self._remote_xfer_handlers: dict[str, Any] = {} # Add ZMQ context for handshakes @@ -382,13 +362,12 @@ def __init__( # uuid -> (remote agent name, receiver paddr) self._ready_requests: dict[str, tuple[str, int]] = {} - # NOTE(ApostaC): we don't track the requests that are waiting for the + # NOTE(ApostaC): we don't track the requests that are waiting for the # receiver to be ready, and may want to add this in the future # Msg decoder self._msg_decoder = msgspec.msgpack.Decoder(NixlProtocolMsg) - def _get_desc_idxs(self, paddr: int, size: int) -> list[int]: """Get the sender descriptor indexes for the given physical address and size. @@ -439,13 +418,8 @@ def send( notif_msg = make_send_finish_msg(req_uuid) # Transfer handle = self._nixl_wrapper.make_prepped_xfer( - "WRITE", - self._local_xfer_handlers, - desc_idxs, - remote_xfer_handlers, - r_desc_idxs, - notif_msg - ) + "WRITE", self._local_xfer_handlers, desc_idxs, + remote_xfer_handlers, r_desc_idxs, notif_msg) self._nixl_wrapper.transfer(handle) @@ -465,7 +439,7 @@ def is_send_finished(self, handle: "nixl_xfer_handle") -> bool: logger.error("Error in send operation") return False return status == "DONE" - + def prepare_send( self, source_spec: SourceSpec, @@ -557,11 +531,11 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: sock.send(local_meta) metadata_bytes = sock.recv() - + # Get remote agent name and register it remote_agent_name = self._nixl_wrapper.add_remote_agent( metadata_bytes) - + # Store remote agent info self._remote_agents[destination_spec.get_id()] = remote_agent_name @@ -570,14 +544,14 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: s_remote_xfer_descs = sock.recv() remote_xfer_dlist = self._nixl_wrapper.deserialize_descs( s_remote_xfer_descs) - remote_xfer_handlers = self._nixl_wrapper.prep_xfer_dlist( remote_agent_name, remote_xfer_dlist, mem_type="DRAM") - self._remote_xfer_handlers[remote_agent_name] = remote_xfer_handlers - - logger.debug("Successfully completed handshake with %s", + self._remote_xfer_handlers[ + remote_agent_name] = remote_xfer_handlers + + logger.debug("Successfully completed handshake with %s", destination_spec) def close(self) -> None: @@ -591,7 +565,9 @@ def close(self) -> None: self._nixl_wrapper.release_dlist_handle(remote_xfer_handler) del self._nixl_wrapper + class NixlCPUReceiver: + def __init__( self, allocator: RingBufferAllocator = None, @@ -624,7 +600,7 @@ def __init__( self._reg_dlist, \ self._local_xfer_dlist, \ self._local_xfer_handlers = \ - init_nixl_agent(self._buffer_size, self._buffer_ptr, + init_nixl_agent(self._buffer_size, self._buffer_ptr, nixl_page_size) # Add handshake listener thread @@ -643,17 +619,19 @@ def _process_msgs(self): 
obj = self._msg_decoder.decode(msg) if obj.msg_type == "REQMSG": # Add the request to the pending allocation - self._pending_allocation[obj.req_uuid] = (obj.source_spec, - remote_agent_name) + self._pending_allocation[obj.req_uuid] = ( + obj.source_spec, remote_agent_name) elif obj.msg_type == "FINISHMSG": # Add the request to the finished requests if obj.req_uuid in self._inflight_requests: source_spec = self._inflight_requests.pop(obj.req_uuid) vaddr = self._inflight_request_vaddr.pop(obj.req_uuid) - self._finished_requests[obj.req_uuid] = (source_spec, vaddr) + self._finished_requests[obj.req_uuid] = (source_spec, + vaddr) else: - logger.error("Request %s not found in inflight requests", - obj.req_uuid) + logger.error( + "Request %s not found in inflight requests", + obj.req_uuid) else: logger.error("Unexpected message type: %s", obj.msg_type) continue @@ -668,8 +646,7 @@ def _process_allocation_requests(self): if requested_size > self._buffer_size: raise RuntimeError( f"Requested size {requested_size} is larger than the " - f"nixl receiver buffer size {self._buffer_size}" - ) + f"nixl receiver buffer size {self._buffer_size}") vaddr, buffer = self._allocator.allocate(requested_size) if vaddr == -1: @@ -677,7 +654,7 @@ def _process_allocation_requests(self): # No space available, skip all the requests # NOTE: an alternative is to try allocation for other requests - # and then come back to this one, but this may create + # and then come back to this one, but this may create # starvation break @@ -690,7 +667,7 @@ def _process_allocation_requests(self): ready_msg = make_receive_ready_msg(req_uuid, paddr) self._nixl_wrapper.send_notif(peer_name, ready_msg) - # Add the request to the allocated requests + # Add the request to the allocated requests allocated_requests.append(req_uuid) # Remove the allocated requests from the pending allocation @@ -703,7 +680,7 @@ def progress(self) -> None: self._process_msgs() self._process_allocation_requests() - def get_finished(self, clear = False) -> list[tuple[SourceSpec, int]]: + def get_finished(self, clear=False) -> list[tuple[SourceSpec, int]]: """Get the requests that finishes receiving. Args: @@ -713,8 +690,8 @@ def get_finished(self, clear = False) -> list[tuple[SourceSpec, int]]: list[tuple[SourceSpec, int]]: A list of tuples containing the source spec and the address. """ - ret = [(source_spec, vaddr) for source_spec, vaddr in - self._finished_requests.values()] + ret = [(source_spec, vaddr) + for source_spec, vaddr in self._finished_requests.values()] if clear: self._finished_requests.clear() return ret @@ -731,17 +708,12 @@ def start_handshake_listener(self, host: str, base_port: int) -> None: target=self._nixl_handshake_listener, args=(host, base_port, ready_event), daemon=True, - name="nixl_cpu_handshake_listener" - ) + name="nixl_cpu_handshake_listener") self._handshake_listener_t.start() ready_event.wait() - def _nixl_handshake_listener( - self, - host: str, - base_port: int, - ready_event: threading.Event - ) -> None: + def _nixl_handshake_listener(self, host: str, base_port: int, + ready_event: threading.Event) -> None: """Background thread that listens for and responds to handshake requests. 
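
The handshake listener introduced above follows a standard pyzmq ROUTER pattern: keep the peer identity frame, answer with metadata or serialized transfer descriptors, and signal readiness through a threading.Event. A generic, self-contained sketch of that shape; the real code goes through vLLM's make_zmq_path/make_zmq_socket helpers and exchanges NIXL metadata instead of the placeholder bytes used here:

# Minimal ROUTER/REQ handshake sketch; payloads and port are placeholders.
import threading

import zmq

def listener(ctx: zmq.Context, ready: threading.Event) -> None:
    sock = ctx.socket(zmq.ROUTER)
    sock.bind("tcp://127.0.0.1:55555")
    ready.set()
    identity, _, msg = sock.recv_multipart()      # REQ adds the empty frame
    reply = b"xfer-descs" if msg == b"get_xfer_descs" else b"agent-metadata"
    sock.send_multipart([identity, b"", reply])
    sock.close()

ctx = zmq.Context()
ready = threading.Event()
threading.Thread(target=listener, args=(ctx, ready), daemon=True).start()
ready.wait()                                      # like start_handshake_listener

req = ctx.socket(zmq.REQ)
req.connect("tcp://127.0.0.1:55555")
req.send(b"hello")                                # sender's agent metadata
print(req.recv())                                 # b"agent-metadata"
req.close()
ctx.term()
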
Args: @@ -751,15 +723,15 @@ def _nixl_handshake_listener( """ # Prepare metadata local_meta = self._nixl_wrapper.get_agent_metadata() - + # Setup ZMQ socket port = base_port + get_tensor_model_parallel_rank() path = make_zmq_path("tcp", host, port) logger.debug("Starting handshake listener on path: %s", path) - + with zmq_ctx(zmq.ROUTER, path) as sock: ready_event.set() - + while not self._stop_listener.is_set(): try: identity, _, msg = sock.recv_multipart(flags=zmq.NOBLOCK) @@ -768,18 +740,21 @@ def _nixl_handshake_listener( # Send back the local xfer descs s_local_xfer_descs = self._nixl_wrapper.get_serialized_descs( self._local_xfer_dlist) - sock.send_multipart([identity, b"", s_local_xfer_descs]) - logger.debug("Sent back the local xfer descs to %s", identity) + sock.send_multipart( + [identity, b"", s_local_xfer_descs]) + logger.debug("Sent back the local xfer descs to %s", + identity) else: # Send the agent metadata remote_agent_name = self._nixl_wrapper.add_remote_agent( msg) self._remote_agents[identity] = remote_agent_name - logger.debug("Successfully received handshake from %s", + logger.debug("Successfully received handshake from %s", identity) # Send back the local metadata to the sender sock.send_multipart([identity, b"", local_meta]) - logger.debug("Sent local metadata back to %s", identity) + logger.debug("Sent local metadata back to %s", + identity) except zmq.error.Again: # No message available @@ -797,8 +772,9 @@ def stop_handshake_listener(self) -> None: self._handshake_listener_t = None def close(self): - logger.info("Watermark information before closing: (low: %d, high: %d)", - self._allocator.low_watermark, self._allocator.high_watermark) + logger.info( + "Watermark information before closing: (low: %d, high: %d)", + self._allocator.low_watermark, self._allocator.high_watermark) self.stop_handshake_listener() self._nixl_wrapper.deregister_memory(self._reg_dlist) del self._nixl_wrapper @@ -844,7 +820,6 @@ class NixlSendTask(SendTask): # nixl transfer handle transfer_handle: Optional[nixl_xfer_handle] = None - def __post_init__(self) -> None: self.creation_time = time.time() @@ -860,7 +835,7 @@ def update_states(self) -> None: # check if the send is ready if not self.state.receiver_ready and self.receiver_paddr is None: rname, rpaddr = self.parent_sender.check_and_remove_prepared_send( - self.request_uuid) + self.request_uuid) if rname is not None: assert rpaddr != -1 self.receiver_paddr = rpaddr @@ -878,21 +853,21 @@ class NixlPrefillManager(KVSenderInterface): with NIXL for sending data. """ - def __init__(self, buffer_size: int) -> None: + def __init__(self, buffer_size: int) -> None: super().__init__() nixl_page_size = DEFAULT_NIXL_PAGE_SIZE self._buffer_size = buffer_size self._allocator = RingBufferAllocator(self._buffer_size, nixl_page_size) - self._nixl_sender = NixlCPUSender( - buffer_size, self._allocator.get_buffer_ptr(), - nixl_page_size) + self._nixl_sender = NixlCPUSender(buffer_size, + self._allocator.get_buffer_ptr(), + nixl_page_size) def create_send_task( - self, - source_spec: SourceSpec, - destination_spec: DestinationSpec, - ) -> SendTask: + self, + source_spec: SourceSpec, + destination_spec: DestinationSpec, + ) -> SendTask: """Create a non-ready send task with a CPU buffer allocated. 
Args: @@ -905,7 +880,7 @@ def create_send_task( size = source_spec.get_size() address, buffer = self._allocator.allocate(size) while address == -1: - # If allocation fails, wait for a while to process + # If allocation fails, wait for a while to process # and try again time.sleep(0.001) self.progress() @@ -913,19 +888,17 @@ def create_send_task( assert buffer is not None, "Buffer allocation failed" # Prepare the send request in NixlSender - req_uuid = self._nixl_sender.prepare_send( - source_spec, destination_spec) + req_uuid = self._nixl_sender.prepare_send(source_spec, + destination_spec) # Create a send task with the allocated buffer - task = NixlSendTask( - buffer=buffer, - source_spec=source_spec, - destination_spec=destination_spec, - state=SendTaskState(), - buffer_vaddr=address, - parent_sender=self._nixl_sender, - request_uuid=req_uuid - ) + task = NixlSendTask(buffer=buffer, + source_spec=source_spec, + destination_spec=destination_spec, + state=SendTaskState(), + buffer_vaddr=address, + parent_sender=self._nixl_sender, + request_uuid=req_uuid) self.add_send_task(task) return task @@ -949,14 +922,12 @@ def send_task(self, task: SendTask) -> None: assert isinstance(task, NixlSendTask), \ "Task is not a NixlSendTask" handle = self._nixl_sender.send( - self._allocator.virtual_to_physical(task.buffer_vaddr), - task.receiver_paddr, - task.source_spec.get_size(), - task.request_uuid, - task.destination_spec) + self._allocator.virtual_to_physical(task.buffer_vaddr), + task.receiver_paddr, task.source_spec.get_size(), + task.request_uuid, task.destination_spec) task.transfer_handle = handle task.mark_sending() - return + return def pre_progress_hook(self) -> None: for task in self.get_send_tasks(): @@ -981,20 +952,18 @@ def close(self): self.wait_for_all_tasks() self._nixl_sender.close() + class NixlDecodeManager: - def __init__(self, - buffer_size: int, - host: str, - port: int) -> None: + + def __init__(self, buffer_size: int, host: str, port: int) -> None: self.nixl_page_size = DEFAULT_NIXL_PAGE_SIZE self._buffer_size = buffer_size self._allocator = RingBufferAllocator(self._buffer_size, self.nixl_page_size) - self._nixl_receiver = NixlCPUReceiver(self._allocator, + self._nixl_receiver = NixlCPUReceiver(self._allocator, self.nixl_page_size) self._nixl_receiver.start_handshake_listener(host, port) - # How many tokens are received for each request, each layer # (p_request_id, layer_id) -> num_tokens self._received_tokens: dict[str, dict[int, int]] = {} @@ -1005,7 +974,7 @@ def __init__(self, # The detailed specs of the requests # (p_request_id, layer_id) -> (SourceSpec, vaddr) - self._request_specs: dict[tuple(str, int), + self._request_specs: dict[tuple(str, int), list[tuple(SourceSpec, int)]] = {} # Metadata @@ -1015,13 +984,14 @@ def __init__(self, # Multi process receiving check # p_request_id -> number of ready workers - self._done_receiving_count: defaultdict[str, int] = defaultdict(lambda: 0) + self._done_receiving_count: defaultdict[str, + int] = defaultdict(lambda: 0) def _check_receive_and_update(self): """Checks the KV cache receiving status and update the internal states """ - finished_list = self._nixl_receiver.get_finished(clear = True) + finished_list = self._nixl_receiver.get_finished(clear=True) for source_spec, vaddr in finished_list: # Get the request id and layer id p_request_id = source_spec.request_id @@ -1029,21 +999,22 @@ def _check_receive_and_update(self): num_received_tokens = source_spec.stop - source_spec.start if p_request_id not in self._expected_tokens: - 
self._expected_tokens[p_request_id] = source_spec.num_all_tokens + self._expected_tokens[ + p_request_id] = source_spec.num_all_tokens # Update the received tokens if p_request_id not in self._received_tokens: self._received_tokens[p_request_id] = {} if layer_id not in self._received_tokens[p_request_id]: self._received_tokens[p_request_id][layer_id] = 0 - self._received_tokens[p_request_id][layer_id] += num_received_tokens + self._received_tokens[p_request_id][ + layer_id] += num_received_tokens # Update received specs if (p_request_id, layer_id) not in self._request_specs: self._request_specs[(p_request_id, layer_id)] = [] self._request_specs[(p_request_id, layer_id)].append( - (source_spec, vaddr) - ) + (source_spec, vaddr)) def progress(self) -> None: """Process the received requests and the data. Updates the internal @@ -1080,7 +1051,7 @@ def get_finished(self, num_expected_layers: int) -> list[str]: if self.world_size == 1: return ready_requests - # For multi-process + # For multi-process if self.rank == 0: for p_request_id in ready_requests: self._done_receiving_count[p_request_id] += 1 @@ -1088,7 +1059,7 @@ def get_finished(self, num_expected_layers: int) -> list[str]: other_ranks_finished_ids: list[str] = [] for i in range(1, self.world_size): other_ranks_finished_ids.extend( - self.tp_group.recv_object(src=i)) + self.tp_group.recv_object(src=i)) for p_request_id in other_ranks_finished_ids: self._done_receiving_count[p_request_id] += 1 @@ -1103,22 +1074,18 @@ def get_finished(self, num_expected_layers: int) -> list[str]: self.tp_group.send_object(ready_requests, dst=0) return ready_requests - def _create_decoder_kv_spec(self, - source_spec: SourceSpec, + def _create_decoder_kv_spec(self, source_spec: SourceSpec, vaddr: int) -> DecoderKVSpec: """Create a DecoderKVSpec from the source spec and the virtual address. 
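
The per-request, per-layer counters updated above feed the readiness decision in get_finished. The comparison itself sits partly outside the visible hunks, so treat the condition in this sketch as an assumption: a prefill request counts as ready once every expected layer has received all of its tokens.

# Assumption-labeled sketch of the decode-side readiness check.
def ready_requests(expected_tokens: dict[str, int],
                   received_tokens: dict[str, dict[int, int]],
                   num_expected_layers: int) -> list[str]:
    ready = []
    for req_id, expected in expected_tokens.items():
        per_layer = received_tokens.get(req_id, {})
        if len(per_layer) == num_expected_layers and \
                all(n >= expected for n in per_layer.values()):
            ready.append(req_id)
    return ready

# "p0" has both layers complete; "p1" is still missing layer 1.
assert ready_requests({"p0": 32, "p1": 32},
                      {"p0": {0: 32, 1: 32}, "p1": {0: 32}}, 2) == ["p0"]
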
""" # Get the correct buffer - return DecoderKVSpec( - start = source_spec.start, - stop = source_spec.stop, - buffer = self._allocator.view_as_tensor( - vaddr, source_spec.dtype, source_spec.tensor_shape) - ) - + return DecoderKVSpec(start=source_spec.start, + stop=source_spec.stop, + buffer=self._allocator.view_as_tensor( + vaddr, source_spec.dtype, + source_spec.tensor_shape)) - def get_kv_specs(self, - p_request_id: str, + def get_kv_specs(self, p_request_id: str, layer_id: int) -> list[DecoderKVSpec]: """Get the KV specs for the given request id and layer id, which will be used for connector to load the KV back to CPU @@ -1129,11 +1096,12 @@ def get_kv_specs(self, """ ret = [] if (p_request_id, layer_id) not in self._request_specs: - logger.warning("Request %s not found in request specs", + logger.warning("Request %s not found in request specs", (p_request_id, layer_id)) return ret - for source_spec, vaddr in self._request_specs[(p_request_id, layer_id)]: + for source_spec, vaddr in self._request_specs[(p_request_id, + layer_id)]: # Create the decoder kv spec decoder_kv_spec = self._create_decoder_kv_spec(source_spec, vaddr) ret.append(decoder_kv_spec) @@ -1155,14 +1123,15 @@ def free_request(self, p_request_id): "Found received tokens but no request specs" # Free the memory - for src_spec, vaddr in self._request_specs[(p_request_id, layer_id)]: + for src_spec, vaddr in self._request_specs[(p_request_id, + layer_id)]: self._allocator.free(vaddr) # Clear the request specs self._request_specs.pop((p_request_id, layer_id), None) else: - logger.warning("Request %s not found in received tokens", + logger.warning("Request %s not found in received tokens", p_request_id) def close(self): From 2e2937fdfb3f0884a1c6395f0e52e5ea5584da44 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 16:46:11 -0700 Subject: [PATCH 14/28] [fix] ruff errors Signed-off-by: ApostaC --- .../kv_connector/v1/cpu_connector.py | 19 +++++++------ .../kv_connector/v1/cpu_connector_utils.py | 5 ++-- .../kv_connector/v1/nixl_cpu_utils.py | 27 ++++++++++--------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index e3ff0722451d..0d22de168e92 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata + from vllm.config import KVTransferConfig from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import CachedRequestData, NewRequestData @@ -300,7 +301,8 @@ def from_request_tracker( """ assert request_tracker.num_total_tokens <= \ len(request_tracker.allocated_block_ids) * block_size, \ - f"Request {req_id} has more tokens than allocated blocks" + f"Request {request_tracker.req_id} has more tokens " + \ + "than allocated blocks" token_range = slice(request_tracker.num_saved_tokens, request_tracker.num_total_tokens) @@ -425,13 +427,15 @@ def __init__(self, vllm_config: "VllmConfig", elif role == KVConnectorRole.WORKER: # Prefiller side sender if self.kv_role == "kv_producer": + # TODO: remove the hard-code here self._kv_sender = NixlPrefillManager(1024 * 1024 * 1024) # 1GB for debug elif self.kv_role == "kv_consumer": + # TODO: remove the hard-code here self._kv_receiver = NixlDecodeManager( 1024 * 1024 * 1024, # 1GB 
for debug "localhost", - 54321, # Changed from string to int to match the class definition + 54321, ) else: raise ValueError(f"Unknown kv_role: {self.kv_role}") @@ -542,7 +546,8 @@ def get_num_new_matched_tokens( self._should_be_ready_reqs.remove(request.request_id) return 0, False - if kv_transfer_params is None or "prefill_request_id" not in kv_transfer_params: + if kv_transfer_params is None or \ + "prefill_request_id" not in kv_transfer_params: logger.warning("Request %s does not have prefill_request_id", request.request_id) #return 0, False @@ -634,11 +639,9 @@ def _get_kv_shape(self, num_blocks: int) -> torch.Size: def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self._gpu_kv_caches = kv_caches - idx = 0 - for layer_name in kv_caches: + for idx, layer_name in enumerate(kv_caches): self._layer_name_to_id[layer_name] = idx self._layer_id_to_name[idx] = layer_name - idx += 1 self._kv_page_shape = kv_caches[list(kv_caches.keys())[0]].shape[2:] @@ -832,8 +835,8 @@ def get_finished( p_ready_reqs = self._kv_receiver.get_finished( len(self._gpu_kv_caches)) ret = set() - # TODO: Bug here: we need to send the prefill request id from scheduler - # connector to the worker connector in kv_params + # TODO: Bug here: we need to send the prefill request id from + # scheduler connector to the worker connector in kv_params for p_req_id in p_ready_reqs: ret.add(self._prefill_req_id_to_decode_req_id[p_req_id]) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index d38f2bbed684..b7160b026476 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -30,7 +30,8 @@ class DestinationSpec: base_port: int def __str__(self) -> str: - return f"DestinationSpec(rank={self.rank}, host={self.host}, base_port={self.base_port})" + return f"DestinationSpec(rank={self.rank}, " + \ + f"host={self.host}, base_port={self.base_port})" def get_id(self) -> str: """Get the id of the destination spec. @@ -79,7 +80,7 @@ def dtype(self) -> torch.dtype: return getattr(torch, self.dtype_str) def get_size(self) -> int: - """Get the size in bytes of the cooresponding kv cache.""" + """Get the size in bytes of the corresponding kv cache.""" return math.prod(self.shape) * self.dtype.itemsize def __str__(self) -> str: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 7abdf4f21127..3983891931c3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -210,7 +210,7 @@ def free(self, address: int) -> None: # Else, set the low_watermark to the first address in the allocated # dict - for addr in self._allocated.keys(): + for addr in self._allocated: new_low_watermark = addr break self._low_watermark = new_low_watermark @@ -615,7 +615,7 @@ def _process_msgs(self): notifs = self._nixl_wrapper.get_new_notifs() for remote_agent_name in notifs: for msg in notifs[remote_agent_name]: - # Decode the messag + # Decode the message obj = self._msg_decoder.decode(msg) if obj.msg_type == "REQMSG": # Add the request to the pending allocation @@ -687,8 +687,8 @@ def get_finished(self, clear=False) -> list[tuple[SourceSpec, int]]: clear (bool): Whether to clear the finished requests or not. 
Returns: - list[tuple[SourceSpec, int]]: A list of tuples containing the source - spec and the address. + list[tuple[SourceSpec, int]]: A list of tuples containing the + source spec and the address. """ ret = [(source_spec, vaddr) for source_spec, vaddr in self._finished_requests.values()] @@ -714,12 +714,14 @@ def start_handshake_listener(self, host: str, base_port: int) -> None: def _nixl_handshake_listener(self, host: str, base_port: int, ready_event: threading.Event) -> None: - """Background thread that listens for and responds to handshake requests. + """Background thread that listens for and responds to handshake + requests. Args: host (str): Host address to listen on base_port (int): Base port number to listen on - ready_event (threading.Event): Event to signal when listener is ready + ready_event (threading.Event): Event to signal when listener is + ready """ # Prepare metadata local_meta = self._nixl_wrapper.get_agent_metadata() @@ -738,8 +740,8 @@ def _nixl_handshake_listener(self, host: str, base_port: int, if msg == b"get_xfer_descs": # Send back the local xfer descs - s_local_xfer_descs = self._nixl_wrapper.get_serialized_descs( - self._local_xfer_dlist) + s_local_xfer_descs = self._nixl_wrapper.\ + get_serialized_descs(self._local_xfer_dlist) sock.send_multipart( [identity, b"", s_local_xfer_descs]) logger.debug("Sent back the local xfer descs to %s", @@ -841,10 +843,9 @@ def update_states(self) -> None: self.receiver_paddr = rpaddr self.state.receiver_ready = True - if not self.is_done() and self.transfer_handle is not None: - # Check if the transfer is finished - if self.parent_sender.is_send_finished(self.transfer_handle): - self.state.send_done = True + if not self.is_done() and self.transfer_handle is not None \ + and self.parent_sender.is_send_finished(self.transfer_handle): + self.state.send_done = True class NixlPrefillManager(KVSenderInterface): @@ -1063,7 +1064,7 @@ def get_finished(self, num_expected_layers: int) -> list[str]: for p_request_id in other_ranks_finished_ids: self._done_receiving_count[p_request_id] += 1 - all_done_recving: list[str] + all_done_recving: list[str] = [] for p_request_id in self._done_receiving_count: if self._done_receiving_count[p_request_id] == \ self.world_size: From 242098b6690bfe27978319dffd80dc2e83855156 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 16:58:22 -0700 Subject: [PATCH 15/28] [fix] format checker issue for tests Signed-off-by: ApostaC --- .../test_cpu_connector_kernels.py | 3 +- .../cpu_kv_integration/test_nixl_cpu_utils.py | 31 +++++++------------ .../cpu_kv_integration/toy_decoder_manager.py | 3 +- .../cpu_kv_integration/toy_receiver.py | 15 +++++---- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py b/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py index a19ba9188019..06525cbba7eb 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_cpu_connector_kernels.py @@ -10,7 +10,8 @@ @pytest.fixture def device_tensors(): """Create sample device tensors for testing.""" - # Create tensors with shape (2, num_blocks, page_size, head_size, hidden_size) + # Create tensors with shape (2, num_blocks, page_size, head_size, + # hidden_size) num_blocks = 4 page_size = 16 head_size = 8 diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py index 
157daab4ce13..60e443b62d6c 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -5,12 +5,17 @@ import torch import torch.multiprocessing as mp +import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( DestinationSpec, NixlCPUReceiver, NixlCPUSender, RingBufferAllocator, SourceSpec) try: - from nixl._api import nixl_agent as NixlWrapper + #from nixl._api import nixl_agent as NixlWrapper + import importlib + spec = importlib.util.find_spec("nixl._api.nixl_agent") + if spec is None: + raise ImportError("NIXL is not available") NIXL_AVAILABLE = True except ImportError: NIXL_AVAILABLE = False @@ -21,7 +26,6 @@ def run_receiver(buffer_config, host, base_port, rank, ready_event, """Process function for running the receiver.""" try: # Mock tensor_model_parallel_rank for this process - import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: rank # Create ring buffer allocator @@ -53,7 +57,6 @@ def run_sender(buffer_config, host, base_port, rank, receiver_ready_event): """Process function for running the sender.""" try: # Mock tensor_model_parallel_rank for this process - import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: rank # Create ring buffer allocator @@ -94,7 +97,6 @@ def run_receiver_with_progress(buffer_config, """Process function for running the receiver with progress loop.""" try: # Mock tensor_model_parallel_rank for this process - import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: rank # Create ring buffer allocator @@ -150,7 +152,6 @@ def run_sender_with_protocol(buffer_config, host, base_port, rank, """Process function for running the sender with protocol communication.""" try: # Mock tensor_model_parallel_rank for this process - import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: rank # Create ring buffer allocator @@ -284,21 +285,10 @@ def test_receiver_creation(self, buffer_config): assert receiver._reg_dlist is not None assert receiver._local_xfer_dlist is not None - def test_creation_with_invalid_buffer_size(self, buffer_config): - """Test creation with invalid buffer size.""" - with pytest.raises( - Exception - ): # Specific exception type depends on NIXL implementation - # Create allocator with invalid size - allocator = RingBufferAllocator( - size=0, # Invalid size - align_to=buffer_config['nixl_page_size']) - - NixlCPUReceiver(allocator=allocator, - nixl_page_size=buffer_config['nixl_page_size']) - def test_nixl_handshake_multiprocess(self, buffer_config): - """Test NIXL handshake between sender and receiver in separate processes.""" + """Test NIXL handshake between sender and receiver in separate + processes. 
+ """ # Setup test parameters test_host = "127.0.0.1" test_base_port = 50051 @@ -376,7 +366,8 @@ def test_nixl_protocol_communication(self, buffer_config): try: # Wait for protocol communication to complete protocol_complete = protocol_success.wait(timeout=20) - assert protocol_complete, "Protocol communication failed or timed out" + assert protocol_complete, \ + "Protocol communication failed or timed out" # Wait for sender process to complete sender_process.join(timeout=5) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py index bd7f73c8f3bb..68b8d4e3f8d6 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decoder_manager.py @@ -4,6 +4,7 @@ import torch.multiprocessing as mp +import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( NixlDecodeManager) @@ -18,11 +19,9 @@ def main(): # Buffer configuration buffer_size = 1 << 30 # 1GB - nixl_page_size = 4096 # Standard page size try: # Mock tensor_model_parallel_rank for this process - import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: test_rank utils.get_tensor_model_parallel_world_size = lambda: 1 utils.get_tp_group = lambda: None diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py index 71a3e7525a54..9059151d56f5 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_receiver.py @@ -4,6 +4,7 @@ import torch.multiprocessing as mp +import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils from vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils import ( NixlCPUReceiver, RingBufferAllocator) @@ -21,7 +22,6 @@ def main(): try: # Mock tensor_model_parallel_rank for this process - import vllm.distributed.kv_transfer.kv_connector.v1.nixl_cpu_utils as utils utils.get_tensor_model_parallel_rank = lambda: test_rank # Create ring buffer allocator @@ -46,19 +46,18 @@ def main(): if finished: for source_spec, vaddr in finished: print( - f"Received data from request {source_spec.request_id}" - ) + f"Got data from request {source_spec.request_id}") paddr = allocator.virtual_to_physical(vaddr) # Verify received data num_elements = source_spec.get_size() - received_data = allocator._buffer[paddr : paddr + num_elements]\ + received_data = allocator._buffer\ + [paddr : paddr + num_elements]\ .view(source_spec.dtype)\ .reshape(source_spec.tensor_shape) - print( - f"Received layer {source_spec.layer_id} tokens " - f"{source_spec.start} - {source_spec.stop} of request " - f"{source_spec.request_id}") + print(f"Received layer {source_spec.layer_id} tokens " + f"{source_spec.start} - {source_spec.stop} of " + f"request {source_spec.request_id}") print(f"The shape is {received_data.shape}") print(f"The digest is {received_data.mean()}") allocator.free(vaddr) From 76e1473f5c5b95622d59f0bf5737d58d94432b29 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 17:17:16 -0700 Subject: [PATCH 16/28] [remove] outdated tests Signed-off-by: ApostaC --- tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py | 2 +- .../{test_toy_example.py => toy_example_outdated.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename 
tests/v1/kv_connector/cpu_kv_integration/{test_toy_example.py => toy_example_outdated.py} (100%) diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py index 60e443b62d6c..d4837e0d1c56 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py +++ b/tests/v1/kv_connector/cpu_kv_integration/test_nixl_cpu_utils.py @@ -13,7 +13,7 @@ try: #from nixl._api import nixl_agent as NixlWrapper import importlib - spec = importlib.util.find_spec("nixl._api.nixl_agent") + spec = importlib.util.find_spec("nixl._api") if spec is None: raise ImportError("NIXL is not available") NIXL_AVAILABLE = True diff --git a/tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example_outdated.py similarity index 100% rename from tests/v1/kv_connector/cpu_kv_integration/test_toy_example.py rename to tests/v1/kv_connector/cpu_kv_integration/toy_example_outdated.py From 01de06adb9d889b1a3aa1a5c90bd7a47df0ae8f8 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 17:28:20 -0700 Subject: [PATCH 17/28] [remove] hardcodes and fix precommit issues Signed-off-by: ApostaC --- .../cpu_kv_integration/toy_decode.py | 1 + .../cpu_kv_integration/toy_example.py | 1 + .../kv_connector/v1/cpu_connector.py | 20 ++++++++++--------- .../kv_connector/v1/cpu_connector_utils.py | 10 ++-------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index a5a9f7d63cd5..bef73cc1d05c 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -35,6 +35,7 @@ kv_connector_extra_config={ "host": "localhost", "port": 54321, + "size": 4, }, ), #load_format="dummy", diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index fd53d0a88ea1..5531a2137a92 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -35,6 +35,7 @@ kv_connector_extra_config={ "host": "localhost", "port": 54321, + "size": 4, }, ), #load_format="dummy", diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 0d22de168e92..a00b9639b2c2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -398,6 +398,8 @@ def validate_kv_transfer_config( "CPUConnector: must have 'host' in kv_connector_extra_config" assert "port" in extra_config, \ "CPUConnector: must have 'port' in kv_connector_extra_config" + assert "size" in extra_config, \ + "CPUConnector: must have 'size' in kv_connector_extra_config" class CPUConnector(KVConnectorBase_V1): @@ -412,10 +414,11 @@ def __init__(self, vllm_config: "VllmConfig", validate_kv_transfer_config(vllm_config.kv_transfer_config) extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config self._host = extra_config["host"] - self._port = extra_config["port"] - if isinstance(self._port, str): - # Convert the port to an integer if it's a string - self._port = int(self._port) + self._port = int(extra_config["port"]) + # Convert GB to bytes and align to 4K for storage size + kv_size_in_bytes = float(extra_config["size"]) * (1 << 30) + kv_size_in_bytes = 
int(kv_size_in_bytes) & (~0xFFF) # Align to 4K + self._kv_size = kv_size_in_bytes self.kv_role = vllm_config.kv_transfer_config.kv_role @@ -428,14 +431,13 @@ def __init__(self, vllm_config: "VllmConfig", # Prefiller side sender if self.kv_role == "kv_producer": # TODO: remove the hard-code here - self._kv_sender = NixlPrefillManager(1024 * 1024 * - 1024) # 1GB for debug + self._kv_sender = NixlPrefillManager(self._kv_size) elif self.kv_role == "kv_consumer": # TODO: remove the hard-code here self._kv_receiver = NixlDecodeManager( - 1024 * 1024 * 1024, # 1GB for debug - "localhost", - 54321, + self._kv_size, + self._host, + self._port, ) else: raise ValueError(f"Unknown kv_role: {self.kv_role}") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index b7160b026476..7c8146238c81 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -293,19 +293,13 @@ def send_task(self, task: SendTask) -> None: raise NotImplementedError("send_task() not implemented") @abstractmethod - def pre_progress_hook(self, task: SendTask) -> None: + def pre_progress_hook(self) -> None: """Hook to be called before processing the send task. - - Args: - task (SendTask): The send task to be processed. """ raise NotImplementedError("pre_progress_hook() not implemented") @abstractmethod - def post_progress_hook(self, task: SendTask) -> None: + def post_progress_hook(self) -> None: """Hook to be called after processing the send task. - - Args: - task (SendTask): The send task to be processed. """ raise NotImplementedError("post_progress_hook() not implemented") From 104418ea7057c8f7f9aed0b8fc63f79dbcf70622 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 17:52:33 -0700 Subject: [PATCH 18/28] [remove] previous debug codes Signed-off-by: ApostaC --- .../cpu_kv_integration/toy_decode.py | 17 ++++++++++++++++- .../cpu_kv_integration/toy_example.py | 1 + .../kv_connector/v1/cpu_connector.py | 13 ++----------- .../kv_connector/v1/nixl_cpu_utils.py | 6 +++--- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index bef73cc1d05c..9895bd0a6042 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -10,6 +10,11 @@ from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig + +def get_kv_transfer_params(req_id: int): + return {"prefill_request_id": str(req_id)} + + if __name__ == "__main__": context = "Hi " * 1000 @@ -23,7 +28,16 @@ context4 + "The capital of China is", ] - sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + sampling_param_base = SamplingParams(temperature=0, + top_p=0.95, + max_tokens=10) + sampling_params = [] + for i in range(len(prompts)): + sampling_param = sampling_param_base.clone() + sampling_param.extra_args = { + "kv_transfer_params": get_kv_transfer_params(i), + } + sampling_params.append(sampling_param) llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", @@ -42,6 +56,7 @@ max_model_len=2048, max_num_batched_tokens=2048, block_size=128, + tensor_parallel_size=1, ) # 1ST generation (prefill instance) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index 
5531a2137a92..bfe019733ea5 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -42,6 +42,7 @@ max_model_len=2048, max_num_batched_tokens=2048, block_size=128, + tensor_parallel_size=1, ) # 1ST generation (prefill instance) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index a00b9639b2c2..3545315acd23 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -430,10 +430,8 @@ def __init__(self, vllm_config: "VllmConfig", elif role == KVConnectorRole.WORKER: # Prefiller side sender if self.kv_role == "kv_producer": - # TODO: remove the hard-code here self._kv_sender = NixlPrefillManager(self._kv_size) elif self.kv_role == "kv_consumer": - # TODO: remove the hard-code here self._kv_receiver = NixlDecodeManager( self._kv_size, self._host, @@ -552,13 +550,7 @@ def get_num_new_matched_tokens( "prefill_request_id" not in kv_transfer_params: logger.warning("Request %s does not have prefill_request_id", request.request_id) - #return 0, False - - # DEBUG: Set the prefill_request_id to the request id - # This is a temporary fix to make the code work - self._should_be_ready_reqs.add(request_id) - self._connect_request_ids(request_id, request_id) - return num_tokens // self._block_size * self._block_size, True + return 0, False prefill_request_id = kv_transfer_params["prefill_request_id"] self._connect_request_ids(prefill_request_id, request_id) @@ -724,6 +716,7 @@ def wait_for_layer_load(self, layer_name: str) -> None: for p_req_id in self._inflight_h2d_requests: logger.info("Freeing request %s", p_req_id) self._kv_receiver.free_request(p_req_id) + self._inflight_h2d_requests.clear() @_lmcache_nvtx_annotate def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, @@ -837,8 +830,6 @@ def get_finished( p_ready_reqs = self._kv_receiver.get_finished( len(self._gpu_kv_caches)) ret = set() - # TODO: Bug here: we need to send the prefill request id from - # scheduler connector to the worker connector in kv_params for p_req_id in p_ready_reqs: ret.add(self._prefill_req_id_to_decode_req_id[p_req_id]) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 3983891931c3..0acd891713b3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -570,7 +570,7 @@ class NixlCPUReceiver: def __init__( self, - allocator: RingBufferAllocator = None, + allocator: RingBufferAllocator, nixl_page_size: int = 4096, ) -> None: self._buffer_size = allocator.get_size() @@ -975,8 +975,8 @@ def __init__(self, buffer_size: int, host: str, port: int) -> None: # The detailed specs of the requests # (p_request_id, layer_id) -> (SourceSpec, vaddr) - self._request_specs: dict[tuple(str, int), - list[tuple(SourceSpec, int)]] = {} + self._request_specs: dict[tuple[str, int], list[tuple[SourceSpec, + int]]] = {} # Metadata self.rank = get_tensor_model_parallel_rank() From b4994f0de4403a019c735943734a79d968ee1616 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 27 May 2025 18:29:37 -0700 Subject: [PATCH 19/28] [Add] bug fix for TP and correctly shutdown Signed-off-by: ApostaC --- .../cpu_kv_integration/toy_decode.py | 2 +- .../cpu_kv_integration/toy_example.py | 9 ++++++--- 
.../kv_transfer/kv_connector/v1/base.py | 8 ++++++++ .../kv_connector/v1/cpu_connector.py | 12 ++++++++---- .../kv_connector/v1/nixl_cpu_utils.py | 17 ++++++++++++----- .../kv_transfer/kv_transfer_state.py | 2 ++ 6 files changed, 37 insertions(+), 13 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index 9895bd0a6042..89310d5fdcd1 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -56,7 +56,7 @@ def get_kv_transfer_params(req_id: int): max_model_len=2048, max_num_batched_tokens=2048, block_size=128, - tensor_parallel_size=1, + tensor_parallel_size=2, ) # 1ST generation (prefill instance) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index bfe019733ea5..6862eabe16ed 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -42,7 +42,7 @@ max_model_len=2048, max_num_batched_tokens=2048, block_size=128, - tensor_parallel_size=1, + tensor_parallel_size=2, ) # 1ST generation (prefill instance) @@ -67,5 +67,8 @@ # HACK: for offline single-process inference only # Wait for all send finishes from vllm.distributed.kv_transfer import get_kv_transfer_group - cpu_connector = get_kv_transfer_group() - cpu_connector.close() + try: + cpu_connector = get_kv_transfer_group() + cpu_connector.close() + except Exception: + pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index ef4460a592bd..5e0f6315357e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -256,3 +256,11 @@ def request_finished( returned by the engine. """ return False, None + + def close(self) -> None: + """ + Close the connector. This is called when the connector is no longer + needed. + """ + logger.debug("Closing KVConnectorBase_V1") + pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 3545315acd23..1ad2050d1584 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -209,6 +209,9 @@ class PrefillRequestTracker: # Request id req_id: str + # Block ids that are already allocated for this request + allocated_block_ids: list[int] + # Total number of tokens in the "full request" num_all_tokens: int = 0 @@ -218,9 +221,6 @@ class PrefillRequestTracker: # Number of tokens that are already saved num_saved_tokens: int = 0 - # Block ids that are already allocated for this request - allocated_block_ids: list[int] = None - @staticmethod def from_new_request( new_request: "NewRequestData", @@ -240,10 +240,10 @@ def from_new_request( return PrefillRequestTracker( req_id=new_request.req_id, + allocated_block_ids=unfolded_block_ids, num_all_tokens=len(new_request.prompt_token_ids), num_total_tokens=num_tokens_to_compute, num_saved_tokens=0, - allocated_block_ids=unfolded_block_ids, ) def update(self, cached_request: "CachedRequestData") -> None: @@ -848,7 +848,11 @@ def close(self): This prevents overwrites of paged KV buffer before saving done. 
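The field reorder in PrefillRequestTracker above is forced by Python's dataclass rules: once the bogus "= None" default is dropped, allocated_block_ids has no default value and therefore must precede the fields that do. A minimal sketch of the constraint (class and field names here are illustrative, not the real tracker):

    from dataclasses import dataclass

    @dataclass
    class Tracker:
        req_id: str
        allocated_block_ids: list[int]   # no default, so it must come before defaulted fields
        num_saved_tokens: int = 0        # defaulted fields follow

    t = Tracker(req_id="req-0", allocated_block_ids=[3, 4, 5])
    assert t.num_saved_tokens == 0

Putting a non-default field after a defaulted one would raise TypeError at class definition time, which is why the keyword order in from_new_request was adjusted in the same hunk.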
""" + logger.warning("Closing the CPUConnector") if hasattr(self, "_kv_sender") and self._kv_sender is not None: self._kv_sender.close() if hasattr(self, "_kv_receiver") and self._kv_receiver is not None: self._kv_receiver.close() + + def __del__(self): + self.close() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 0acd891713b3..30114fb7b752 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -102,7 +102,7 @@ def __init__(self, size: int, align_to: int = 256) -> None: self._low_watermark = 0 self._align_to = align_to - self._allocated = OrderedDict() # Track allocated buffers + self._allocated: OrderedDict = OrderedDict() # Track allocated buffers # Register pin memory cudart = torch.cuda.cudart() @@ -223,7 +223,7 @@ def high_watermark(self) -> int: def low_watermark(self) -> int: return self._low_watermark - def virtual_to_physical(self, vaddr: int) -> torch.Tensor: + def virtual_to_physical(self, vaddr: int) -> int: """Convert a virtual address to a physical address. Args: @@ -391,7 +391,7 @@ def send( src_paddr: int, dst_paddr: int, data_size: int, - req_uuid: int, + req_uuid: str, destination_spec: DestinationSpec, ) -> nixl_xfer_handle: """Send data from src_addr to dst_addr using NIXL. @@ -910,6 +910,8 @@ def free_task(self, task: SendTask) -> None: Args: task (SendTask): The send task to be freed. """ + assert isinstance(task, NixlSendTask), \ + "Task is not a NixlSendTask" # Free the buffer in the ring buffer allocator self._allocator.free(task.buffer_vaddr) @@ -922,6 +924,8 @@ def send_task(self, task: SendTask) -> None: """ assert isinstance(task, NixlSendTask), \ "Task is not a NixlSendTask" + assert task.receiver_paddr is not None, \ + "Receiver physical address is not set in the task" handle = self._nixl_sender.send( self._allocator.virtual_to_physical(task.buffer_vaddr), task.receiver_paddr, task.source_spec.get_size(), @@ -1069,7 +1073,10 @@ def get_finished(self, num_expected_layers: int) -> list[str]: if self._done_receiving_count[p_request_id] == \ self.world_size: all_done_recving.append(p_request_id) - self._done_receiving_count.pop(p_request_id) + + # Clear the done receiving count for the requests that are done + for p_request_id in all_done_recving: + self._done_receiving_count.pop(p_request_id) return all_done_recving else: self.tp_group.send_object(ready_requests, dst=0) @@ -1095,7 +1102,7 @@ def get_kv_specs(self, p_request_id: str, p_request_id (str): The original request id from prefiller. layer_id (int): The layer id of the request. 
""" - ret = [] + ret: list[DecoderKVSpec] = [] if (p_request_id, layer_id) not in self._request_specs: logger.warning("Request %s not found in request specs", (p_request_id, layer_id)) diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 25d2f2cf5c6e..62a65cf8e6ba 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import atexit from typing import TYPE_CHECKING, Optional from vllm import envs @@ -62,6 +63,7 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: if envs.VLLM_USE_V1: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v1( config=vllm_config, role=KVConnectorRole.WORKER) + atexit.register(_KV_CONNECTOR_AGENT.close, ) else: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v0( rank=get_world_group().rank, From 1f019211bba34fa1a4271727764c2ae4f054d69e Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 28 May 2025 17:02:55 -0700 Subject: [PATCH 20/28] [fix] concurrency bug in TP > 1 Signed-off-by: ApostaC --- .../cpu_kv_integration/output_decode.txt | 8 +-- .../cpu_kv_integration/toy_decode.py | 11 ++-- .../kv_transfer/kv_connector/v1/base.py | 3 +- .../kv_connector/v1/cpu_connector.py | 58 +++++++++++++------ .../kv_connector/v1/nixl_cpu_utils.py | 9 +++ 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt b/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt index 2384fe2ab883..e688555677d2 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt +++ b/tests/v1/kv_connector/cpu_kv_integration/output_decode.txt @@ -1,4 +1,4 @@ -Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi Hi 
[... old output_decode.txt lines elided: each removed line was a long run of the repeated prompt token ("Hi", "Hello", or "How") ending in the same generated text that the shortened "+" lines below retain ...]
How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How How The capital of China is Beijing. Beijing is a city in northern China. + Hi Hi Hi Hi Hello, my name is [Your Name] and I am a [Your +Hi Hi The capital of France is Paris. The capital of France is Paris. The +Hello Hello Hello Your name is not in the list. Please check your email for +ow How The capital of China is Beijing. Beijing is a city in northern China. diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index 89310d5fdcd1..2b1716f98989 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -69,7 +69,7 @@ def get_kv_transfer_params(req_id: int): for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) + new_prompts.append(prompt[-30:] + generated_text) #print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # Write new_prompts to output.txt @@ -78,8 +78,9 @@ def get_kv_transfer_params(req_id: int): f.write(prompt + "\n") print(f"Saved {len(new_prompts)} prompts to output.txt") - # HACK: for offline single-process inference only - # Wait for all send finishes from vllm.distributed.kv_transfer import get_kv_transfer_group - cpu_connector = get_kv_transfer_group() - cpu_connector.close() + try: + cpu_connector = get_kv_transfer_group() + cpu_connector.close() + except Exception: + pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 5e0f6315357e..386cfde3dfd8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -262,5 +262,4 @@ def close(self) -> None: Close the connector. This is called when the connector is no longer needed. 
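For the offline decode run, the toy script in this patch attaches the matching prefill request id to every request through SamplingParams.extra_args instead of hard-coding the mapping in the connector. Condensed from the diff above (the request ids are illustrative; in a real run they must be whatever ids the prefiller used):

    from vllm import SamplingParams

    base = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
    sampling_params = []
    for i in range(4):
        p = base.clone()
        # Ties this decode request back to the corresponding prefill request.
        p.extra_args = {"kv_transfer_params": {"prefill_request_id": str(i)}}
        sampling_params.append(p)

The scheduler-side connector later reads kv_transfer_params["prefill_request_id"] to pair incoming KV with the right decode request, so omitting it simply disables the async-load path for that request.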
""" - logger.debug("Closing KVConnectorBase_V1") - pass + return diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 1ad2050d1584..dfef3841e40c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -464,7 +464,8 @@ def __init__(self, vllm_config: "VllmConfig", self._decode_req_metas: dict[str, DecodeReqMeta] = {} # Decode h2d cuda events - self._decoder_cuda_events: list[torch.cuda.Event] = [] + # layer id -> cuda event + self._decoder_cuda_events: dict[int, torch.cuda.Event] = {} # In-progress kv load requests's prefill request ids self._inflight_h2d_requests: set[str] = set() @@ -663,25 +664,31 @@ def start_load_kv(self, forward_context: "ForwardContext", assert isinstance(meta, CPUConnectorMetadata), \ "Connector metadata is not of type CPUConnectorMetadata" + ready_decode_metas = [] + total_expected_tokens = [] for decode_meta in meta.decode_meta: self._connect_request_ids(decode_meta.prefill_req_id, decode_meta.req_id) - if not decode_meta.is_ready: - continue - - total_expected_tokens = len(decode_meta.block_ids) * \ - self._block_size - - self._inflight_h2d_requests.add(decode_meta.prefill_req_id) - for layer_id in range(len(self._gpu_kv_caches)): + if decode_meta.is_ready: + ready_decode_metas.append(decode_meta) + total_expected_tokens.append( + len(decode_meta.block_ids) * \ + self._block_size) + self._inflight_h2d_requests.add(decode_meta.prefill_req_id) + + # Vars needed: + # decode_meta.prefill_req_id + for layer_id in range(len(self._gpu_kv_caches)): + for decode_meta, total_expected in zip(ready_decode_metas, + total_expected_tokens): decode_specs = self._kv_receiver.get_kv_specs( decode_meta.prefill_req_id, layer_id) layer_name = self._layer_id_to_name[layer_id] dst_layer = self._gpu_kv_caches[layer_name] for decode_spec in decode_specs: start = decode_spec.start - stop = min(decode_spec.stop, total_expected_tokens) - if start >= total_expected_tokens: + stop = min(decode_spec.stop, total_expected) + if start >= total_expected: continue src_buffer = decode_spec.buffer block_ids = decode_meta.block_ids @@ -689,9 +696,25 @@ def start_load_kv(self, forward_context: "ForwardContext", with torch.cuda.stream(self._cuda_stream): h2d_page_copy(src_buffer, dst_layer, block_ids, start, stop, self._block_size) - event = torch.cuda.Event() - event.record(self._cuda_stream) - self._decoder_cuda_events.append(event) + + # Record the cuda event for this layer + event = torch.cuda.Event() + event.record(self._cuda_stream) + self._decoder_cuda_events[layer_id] = event + + #for decode_meta in meta.decode_meta: + # self._connect_request_ids(decode_meta.prefill_req_id, + # decode_meta.req_id) + # if not decode_meta.is_ready: + # continue + + # total_expected_tokens = len(decode_meta.block_ids) * \ + # self._block_size + + # self._inflight_h2d_requests.add(decode_meta.prefill_req_id) + #event = torch.cuda.Event() + #event.record(self._cuda_stream) + #self._decoder_cuda_events[ def wait_for_layer_load(self, layer_name: str) -> None: """ @@ -709,7 +732,8 @@ def wait_for_layer_load(self, layer_name: str) -> None: return layer_id = self._get_layer_id(layer_name) - self._decoder_cuda_events[layer_id].synchronize() + event = self._decoder_cuda_events.pop(layer_id) + event.synchronize() if layer_id == len(self._gpu_kv_caches) - 1: # Free the memory for the whole request @@ -848,11 +872,7 @@ def close(self): This 
prevents overwrites of paged KV buffer before saving done. """ - logger.warning("Closing the CPUConnector") if hasattr(self, "_kv_sender") and self._kv_sender is not None: self._kv_sender.close() if hasattr(self, "_kv_receiver") and self._kv_receiver is not None: self._kv_receiver.close() - - def __del__(self): - self.close() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 30114fb7b752..beb7a4bbc068 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -992,6 +992,10 @@ def __init__(self, buffer_size: int, host: str, port: int) -> None: self._done_receiving_count: defaultdict[str, int] = defaultdict(lambda: 0) + # Already 'ready' request, we don't want to check and return it + # again. + self._already_ready_requests: set[str] = set() + def _check_receive_and_update(self): """Checks the KV cache receiving status and update the internal states @@ -1038,6 +1042,10 @@ def get_finished(self, num_expected_layers: int) -> list[str]: ready_requests = [] self._check_receive_and_update() for p_request_id in self._expected_tokens: + if p_request_id in self._already_ready_requests: + # Already checked and ready, skip it + continue + expected_tokens = self._expected_tokens[p_request_id] assert p_request_id in self._received_tokens # check if all the layers are there @@ -1052,6 +1060,7 @@ def get_finished(self, num_expected_layers: int) -> list[str]: break if ready: ready_requests.append(p_request_id) + self._already_ready_requests.add(p_request_id) if self.world_size == 1: return ready_requests From af03fd5c3b494643cdfeaf23dedf9b3b068a7644 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 28 May 2025 18:23:10 -0700 Subject: [PATCH 21/28] [Add] nsys analysis and add potential optimizations Signed-off-by: ApostaC --- .../cpu_kv_integration/run_nsys.sh | 20 ++++++++++-- .../cpu_kv_integration/toy_decode.py | 2 +- .../cpu_kv_integration/toy_example.py | 2 +- .../kv_connector/v1/cpu_connector.py | 32 ++++++++++--------- .../kv_connector/v1/nixl_cpu_utils.py | 8 +++-- 5 files changed, 43 insertions(+), 21 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh index 3b77790da39a..dae01d303952 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh +++ b/tests/v1/kv_connector/cpu_kv_integration/run_nsys.sh @@ -1,8 +1,24 @@ +#!/bin/bash + +if [[ $1 == "decoder" ]]; then +echo "Running decoder" CUDA_VISIBLE_DEVICES=7 nsys profile \ - --trace=cuda,nvtx,osrt,ucx \ + --trace=cuda,nvtx,osrt \ --gpu-metrics-devices=cuda-visible \ --python-sampling=true \ --trace-fork-before-exec=true \ - --output=prefiller \ + --output=decoder \ --force-overwrite=true \ python3 toy_decode.py + +else +echo "Running prefiller" +CUDA_VISIBLE_DEVICES=6 nsys profile \ + --trace=cuda,nvtx,osrt \ + --gpu-metrics-devices=cuda-visible \ + --python-sampling=true \ + --trace-fork-before-exec=true \ + --output=prefiller \ + --force-overwrite=true \ + python3 toy_example.py +fi diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py index 2b1716f98989..6965f33f5eb4 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_decode.py @@ -56,7 +56,7 @@ def get_kv_transfer_params(req_id: int): max_model_len=2048, max_num_batched_tokens=2048, 
block_size=128, - tensor_parallel_size=2, + tensor_parallel_size=1, ) # 1ST generation (prefill instance) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py index 6862eabe16ed..a0ab6b74c43e 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_example.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_example.py @@ -42,7 +42,7 @@ max_model_len=2048, max_num_batched_tokens=2048, block_size=128, - tensor_parallel_size=2, + tensor_parallel_size=1, ) # 1ST generation (prefill instance) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index dfef3841e40c..b953781f759c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional +import nvtx import torch from lmcache.utils import _lmcache_nvtx_annotate @@ -640,6 +641,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self._kv_page_shape = kv_caches[list(kv_caches.keys())[0]].shape[2:] + @_lmcache_nvtx_annotate def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: """ @@ -678,6 +680,9 @@ def start_load_kv(self, forward_context: "ForwardContext", # Vars needed: # decode_meta.prefill_req_id + if len(ready_decode_metas) == 0: + return + for layer_id in range(len(self._gpu_kv_caches)): for decode_meta, total_expected in zip(ready_decode_metas, total_expected_tokens): @@ -694,28 +699,22 @@ def start_load_kv(self, forward_context: "ForwardContext", block_ids = decode_meta.block_ids with torch.cuda.stream(self._cuda_stream): + rng = nvtx.start_range("h2d_page_copy") h2d_page_copy(src_buffer, dst_layer, block_ids, start, stop, self._block_size) + nvtx.end_range(rng) # Record the cuda event for this layer event = torch.cuda.Event() event.record(self._cuda_stream) self._decoder_cuda_events[layer_id] = event - #for decode_meta in meta.decode_meta: - # self._connect_request_ids(decode_meta.prefill_req_id, - # decode_meta.req_id) - # if not decode_meta.is_ready: - # continue - - # total_expected_tokens = len(decode_meta.block_ids) * \ - # self._block_size - - # self._inflight_h2d_requests.add(decode_meta.prefill_req_id) - #event = torch.cuda.Event() - #event.record(self._cuda_stream) - #self._decoder_cuda_events[ + # TODO (ApostaC): Potential optimizations + # 1. coalesce the h2d page copy to a single call + # 2. 
Don't launch all the layers, but just first 2 layers + # 2.1 launch the rest of the layers during the `wait_for_layer_load` + @_lmcache_nvtx_annotate def wait_for_layer_load(self, layer_name: str) -> None: """ Block until the KV for a specific layer is loaded into vLLM's @@ -732,8 +731,9 @@ def wait_for_layer_load(self, layer_name: str) -> None: return layer_id = self._get_layer_id(layer_name) - event = self._decoder_cuda_events.pop(layer_id) - event.synchronize() + event = self._decoder_cuda_events.pop(layer_id, None) + if event is not None: + event.synchronize() if layer_id == len(self._gpu_kv_caches) - 1: # Free the memory for the whole request @@ -800,9 +800,11 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, self._cuda_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self._cuda_stream): # Copy the data from the GPU to the CPU buffer page by page + rng = nvtx.start_range("d2h_page_copy") d2h_page_copy(src_layer=kv_layer, dst_buffer=buffer, block_ids=prefill_req.blocks_to_save) + nvtx.end_range(rng) # record the cuda stream task.cuda_event = torch.cuda.Event() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index beb7a4bbc068..e268dfcea9c2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -555,6 +555,9 @@ def _nixl_handshake(self, destination_spec: DestinationSpec) -> None: destination_spec) def close(self) -> None: + if not hasattr(self, "_nixl_wrapper"): + return + if self._reg_dlist is not None: self._nixl_wrapper.deregister_memory(self._reg_dlist) for agent in self._remote_agents.values(): @@ -778,8 +781,9 @@ def close(self): "Watermark information before closing: (low: %d, high: %d)", self._allocator.low_watermark, self._allocator.high_watermark) self.stop_handshake_listener() - self._nixl_wrapper.deregister_memory(self._reg_dlist) - del self._nixl_wrapper + if hasattr(self, "_nixl_wrapper"): + self._nixl_wrapper.deregister_memory(self._reg_dlist) + del self._nixl_wrapper @contextlib.contextmanager From 483ed5a12d01210b64fb3072b082c5cbe3b64427 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 30 May 2025 14:18:54 -0700 Subject: [PATCH 22/28] [Add] small fixes for corner cases Signed-off-by: ApostaC --- .../cpu_kv_integration/online_test.sh | 51 +++++ .../cpu_kv_integration/toy_proxy_server.py | 209 ++++++++++++++++++ .../kv_connector/v1/cpu_connector.py | 21 +- 3 files changed, 279 insertions(+), 2 deletions(-) create mode 100644 tests/v1/kv_connector/cpu_kv_integration/online_test.sh create mode 100644 tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py diff --git a/tests/v1/kv_connector/cpu_kv_integration/online_test.sh b/tests/v1/kv_connector/cpu_kv_integration/online_test.sh new file mode 100644 index 000000000000..cf320087fd21 --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/online_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [model]" + exit 1 +fi + +if [[ $# -eq 1 ]]; then + echo "Using default model: meta-llama/Llama-3.1-8B-Instruct" + MODEL="meta-llama/Llama-3.1-8B-Instruct" +else + echo "Using model: $2" + MODEL=$2 +fi + + +if [[ $1 == "prefiller" ]]; then + # Prefiller listens on port 8100 + #UCX_TLS=cuda_ipc,cuda_copy,tcp \ + VLLM_ENABLE_V1_MULTIPROCESSING=1 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + CUDA_VISIBLE_DEVICES=0 \ 
+ vllm serve $MODEL \ + --port 8100 \ + --disable-log-requests \ + --enforce-eager \ + --kv-transfer-config \ + '{"kv_connector":"CPUConnector","kv_role":"kv_producer","kv_connector_extra_config": {"host": "localhost", "port": "54321", "size": 8}}' + + +elif [[ $1 == "decoder" ]]; then + # Decoder listens on port 8200 + #UCX_TLS=cuda_ipc,cuda_copy,tcp \ + VLLM_ENABLE_V1_MULTIPROCESSING=1 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + CUDA_VISIBLE_DEVICES=1 \ + vllm serve $MODEL \ + --port 8200 \ + --disable-log-requests \ + --enforce-eager \ + --kv-transfer-config \ + '{"kv_connector":"CPUConnector","kv_role":"kv_consumer","kv_connector_extra_config": {"host": "localhost", "port": "54321", "size": 8}}' + + +else + echo "Invalid role: $1" + echo "Should be either prefiller, decoder" + exit 1 +fi diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py b/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py new file mode 100644 index 000000000000..690d4f3286ce --- /dev/null +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import time +from contextlib import asynccontextmanager + +import httpx +import numpy as np +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + Lifespan context manager to handle startup and shutdown events. + """ + # Startup: Initialize clients + prefiller_base_url = ( + f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1") + decoder_base_url = ( + f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1") + + app.state.prefill_client = httpx.AsyncClient(timeout=None, + base_url=prefiller_base_url) + app.state.decode_client = httpx.AsyncClient(timeout=None, + base_url=decoder_base_url) + + yield + + # Shutdown: Close clients + await app.state.prefill_client.aclose() + await app.state.decode_client.aclose() + + +# Update FastAPI app initialization to use lifespan +app = FastAPI(lifespan=lifespan) + + +class StatsCalculator: + + def __init__(self): + self._stats = [] + self._last_log_time = time.time() + + def add(self, value): + self._stats.append(value) + if time.time() - self._last_log_time > 5: + self._log_stats() + self._last_log_time = time.time() + + def _log_stats(self): + # Print average, median, and 99th percentile + np_arr = np.array(self._stats) + output_str = ( + f"\nNum requests: {len(self._stats)}" + + "\nPrefill node TTFT stats:" + + f"\n - Average (ms): {np.mean(np_arr)}" + + f"\n - Median (ms): {np.median(np_arr)}" + + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n") + print( + "===============================", + output_str, + "===============================", + ) + + +stats_calculator = StatsCalculator() +counter = 0 + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--prefiller-host", type=str, default="localhost") + parser.add_argument("--prefiller-port", type=int, default=8100) + parser.add_argument("--decoder-host", type=str, default="localhost") + parser.add_argument("--decoder-port", type=int, default=8200) + args = parser.parse_args() + return args + + +# Initialize variables to hold the persistent clients +app.state.prefill_client = None +app.state.decode_client = None + + +async def send_request_to_service(client: 
httpx.AsyncClient, endpoint: str, + req_data: dict): + """ + Send a request to a service using a persistent client. + """ + req_data = req_data.copy() + req_data["max_tokens"] = 1 + if "max_completion_tokens" in req_data: + req_data["max_completion_tokens"] = 1 + + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + response = await client.post(endpoint, json=req_data, headers=headers) + response.raise_for_status() + return response + + +async def stream_service_response(client: httpx.AsyncClient, endpoint: str, + req_data: dict): + """ + Asynchronously stream the response from a service using a persistent client. + """ + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + async with client.stream("POST", endpoint, json=req_data, + headers=headers) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(): + yield chunk + + +@app.post("/v1/completions") +async def handle_completions(request: Request): + global counter, stats_calculator + counter += 1 + + st = time.time() + try: + req_data = await request.json() + + # Send request to prefill service, ignore the response + response = await send_request_to_service(app.state.prefill_client, + "/completions", req_data) + + # Extract the needed fields + response_json = response.json() + kv_transfer_params = response_json.get('kv_transfer_params', {}) + print("Got kv_transfer_params:", kv_transfer_params) + if kv_transfer_params: + req_data["kv_transfer_params"] = kv_transfer_params + + et = time.time() + stats_calculator.add(et - st) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response(app.state.decode_client, + "/completions", + req_data): + yield chunk + + return StreamingResponse(generate_stream(), + media_type="text/event-stream") + + except Exception as e: + import sys + import traceback + + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server", + "- completions endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +@app.post("/v1/chat/completions") +async def handle_chat_completions(request: Request): + global counter, stats_calculator + counter += 1 + + st = time.time() + try: + req_data = await request.json() + + # Send request to prefill service, ignore the response + await send_request_to_service(app.state.prefill_client, + "/chat/completions", req_data) + + et = time.time() + stats_calculator.add(et - st) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response(app.state.decode_client, + "/chat/completions", + req_data): + yield chunk + + return StreamingResponse(generate_stream(), + media_type="text/event-stream") + + except Exception as e: + import sys + import traceback + + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server ", + "- chat completions endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +if __name__ == "__main__": + global global_args + global_args = parse_args() + + import uvicorn + + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index b953781f759c..9bcda857af37 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -544,6 +544,14 @@ def 
get_num_new_matched_tokens( "total_num_tokens is %d", request_id, num_computed_tokens, num_tokens) + if num_tokens < self._block_size: + # If the request is smaller than the block size, we don't need + # to do anything special + logger.info( + "Request %s is smaller than block size %d, " + "no async loading", request_id, self._block_size) + return 0, False + if request.request_id in self._should_be_ready_reqs: self._should_be_ready_reqs.remove(request.request_id) return 0, False @@ -582,6 +590,13 @@ def update_state_after_alloc(self, request: "Request", self._decode_req_metas[request.request_id].is_ready = True return + if request.request_id not in self._decode_req_id_to_prefill_req_id: + # This should not happen, but just in case + logger.warning( + "Request %s does not have prefill request id, " + "skipping decode meta creation", request.request_id) + return + p_req_id = self._decode_req_id_to_prefill_req_id[request.request_id] block_ids = [] for blks in blocks.get_block_ids(): @@ -611,8 +626,10 @@ def request_finished( request: "Request", block_ids: list[int], ) -> tuple[bool, Optional[dict[str, Any]]]: - print("In request_finished") - return False, None + if self.kv_role == "kv_consumer": + return False, None + # For prefiller, send back the prefiller request id + return False, dict(prefill_request_id=request.request_id) ############################################################# # Worker Side Methods From b27c101789fd160a0a1457688b8313a2a3d4fd2d Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 30 May 2025 14:32:36 -0700 Subject: [PATCH 23/28] temp fix for pending request ids Signed-off-by: ApostaC --- vllm/v1/core/sched/scheduler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 5ad05485e8f3..2662f707cf11 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -97,7 +97,8 @@ def __init__( # This is flushed at the end of each scheduling step. 
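Taken together with the toy proxy above, the request_finished change closes the loop for online serving: the prefiller returns its own request id in kv_transfer_params, the proxy copies that dict onto the decode request, and the decoder's connector matches incoming KV by prefill_request_id. A minimal sketch of the hand-off (field values are illustrative, and the response shape is assumed to match what the proxy reads):

    # Body returned by the prefiller for the max_tokens=1 request:
    prefill_response = {
        "choices": [{"text": " ..."}],
        "kv_transfer_params": {"prefill_request_id": "cmpl-1234"},
    }

    # The proxy forwards the same dict unchanged on the decode request:
    decode_request = {
        "prompt": "Hi ...",
        "max_tokens": 64,
        "kv_transfer_params": prefill_response["kv_transfer_params"],
    }
    assert decode_request["kv_transfer_params"]["prefill_request_id"] == "cmpl-1234"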
self.finished_req_ids: set[str] = set() - # P/D: requests in process of recving KV transfers + # P/D: requests in process of sending/recving KV transfers + self.pending_free_req_ids: set[str] = set() self.finished_recving_kv_req_ids: set[str] = set() # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -869,6 +870,7 @@ def _free_request(self, request: Request) -> Optional[dict[str, Any]]: if not delay_free_blocks: self._free_blocks(request) + self.pending_free_req_ids.add(request.request_id) return kv_xfer_params @@ -880,7 +882,8 @@ def _free_blocks(self, request: Request): del self.requests[request.request_id] def get_num_unfinished_requests(self) -> int: - return len(self.waiting) + len(self.running) + return len(self.waiting) + len(self.running) + len( + self.pending_free_req_ids) def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 @@ -997,3 +1000,4 @@ def _update_from_kv_xfer_finished(self, for req_id in (model_runner_output.finished_sending or ()): logger.debug("Finished sending KV transfer for request %s", req_id) self._free_blocks(self.requests[req_id]) + self.pending_free_req_ids.remove(req_id) From 78e6c025445f35b6eb9fbaca6328d70ef5ac0a83 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 30 May 2025 14:48:10 -0700 Subject: [PATCH 24/28] [fix] online problems Signed-off-by: ApostaC --- .../distributed/kv_transfer/kv_connector/v1/cpu_connector.py | 1 + .../kv_transfer/kv_connector/v1/cpu_connector_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 9bcda857af37..439fad1f4a92 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -881,6 +881,7 @@ def get_finished( return None, ret else: + self._kv_sender.progress() return None, None def close(self): diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index 7c8146238c81..d40008eeb9d3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -249,8 +249,9 @@ def progress(self) -> None: # Update after going through all send tasks self.post_progress_hook() - logger.info("KVSender progress: sent %d, freed %d", num_sent, - num_freed) + if num_sent > 0 or num_freed > 0: + logger.debug("KVSender progress: sent %d, freed %d", num_sent, + num_freed) ###################################################### # Abstract methods (to be implemented by subclasses) # From 32dc419fa5e6582414d9e689bf544253ea955877 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 30 May 2025 17:33:03 -0700 Subject: [PATCH 25/28] [Add] passed the initial benchmark test Signed-off-by: ApostaC --- .../cpu_kv_integration/toy_proxy_server.py | 6 ++ .../kv_connector/v1/cpu_connector.py | 60 +++++++++++++++---- .../kv_connector/v1/nixl_cpu_utils.py | 19 +++++- vllm/v1/core/sched/scheduler.py | 6 +- 4 files changed, 74 insertions(+), 17 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py b/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py index 690d4f3286ce..8da7e4ae8cde 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py @@ -95,11 +95,15 @@ async def 
send_request_to_service(client: httpx.AsyncClient, endpoint: str, """ req_data = req_data.copy() req_data["max_tokens"] = 1 + req_data["stream"] = False + if "stream_options" in req_data: + del req_data["stream_options"] if "max_completion_tokens" in req_data: req_data["max_completion_tokens"] = 1 headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} response = await client.post(endpoint, json=req_data, headers=headers) + print("Got the response:", response.json()) response.raise_for_status() return response @@ -125,6 +129,7 @@ async def handle_completions(request: Request): st = time.time() try: req_data = await request.json() + print("Received a new request!") # Send request to prefill service, ignore the response response = await send_request_to_service(app.state.prefill_client, @@ -142,6 +147,7 @@ async def handle_completions(request: Request): # Stream response from decode service async def generate_stream(): + print("Streaming response from decode service") async for chunk in stream_service_response(app.state.decode_client, "/completions", req_data): diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 439fad1f4a92..fab5b799a4e8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -1,4 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import threading +import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional @@ -16,7 +18,7 @@ NixlDecodeManager, NixlPrefillManager, NixlSendTask) from vllm.distributed.parallel_state import get_tensor_model_parallel_rank from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils import cdiv, round_down from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: @@ -432,6 +434,14 @@ def __init__(self, vllm_config: "VllmConfig", # Prefiller side sender if self.kv_role == "kv_producer": self._kv_sender = NixlPrefillManager(self._kv_size) + self._kv_sender_lock = threading.Lock() + self._kv_sender_stop_event = threading.Event() + self._kv_sender_thread = threading.Thread( + target=self._kv_sender_processor, + daemon=True, + ) + self._kv_sender_thread.start() + elif self.kv_role == "kv_consumer": self._kv_receiver = NixlDecodeManager( self._kv_size, @@ -544,7 +554,10 @@ def get_num_new_matched_tokens( "total_num_tokens is %d", request_id, num_computed_tokens, num_tokens) - if num_tokens < self._block_size: + num_extra_tokens = round_down(num_tokens, + self._block_size) - num_computed_tokens + + if num_extra_tokens < self._block_size: # If the request is smaller than the block size, we don't need # to do anything special logger.info( @@ -552,6 +565,8 @@ def get_num_new_matched_tokens( "no async loading", request_id, self._block_size) return 0, False + # Seen this request before, which means it should be ready this time, + # so we don't need to do async loading again if request.request_id in self._should_be_ready_reqs: self._should_be_ready_reqs.remove(request.request_id) return 0, False @@ -570,7 +585,7 @@ def get_num_new_matched_tokens( # the async flag is true (see _update_waiting_for_remote_kv in # scheduler.py). 
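Note on the num_extra_tokens computation above: the async load is rounded down to whole blocks, so only full blocks are fetched from CPU and anything shorter than one block skips async loading. A small worked example, assuming round_down(x, m) behaves as (x // m) * m like the helper in vllm.utils:

def round_down(x: int, m: int) -> int:
    # assumed to mirror vllm.utils.round_down: largest multiple of m <= x
    return (x // m) * m

block_size = 16
num_computed_tokens = 0      # tokens already cached on the decode instance
num_tokens = 1000            # total prompt tokens of the request

num_extra_tokens = round_down(num_tokens, block_size) - num_computed_tokens
assert num_extra_tokens == 992   # 62 full blocks are loaded asynchronously
# the remaining 8 tail tokens are left for the decode instance to compute

# A prompt shorter than one block never triggers async loading:
assert round_down(10, block_size) - num_computed_tokens < block_size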
We need to carefully deal with it when copying # the KV cache at worker side - return num_tokens // self._block_size * self._block_size, True + return num_extra_tokens, True def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks", @@ -629,11 +644,19 @@ def request_finished( if self.kv_role == "kv_consumer": return False, None # For prefiller, send back the prefiller request id + logger.info("Prefill request %s finished", request.request_id) return False, dict(prefill_request_id=request.request_id) ############################################################# # Worker Side Methods ############################################################# + def _kv_sender_processor(self) -> None: + """Process the KV sender tasks in a separate thread.""" + while not self._kv_sender_stop_event.is_set(): + with self._kv_sender_lock: + self._kv_sender.progress() + time.sleep(0.001) # Sleep for a short time to avoid busy waiting + def _get_layer_id(self, layer_name: str) -> int: assert layer_name in self._layer_name_to_id, \ f"Layer {layer_name} not found in layer name to id map" @@ -755,7 +778,10 @@ def wait_for_layer_load(self, layer_name: str) -> None: if layer_id == len(self._gpu_kv_caches) - 1: # Free the memory for the whole request for p_req_id in self._inflight_h2d_requests: - logger.info("Freeing request %s", p_req_id) + logger.info("Freeing request %s, current watermark: [%d, %d]", + p_req_id, + self._kv_receiver._allocator.low_watermark, + self._kv_receiver._allocator.high_watermark) self._kv_receiver.free_request(p_req_id) self._inflight_h2d_requests.clear() @@ -805,10 +831,11 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, ) # Create the send task - task = self._kv_sender.create_send_task( - source_spec=source_spec, - destination_spec=dest_spec, - ) + with self._kv_sender_lock: + task = self._kv_sender.create_send_task( + source_spec=source_spec, + destination_spec=dest_spec, + ) assert isinstance(task, NixlSendTask), \ "Send task is not of type NixlSendTask" @@ -851,7 +878,7 @@ def wait_for_save(self): for task in self._inflight_copy_tasks: if task.cuda_event is not None: task.cuda_event.synchronize() - self._kv_sender.progress() + #self._kv_sender.progress() self._inflight_copy_tasks.clear() def get_finished( @@ -874,14 +901,20 @@ def get_finished( len(self._gpu_kv_caches)) ret = set() for p_req_id in p_ready_reqs: - ret.add(self._prefill_req_id_to_decode_req_id[p_req_id]) + if p_req_id in self._prefill_req_id_to_decode_req_id: + ret.add(self._prefill_req_id_to_decode_req_id[p_req_id]) + else: + # We haven't seen the corresponding decode request + # before. Therefore, we should make the receiver + # to return the request id again in the next + # call to get_finished. + self._kv_receiver.remove_ready_request(p_req_id) if ret: logger.info("Got finished requests: %s", ret) return None, ret else: - self._kv_sender.progress() return None, None def close(self): @@ -893,6 +926,11 @@ def close(self): This prevents overwrites of paged KV buffer before saving done. 
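Note on the producer-side changes above: _kv_sender.progress() is moved off the critical path into a daemon thread guarded by a lock and a stop event (started in __init__, joined in close()). A condensed, standalone sketch of that lifecycle with the connector-specific pieces stubbed out; class and attribute names here are illustrative, not the patch's own:

import threading
import time

class BackgroundSender:
    """Minimal sketch of the producer-side progress loop."""

    def __init__(self, kv_sender) -> None:
        self._kv_sender = kv_sender
        self._lock = threading.Lock()      # serializes progress() vs. task creation
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def _loop(self) -> None:
        while not self._stop_event.is_set():
            with self._lock:
                self._kv_sender.progress()  # fast, non-blocking state update + sends
            time.sleep(0.001)               # avoid busy waiting

    def create_send_task(self, *args, **kwargs):
        # Take the same lock, since progress() walks the task list concurrently.
        with self._lock:
            return self._kv_sender.create_send_task(*args, **kwargs)

    def close(self) -> None:
        self._stop_event.set()
        self._thread.join()
        self._kv_sender.close()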
""" if hasattr(self, "_kv_sender") and self._kv_sender is not None: + self._kv_sender_stop_event.set() + if hasattr(self, "_kv_sender_thread") and \ + self._kv_sender_thread is not None: + self._kv_sender_thread.join() self._kv_sender.close() + if hasattr(self, "_kv_receiver") and self._kv_receiver is not None: self._kv_receiver.close() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index e268dfcea9c2..5eb42a71fd3b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -659,6 +659,8 @@ def _process_allocation_requests(self): # NOTE: an alternative is to try allocation for other requests # and then come back to this one, but this may create # starvation + logger.info("No space available for request %s, skipping", + req_uuid) break # Add the request to the inflight requests @@ -1038,7 +1040,11 @@ def progress(self) -> None: def get_finished(self, num_expected_layers: int) -> list[str]: """Get the prefill node request_ids of the requests that finishes receiving (which means the KV caches of all tokens and all layers - are in CPU memory) + are in CPU memory). + + By default, if a request's id will only be returned once. However, + the caller can call `remove_ready_request` to force the get_finished + to return the request id again in the next call. Returns: list[str]: A list of prefill-side request ids. @@ -1095,6 +1101,15 @@ def get_finished(self, num_expected_layers: int) -> list[str]: self.tp_group.send_object(ready_requests, dst=0) return ready_requests + def remove_ready_request(self, p_request_id: str) -> None: + """Remove the request from the 'ready' request list so that + it will be checked again in the next of get_finished. + + Args: + p_request_id (str): The prefill-side request id. + """ + self._already_ready_requests.discard(p_request_id) + def _create_decoder_kv_spec(self, source_spec: SourceSpec, vaddr: int) -> DecoderKVSpec: """Create a DecoderKVSpec from the source spec and the virtual address. 
@@ -1155,5 +1170,7 @@ def free_request(self, p_request_id): logger.warning("Request %s not found in received tokens", p_request_id) + self.remove_ready_request(p_request_id) + def close(self): self._nixl_receiver.close() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2662f707cf11..d740ab835497 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -98,7 +98,6 @@ def __init__( self.finished_req_ids: set[str] = set() # P/D: requests in process of sending/recving KV transfers - self.pending_free_req_ids: set[str] = set() self.finished_recving_kv_req_ids: set[str] = set() # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -870,7 +869,6 @@ def _free_request(self, request: Request) -> Optional[dict[str, Any]]: if not delay_free_blocks: self._free_blocks(request) - self.pending_free_req_ids.add(request.request_id) return kv_xfer_params @@ -882,8 +880,7 @@ def _free_blocks(self, request: Request): del self.requests[request.request_id] def get_num_unfinished_requests(self) -> int: - return len(self.waiting) + len(self.running) + len( - self.pending_free_req_ids) + return len(self.waiting) + len(self.running) def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 @@ -1000,4 +997,3 @@ def _update_from_kv_xfer_finished(self, for req_id in (model_runner_output.finished_sending or ()): logger.debug("Finished sending KV transfer for request %s", req_id) self._free_blocks(self.requests[req_id]) - self.pending_free_req_ids.remove(req_id) From 9683b480ee44e045b24125179b88b8babf72cbb7 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Jun 2025 16:13:55 -0700 Subject: [PATCH 26/28] Address the review comments Signed-off-by: ApostaC --- .../kv_connector/v1/cpu_connector.py | 122 ++++++++---------- .../kv_connector/v1/cpu_connector_utils.py | 2 - .../kv_connector/v1/nixl_cpu_utils.py | 2 - vllm/v1/core/sched/scheduler.py | 2 +- 4 files changed, 55 insertions(+), 73 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index fab5b799a4e8..51a4478ec6b0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -4,9 +4,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional -import nvtx import torch -from lmcache.utils import _lmcache_nvtx_annotate from vllm import _custom_ops as ops from vllm.config import VllmConfig @@ -52,10 +50,11 @@ def d2h_page_copy(src_layer: torch.Tensor, dst_buffer: torch.Tensor, ops.swap_blocks(src_layer[1], dst_buffer[1], block_mapping) -def h2d_copy_leading_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, - src_block_id: int, dst_block_id: int, - end_position_in_block: int) -> None: - """Copy the leading tokens in 1 block from host buffer to device layer. +def h2d_copy_part_block(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + src_block_id: int, dst_block_id: int, + start_position_in_block: int, + end_position_in_block: Optional[int]) -> None: + """Copy the part of a block from host buffer to device layer. Args: src_buffer (torch.Tensor): The source buffer on host, shape is @@ -64,20 +63,29 @@ def h2d_copy_leading_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, (2, num_vllm_blocks, page_size, ...remaining dims...) src_block_id (int): The source block id to copy. dst_block_id (int): The destination block id to copy. 
+ start_position_in_block (int): The start position in the block to copy. end_position_in_block (int): The end position in the block to copy. """ - dst_k = dst_layer[0][dst_block_id][:end_position_in_block] - src_k = src_buffer[0][src_block_id][:end_position_in_block] - dst_v = dst_layer[1][dst_block_id][:end_position_in_block] - src_v = src_buffer[1][src_block_id][:end_position_in_block] + if end_position_in_block is None: + # If end_position_in_block is None, copy until the end of the block + end_position_in_block = src_buffer[0][0].shape[0] + + dst_k = dst_layer[0][dst_block_id][ + start_position_in_block:end_position_in_block] + src_k = src_buffer[0][src_block_id][ + start_position_in_block:end_position_in_block] + dst_v = dst_layer[1][dst_block_id][ + start_position_in_block:end_position_in_block] + src_v = src_buffer[1][src_block_id][ + start_position_in_block:end_position_in_block] dst_k.copy_(src_k, non_blocking=True) dst_v.copy_(src_v, non_blocking=True) -def h2d_copy_trailing_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, - src_block_id: int, dst_block_id: int, - start_position_in_block: int) -> None: - """Copy the trailing tokens in 1 block from host buffer to device layer. +def h2d_copy_leading_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + src_block_id: int, dst_block_id: int, + end_position_in_block: int) -> None: + """Copy the leading tokens in 1 block from host buffer to device layer. Args: src_buffer (torch.Tensor): The source buffer on host, shape is @@ -86,21 +94,16 @@ def h2d_copy_trailing_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, (2, num_vllm_blocks, page_size, ...remaining dims...) src_block_id (int): The source block id to copy. dst_block_id (int): The destination block id to copy. - start_position_in_block (int): The start position in the block to copy. + end_position_in_block (int): The end position in the block to copy. """ - dst_k = dst_layer[0][dst_block_id][start_position_in_block:] - src_k = src_buffer[0][src_block_id][start_position_in_block:] - dst_v = dst_layer[1][dst_block_id][start_position_in_block:] - src_v = src_buffer[1][src_block_id][start_position_in_block:] - dst_k.copy_(src_k, non_blocking=True) - dst_v.copy_(src_v, non_blocking=True) + h2d_copy_part_block(src_buffer, dst_layer, src_block_id, dst_block_id, 0, + end_position_in_block) -def h2d_copy_part_block(src_buffer: torch.Tensor, dst_layer: torch.Tensor, - src_block_id: int, dst_block_id: int, - start_position_in_block: int, - end_position_in_block: int) -> None: - """Copy the part of a block from host buffer to device layer. +def h2d_copy_trailing_tokens(src_buffer: torch.Tensor, dst_layer: torch.Tensor, + src_block_id: int, dst_block_id: int, + start_position_in_block: int) -> None: + """Copy the trailing tokens in 1 block from host buffer to device layer. Args: src_buffer (torch.Tensor): The source buffer on host, shape is @@ -110,18 +113,9 @@ def h2d_copy_part_block(src_buffer: torch.Tensor, dst_layer: torch.Tensor, src_block_id (int): The source block id to copy. dst_block_id (int): The destination block id to copy. start_position_in_block (int): The start position in the block to copy. - end_position_in_block (int): The end position in the block to copy. 
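Note on this refactor: h2d_copy_leading_tokens (and the trailing-token wrapper that follows) become thin wrappers over h2d_copy_part_block, which slices [start:end] on both the K and V pages and treats an end of None as copy-to-end-of-page. A shape-only sketch of that equivalence, using dummy tensors rather than real KV pages:

import torch

page_size, head_dim = 16, 8
# (2, num_blocks, page_size, head_dim): index 0 holds K pages, index 1 holds V pages
src = torch.randn(2, 4, page_size, head_dim)   # host buffer
dst = torch.zeros(2, 10, page_size, head_dim)  # "device" layer (CPU here for the sketch)

def copy_part(src_buf, dst_layer, src_blk, dst_blk, start, end):
    if end is None:
        end = src_buf[0][0].shape[0]            # copy until the end of the page
    for kv in (0, 1):
        dst_layer[kv][dst_blk][start:end].copy_(src_buf[kv][src_blk][start:end])

# leading tokens == part-block copy with start == 0
copy_part(src, dst, src_blk=1, dst_blk=3, start=0, end=5)
# trailing tokens == part-block copy with an open end
copy_part(src, dst, src_blk=2, dst_blk=7, start=11, end=None)

assert torch.equal(dst[0][3][:5], src[0][1][:5])
assert torch.equal(dst[1][7][11:], src[1][2][11:])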
""" - dst_k = dst_layer[0][dst_block_id][ - start_position_in_block:end_position_in_block] - src_k = src_buffer[0][src_block_id][ - start_position_in_block:end_position_in_block] - dst_v = dst_layer[1][dst_block_id][ - start_position_in_block:end_position_in_block] - src_v = src_buffer[1][src_block_id][ - start_position_in_block:end_position_in_block] - dst_k.copy_(src_k, non_blocking=True) - dst_v.copy_(src_v, non_blocking=True) + h2d_copy_part_block(src_buffer, dst_layer, src_block_id, dst_block_id, + start_position_in_block, None) def h2d_page_copy(src_buffer: torch.Tensor, dst_layer: torch.Tensor, @@ -429,7 +423,6 @@ def __init__(self, vllm_config: "VllmConfig", if role == KVConnectorRole.SCHEDULER: self._should_be_ready_reqs: set[str] = set() - pass elif role == KVConnectorRole.WORKER: # Prefiller side sender if self.kv_role == "kv_producer": @@ -532,7 +525,7 @@ def build_decode_meta(self, scheduler_output: SchedulerOutput, updated_decode_req_metas[req_meta.req_id] = req_meta # NOTE (ApostaC): Even if the request is not ready, we still # want the worker connector to know about it, so that it can - # connector the decode request id to the prefill request id + # connect the decode request id to the prefill request id output_meta.add_decode(req_meta) self._decode_req_metas = updated_decode_req_metas @@ -681,7 +674,6 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self._kv_page_shape = kv_caches[list(kv_caches.keys())[0]].shape[2:] - @_lmcache_nvtx_annotate def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: """ @@ -739,10 +731,8 @@ def start_load_kv(self, forward_context: "ForwardContext", block_ids = decode_meta.block_ids with torch.cuda.stream(self._cuda_stream): - rng = nvtx.start_range("h2d_page_copy") h2d_page_copy(src_buffer, dst_layer, block_ids, start, stop, self._block_size) - nvtx.end_range(rng) # Record the cuda event for this layer event = torch.cuda.Event() @@ -754,7 +744,6 @@ def start_load_kv(self, forward_context: "ForwardContext", # 2. Don't launch all the layers, but just first 2 layers # 2.1 launch the rest of the layers during the `wait_for_layer_load` - @_lmcache_nvtx_annotate def wait_for_layer_load(self, layer_name: str) -> None: """ Block until the KV for a specific layer is loaded into vLLM's @@ -785,7 +774,6 @@ def wait_for_layer_load(self, layer_name: str) -> None: self._kv_receiver.free_request(p_req_id) self._inflight_h2d_requests.clear() - @_lmcache_nvtx_annotate def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata: "AttentionMetadata", **kwargs) -> None: """ @@ -844,11 +832,9 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, self._cuda_stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self._cuda_stream): # Copy the data from the GPU to the CPU buffer page by page - rng = nvtx.start_range("d2h_page_copy") d2h_page_copy(src_layer=kv_layer, dst_buffer=buffer, block_ids=prefill_req.blocks_to_save) - nvtx.end_range(rng) # record the cuda stream task.cuda_event = torch.cuda.Event() @@ -861,7 +847,6 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, # 2. use a single cuda event instead of a list of cuda events # 3. use a cuda event pool to prevent the creation overhead - @_lmcache_nvtx_annotate def wait_for_save(self): """ Block until all the save operations is done. This is called @@ -894,29 +879,30 @@ def get_finished( The finished saves/sends req ids must belong to a set provided in a call to this method (this call or a prior one). 
""" - if self.kv_role == "kv_consumer": - # decoder side - self._kv_receiver.progress() - p_ready_reqs = self._kv_receiver.get_finished( - len(self._gpu_kv_caches)) - ret = set() - for p_req_id in p_ready_reqs: - if p_req_id in self._prefill_req_id_to_decode_req_id: - ret.add(self._prefill_req_id_to_decode_req_id[p_req_id]) - else: - # We haven't seen the corresponding decode request - # before. Therefore, we should make the receiver - # to return the request id again in the next - # call to get_finished. - self._kv_receiver.remove_ready_request(p_req_id) - - if ret: - logger.info("Got finished requests: %s", ret) - - return None, ret - else: + if self.kv_role != "kv_consumer": return None, None + # decoder (kv_consumer) side + self._kv_receiver.progress() + p_ready_reqs = self._kv_receiver.get_finished(len(self._gpu_kv_caches)) + ret = set() + for p_req_id in p_ready_reqs: + if d_req_id := self._decode_req_id_to_prefill_req_id.get(p_req_id): + # We have seen the corresponding decode request before. + # Therefore, we can return the request id. + ret.add(d_req_id) + else: + # We haven't seen the corresponding decode request + # before. Therefore, we should make the receiver + # to return the request id again in the next + # call to get_finished. + self._kv_receiver.remove_ready_request(p_req_id) + + if ret: + logger.info("Got finished requests: %s", ret) + + return None, ret + def close(self): """ Block until all the transfers are done. This is called diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py index d40008eeb9d3..5147c710ac0d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector_utils.py @@ -6,7 +6,6 @@ import msgspec import torch -from lmcache.utils import _lmcache_nvtx_annotate from vllm.logger import init_logger @@ -215,7 +214,6 @@ def get_send_tasks(self) -> list[SendTask]: """ return self._send_tasks - @_lmcache_nvtx_annotate def progress(self) -> None: """A fast, non-blocking function to check and update the states of all send tasks. This function should be called periodically to ensure that diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py index 5eb42a71fd3b..dab1810fc765 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_cpu_utils.py @@ -11,7 +11,6 @@ import msgspec import torch import zmq -from lmcache.utils import _lmcache_nvtx_annotate from vllm.distributed.kv_transfer.kv_connector.v1.cpu_connector_utils import ( DecoderKVSpec, DestinationSpec, KVSenderInterface, SendTask, SendTaskState, @@ -831,7 +830,6 @@ class NixlSendTask(SendTask): def __post_init__(self) -> None: self.creation_time = time.time() - @_lmcache_nvtx_annotate def update_states(self) -> None: """Update the states of the send task. """ diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d740ab835497..806b5f09ce16 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -97,7 +97,7 @@ def __init__( # This is flushed at the end of each scheduling step. 
self.finished_req_ids: set[str] = set() - # P/D: requests in process of sending/recving KV transfers + # P/D: requests in process of loading/recving KV transfers self.finished_recving_kv_req_ids: set[str] = set() # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating From 3720cf8ae85d004ae08358d627069c384dedb6c1 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 23 Jun 2025 21:58:21 -0700 Subject: [PATCH 27/28] [fix] crash problem Signed-off-by: ApostaC --- .../cpu_kv_integration/online_test.sh | 4 ++-- .../cpu_kv_integration/toy_proxy_server.py | 4 ++-- .../kv_transfer/kv_connector/v1/cpu_connector.py | 16 +++++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/v1/kv_connector/cpu_kv_integration/online_test.sh b/tests/v1/kv_connector/cpu_kv_integration/online_test.sh index cf320087fd21..c1eee5cfc69f 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/online_test.sh +++ b/tests/v1/kv_connector/cpu_kv_integration/online_test.sh @@ -27,7 +27,7 @@ if [[ $1 == "prefiller" ]]; then --disable-log-requests \ --enforce-eager \ --kv-transfer-config \ - '{"kv_connector":"CPUConnector","kv_role":"kv_producer","kv_connector_extra_config": {"host": "localhost", "port": "54321", "size": 8}}' + '{"kv_connector":"CPUConnector","kv_role":"kv_producer","kv_connector_extra_config": {"host": "localhost", "port": "54321", "size": 40}}' elif [[ $1 == "decoder" ]]; then @@ -41,7 +41,7 @@ elif [[ $1 == "decoder" ]]; then --disable-log-requests \ --enforce-eager \ --kv-transfer-config \ - '{"kv_connector":"CPUConnector","kv_role":"kv_consumer","kv_connector_extra_config": {"host": "localhost", "port": "54321", "size": 8}}' + '{"kv_connector":"CPUConnector","kv_role":"kv_consumer","kv_connector_extra_config": {"host": "localhost", "port": "54321", "size": 40}}' else diff --git a/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py b/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py index 8da7e4ae8cde..636ed81dd6c8 100644 --- a/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/cpu_kv_integration/toy_proxy_server.py @@ -154,7 +154,7 @@ async def generate_stream(): yield chunk return StreamingResponse(generate_stream(), - media_type="text/event-stream") + media_type="application/json") except Exception as e: import sys @@ -192,7 +192,7 @@ async def generate_stream(): yield chunk return StreamingResponse(generate_stream(), - media_type="text/event-stream") + media_type="application/json") except Exception as e: import sys diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index 51a4478ec6b0..c8b8220fda9b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -143,6 +143,8 @@ def h2d_page_copy(src_buffer: torch.Tensor, dst_layer: torch.Tensor, src_block_ids = torch.arange(start_block_id, end_block_id, dtype=torch.long) + num_blocks = len(src_block_ids) + if separate_first_block: src_block_ids = src_block_ids[1:] # NOTE: we don't need to add the last block id here, because the @@ -156,13 +158,17 @@ def h2d_page_copy(src_buffer: torch.Tensor, dst_layer: torch.Tensor, vllm_block_ids = torch.tensor(block_ids, dtype=torch.long) dst_block_ids = vllm_block_ids[src_block_ids] + real_src_block_ids = src_block_ids - start_block_id # Step 2: copy the first and last block separately if needed if start_block_id == end_block_id: # Only one block to copy 
start_position_in_block = start_token_idx % block_size end_position_in_block = stop_token_idx % block_size - h2d_copy_part_block(src_buffer, dst_layer, start_block_id, + #h2d_copy_part_block(src_buffer, dst_layer, start_block_id, + # vllm_block_ids[start_block_id], + # start_position_in_block, end_position_in_block) + h2d_copy_part_block(src_buffer, dst_layer, 0, vllm_block_ids[start_block_id], start_position_in_block, end_position_in_block) return @@ -171,18 +177,18 @@ def h2d_page_copy(src_buffer: torch.Tensor, dst_layer: torch.Tensor, first_block_id_src = start_block_id first_block_id_dst = vllm_block_ids[first_block_id_src] start_token_idx_in_block = start_token_idx % block_size - h2d_copy_trailing_tokens(src_buffer, dst_layer, first_block_id_src, - first_block_id_dst, start_token_idx_in_block) + h2d_copy_trailing_tokens(src_buffer, dst_layer, 0, first_block_id_dst, + start_token_idx_in_block) if separate_last_block: last_block_id_src = end_block_id last_block_id_dst = vllm_block_ids[last_block_id_src] stop_token_idx_in_block = stop_token_idx % block_size - h2d_copy_leading_tokens(src_buffer, dst_layer, last_block_id_src, + h2d_copy_leading_tokens(src_buffer, dst_layer, num_blocks - 1, last_block_id_dst, stop_token_idx_in_block) # Step 3: copy the middle blocks - block_mapping = torch.stack([src_block_ids, dst_block_ids], dim=1) + block_mapping = torch.stack([real_src_block_ids, dst_block_ids], dim=1) ops.swap_blocks(src_buffer[0], dst_layer[0], block_mapping) ops.swap_blocks(src_buffer[1], dst_layer[1], block_mapping) From 7d40a1d5857694af07ec4ea5b8513dc889b00198 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Wed, 25 Jun 2025 20:54:05 -0700 Subject: [PATCH 28/28] fix the hang problem Signed-off-by: ApostaC --- vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py index c8b8220fda9b..c6a1f0738056 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/cpu_connector.py @@ -893,7 +893,7 @@ def get_finished( p_ready_reqs = self._kv_receiver.get_finished(len(self._gpu_kv_caches)) ret = set() for p_req_id in p_ready_reqs: - if d_req_id := self._decode_req_id_to_prefill_req_id.get(p_req_id): + if d_req_id := self._prefill_req_id_to_decode_req_id.get(p_req_id): # We have seen the corresponding decode request before. # Therefore, we can return the request id. ret.add(d_req_id)
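Note on the crash fix in PATCH 27 above: it separates the two index spaces in h2d_page_copy. Destination blocks are looked up through the per-request block_ids list using token-space block indices, while the host buffer only holds this request's pages and therefore has to be indexed from zero, hence real_src_block_ids = src_block_ids - start_block_id. A small numeric illustration of the re-indexing (the boundary handling for partially filled first and last pages is omitted here):

import torch

start_block_id, end_block_id = 2, 7          # token-space block range of this transfer
block_ids = [100, 101, 102, 103, 104, 105, 106, 107]   # per-request vLLM block ids

src_block_ids = torch.arange(start_block_id, end_block_id)   # tensor([2, 3, 4, 5, 6])
vllm_block_ids = torch.tensor(block_ids)
dst_block_ids = vllm_block_ids[src_block_ids]                # tensor([102, ..., 106])

# The host buffer stores only this request's pages, so buffer page 0
# corresponds to token-space block start_block_id:
real_src_block_ids = src_block_ids - start_block_id          # tensor([0, 1, 2, 3, 4])

block_mapping = torch.stack([real_src_block_ids, dst_block_ids], dim=1)
# Each row pairs (buffer page, destination vLLM block) for swap_blocks.
assert block_mapping[0].tolist() == [0, 102]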