NOTE(review): line breaks and leading whitespace below were reconstructed from
a whitespace-collapsed copy of this patch; confirm context-line indentation
against vllm/multimodal/utils.py before applying. The post-image blob id on the
`index` line predates the docstring additions made in the last hunk.
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 22e696141b84..6a2f7d159a69 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -43,6 +43,15 @@ def __init__(
         *,
         allowed_local_media_path: str = "",
     ) -> None:
+        """
+        Args:
+            media_io_kwargs: Additional args passed to process media
+                             inputs, keyed by modalities. For example,
+                             to set num_frames for video, set
+                             `--media-io-kwargs '{"video": {"num_frames": 40} }'`
+            connection: HTTP connection client to download media contents.
+            allowed_local_media_path: A local directory to load media files from.
+        """
         super().__init__()
 
         self.media_io_kwargs: dict[str, dict[
@@ -277,15 +286,6 @@ def fetch_image_embedding(
         return image_embedding_io.load_base64("", data)
 
 
-global_media_connector = MediaConnector()
-"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector]
-instance used by vLLM."""
-
-fetch_audio = global_media_connector.fetch_audio
-fetch_image = global_media_connector.fetch_image
-fetch_video = global_media_connector.fetch_video
-
-
 def encode_audio_base64(
     audio: np.ndarray,
     sampling_rate: float,
@@ -441,3 +441,69 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor,
                                                          dim=0)
     vision_embeddings = vision_embeddings[:num_chunks, ...]
     return vision_embeddings
+
+
+def fetch_audio(
+    audio_url: str,
+    audio_io_kwargs: Optional[dict[str, Any]] = None,
+) -> tuple[np.ndarray, Union[int, float]]:
+    """
+    Fetch audio from `audio_url` via a `MediaConnector` created per call.
+
+    Args:
+        audio_url: URL of the audio file to fetch.
+        audio_io_kwargs: Additional kwargs passed to handle audio IO.
+            If None or empty, the connector gets `media_io_kwargs=None`.
+
+    Returns:
+        The value returned by `MediaConnector.fetch_audio`.
+    """
+    media_io_kwargs = None if not audio_io_kwargs else {
+        "audio": audio_io_kwargs
+    }
+    media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
+    return media_connector.fetch_audio(audio_url)
+
+
+def fetch_image(
+    image_url: str,
+    image_io_kwargs: Optional[dict[str, Any]] = None,
+) -> Image.Image:
+    """
+    Fetch an image from `image_url` via a `MediaConnector` created per call.
+
+    Args:
+        image_url: URL of the image file to fetch.
+        image_io_kwargs: Additional kwargs passed to handle image IO.
+            If None or empty, the connector gets `media_io_kwargs=None`.
+
+    Returns:
+        The value returned by `MediaConnector.fetch_image`.
+    """
+    media_io_kwargs = None if not image_io_kwargs else {
+        "image": image_io_kwargs
+    }
+    media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
+    return media_connector.fetch_image(image_url)
+
+
+def fetch_video(
+    video_url: str,
+    video_io_kwargs: Optional[dict[str, Any]] = None,
+) -> tuple[npt.NDArray, dict[str, Any]]:
+    """
+    Fetch a video from `video_url` via a `MediaConnector` created per call.
+
+    Args:
+        video_url: URL of the video file to fetch.
+        video_io_kwargs: Additional kwargs passed to handle video IO.
+            If None or empty, the connector gets `media_io_kwargs=None`.
+
+    Returns:
+        The value returned by `MediaConnector.fetch_video`.
+    """
+    media_io_kwargs = None if not video_io_kwargs else {
+        "video": video_io_kwargs
+    }
+    media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
+    return media_connector.fetch_video(video_url)