Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 57 additions & 9 deletions vllm/multimodal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@
*,
allowed_local_media_path: str = "",
) -> None:
"""
Args:
media_io_kwargs: Additional args passed to process media
inputs, keyed by modalities. For example,
to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`

Check failure on line 51 in vllm/multimodal/utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/multimodal/utils.py:51:81: E501 Line too long (81 > 80)
connection: HTTP connection client to download media contents.
allowed_local_media_path: A local directory to load media files from.

Check failure on line 53 in vllm/multimodal/utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/multimodal/utils.py:53:81: E501 Line too long (81 > 80)
"""
super().__init__()

self.media_io_kwargs: dict[str, dict[
Expand Down Expand Up @@ -277,15 +286,6 @@
return image_embedding_io.load_base64("", data)


# Shared module-level connector, constructed with no arguments so every
# MediaConnector parameter takes its default value.
# NOTE(review): this page also shows `def fetch_audio/fetch_image/fetch_video`
# further down; per the hunk header these aliases look like the removed side
# of the diff -- confirm which version is live before relying on them.
global_media_connector = MediaConnector()
"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector]
instance used by vLLM."""

# Convenience aliases: the bound fetch methods of the shared connector,
# exposed as plain module-level callables.
fetch_audio = global_media_connector.fetch_audio
fetch_image = global_media_connector.fetch_image
fetch_video = global_media_connector.fetch_video


def encode_audio_base64(
audio: np.ndarray,
sampling_rate: float,
Expand Down Expand Up @@ -441,3 +441,51 @@
dim=0)
vision_embeddings = vision_embeddings[:num_chunks, ...]
return vision_embeddings


def fetch_audio(
    audio_url: str,
    audio_io_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[np.ndarray, Union[int, float]]:
    """Load audio from a URL through a freshly constructed `MediaConnector`.

    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.
            When omitted or empty, no per-modality kwargs are forwarded.

    Returns:
        The result of `MediaConnector.fetch_audio` for `audio_url`.
    """
    # A missing or empty dict means "no overrides": the connector receives
    # None rather than a mapping that carries an empty "audio" entry.
    if audio_io_kwargs:
        per_modality = {"audio": audio_io_kwargs}
    else:
        per_modality = None

    return MediaConnector(media_io_kwargs=per_modality).fetch_audio(audio_url)


def fetch_image(
    image_url: str,
    image_io_kwargs: Optional[dict[str, Any]] = None,
) -> Image.Image:
    """Load an image from a URL through a freshly constructed `MediaConnector`.

    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.
            When omitted or empty, no per-modality kwargs are forwarded.

    Returns:
        The result of `MediaConnector.fetch_image` for `image_url`.
    """
    # A missing or empty dict means "no overrides": the connector receives
    # None rather than a mapping that carries an empty "image" entry.
    if image_io_kwargs:
        per_modality = {"image": image_io_kwargs}
    else:
        per_modality = None

    return MediaConnector(media_io_kwargs=per_modality).fetch_image(image_url)


def fetch_video(
    video_url: str,
    video_io_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Load a video from a URL through a freshly constructed `MediaConnector`.

    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.
            When omitted or empty, no per-modality kwargs are forwarded.

    Returns:
        The result of `MediaConnector.fetch_video` for `video_url`.
    """
    # A missing or empty dict means "no overrides": the connector receives
    # None rather than a mapping that carries an empty "video" entry.
    if video_io_kwargs:
        per_modality = {"video": video_io_kwargs}
    else:
        per_modality = None

    return MediaConnector(media_io_kwargs=per_modality).fetch_video(video_url)
Loading