10
10
import importlib .util
11
11
import inspect
12
12
import ipaddress
13
+ import multiprocessing
13
14
import os
14
15
import re
15
16
import resource
20
21
import tempfile
21
22
import threading
22
23
import time
24
+ import traceback
23
25
import uuid
24
26
import warnings
25
27
import weakref
29
31
from dataclasses import dataclass , field
30
32
from functools import lru_cache , partial , wraps
31
33
from typing import (TYPE_CHECKING , Any , AsyncGenerator , Awaitable , Callable ,
32
- Dict , Generator , Generic , List , Literal , NamedTuple ,
33
- Optional , Tuple , Type , TypeVar , Union , overload )
34
+ Dict , Generator , Generic , Iterator , List , Literal ,
35
+ NamedTuple , Optional , Tuple , Type , TypeVar , Union ,
36
+ overload )
34
37
from uuid import uuid4
35
38
36
39
import numpy as np
39
42
import torch
40
43
import torch .types
41
44
import yaml
45
+ import zmq
46
+ import zmq .asyncio
42
47
from packaging .version import Version
43
48
from torch .library import Library
44
49
from typing_extensions import ParamSpec , TypeIs , assert_never
@@ -1844,7 +1849,7 @@ def memory_profiling(
1844
1849
result .non_kv_cache_memory_in_bytes = result .non_torch_increase_in_bytes + result .torch_peak_increase_in_bytes + result .weights_memory_in_bytes # noqa
1845
1850
1846
1851
1847
- # Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840 /python/sglang/srt/utils.py#L630 # noqa: E501Curre
1852
+ # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1 /python/sglang/srt/utils.py#L630 # noqa: E501
1848
1853
def set_ulimit (target_soft_limit = 65535 ):
1849
1854
resource_type = resource .RLIMIT_NOFILE
1850
1855
current_soft , current_hard = resource .getrlimit (resource_type )
@@ -1859,3 +1864,82 @@ def set_ulimit(target_soft_limit=65535):
1859
1864
"with error %s. This can cause fd limit errors like"
1860
1865
"`OSError: [Errno 24] Too many open files`. Consider "
1861
1866
"increasing with ulimit -n" , current_soft , e )
1867
+
1868
+
1869
+ # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501
1870
+ def get_exception_traceback ():
1871
+ etype , value , tb = sys .exc_info ()
1872
+ err_str = "" .join (traceback .format_exception (etype , value , tb ))
1873
+ return err_str
1874
+
1875
+
1876
+ # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
1877
+ def make_zmq_socket (
1878
+ ctx : Union [zmq .asyncio .Context , zmq .Context ], # type: ignore[name-defined]
1879
+ path : str ,
1880
+ type : Any ,
1881
+ ) -> Union [zmq .Socket , zmq .asyncio .Socket ]: # type: ignore[name-defined]
1882
+ """Make a ZMQ socket with the proper bind/connect semantics."""
1883
+
1884
+ mem = psutil .virtual_memory ()
1885
+ socket = ctx .socket (type )
1886
+
1887
+ # Calculate buffer size based on system memory
1888
+ total_mem = mem .total / 1024 ** 3
1889
+ available_mem = mem .available / 1024 ** 3
1890
+ # For systems with substantial memory (>32GB total, >16GB available):
1891
+ # - Set a large 0.5GB buffer to improve throughput
1892
+ # For systems with less memory:
1893
+ # - Use system default (-1) to avoid excessive memory consumption
1894
+ if total_mem > 32 and available_mem > 16 :
1895
+ buf_size = int (0.5 * 1024 ** 3 ) # 0.5GB in bytes
1896
+ else :
1897
+ buf_size = - 1 # Use system default buffer size
1898
+
1899
+ if type == zmq .constants .PULL :
1900
+ socket .setsockopt (zmq .constants .RCVHWM , 0 )
1901
+ socket .setsockopt (zmq .constants .RCVBUF , buf_size )
1902
+ socket .connect (path )
1903
+ elif type == zmq .constants .PUSH :
1904
+ socket .setsockopt (zmq .constants .SNDHWM , 0 )
1905
+ socket .setsockopt (zmq .constants .SNDBUF , buf_size )
1906
+ socket .bind (path )
1907
+ else :
1908
+ raise ValueError (f"Unknown Socket Type: { type } " )
1909
+
1910
+ return socket
1911
+
1912
+
1913
+ @contextlib .contextmanager
1914
+ def zmq_socket_ctx (
1915
+ path : str ,
1916
+ type : Any ) -> Iterator [zmq .Socket ]: # type: ignore[name-defined]
1917
+ """Context manager for a ZMQ socket"""
1918
+
1919
+ ctx = zmq .Context (io_threads = 2 ) # type: ignore[attr-defined]
1920
+ try :
1921
+ yield make_zmq_socket (ctx , path , type )
1922
+
1923
+ except KeyboardInterrupt :
1924
+ logger .debug ("Got Keyboard Interrupt." )
1925
+
1926
+ finally :
1927
+ ctx .destroy (linger = 0 )
1928
+
1929
+
1930
+ def _check_multiproc_method ():
1931
+ if (cuda_is_initialized ()
1932
+ and os .environ .get ("VLLM_WORKER_MULTIPROC_METHOD" ) != "spawn" ):
1933
+ logger .warning ("CUDA was previously initialized. We must use "
1934
+ "the `spawn` multiprocessing start method. Setting "
1935
+ "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
1936
+ "See https://docs.vllm.ai/en/latest/getting_started/"
1937
+ "debugging.html#python-multiprocessing "
1938
+ "for more information." )
1939
+ os .environ ["VLLM_WORKER_MULTIPROC_METHOD" ] = "spawn"
1940
+
1941
+
1942
+ def get_mp_context ():
1943
+ _check_multiproc_method ()
1944
+ mp_method = envs .VLLM_WORKER_MULTIPROC_METHOD
1945
+ return multiprocessing .get_context (mp_method )
0 commit comments