diff --git a/docs/source/index.rst b/docs/source/index.rst index b7c0d5b8800..f5d8627596a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,6 +81,7 @@ Documentation serving/env_vars serving/usage_stats serving/integrations + serving/tensorizer .. toctree:: :maxdepth: 1 diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst new file mode 100644 index 00000000000..a44696507fb --- /dev/null +++ b/docs/source/serving/tensorizer.rst @@ -0,0 +1,12 @@ +.. _tensorizer: +
Loading Models with CoreWeave's Tensorizer +========================================== +vLLM supports loading models with `CoreWeave's Tensorizer <https://github.com/coreweave/tensorizer>`_. +vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and reduced CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +`CoreWeave's Tensorizer documentation <https://github.com/coreweave/tensorizer>`_. For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see +the `vLLM example script <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_. \ No newline at end of file diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 227de5475b9..ba53b5c86fa 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -230,7 +230,7 @@ def add_cli_args( '* "dummy" will initialize the weights with random values, ' 'which is mainly for profiling.\n' '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' + 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n')