Skip to content

Commit 1ca44ba

Browse files
committed
Merge remote-tracking branch 'upstream/main' into private/kzawora/upstream_mp_executor
2 parents 0933513 + fe2e10c commit 1ca44ba

File tree

98 files changed

+4140
-1978
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

98 files changed

+4140
-1978
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,19 @@ steps:
3939
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
4040
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
4141
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
42+
43+
- label: "Build and publish TPU release image"
44+
depends_on: ~
45+
if: build.env("NIGHTLY") == "1"
46+
agents:
47+
queue: tpu_queue_postmerge
48+
commands:
49+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
50+
- "docker push vllm/vllm-tpu:nightly"
51+
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
52+
plugins:
53+
- docker-login#v3.0.0:
54+
username: vllm
55+
password-env: DOCKERHUB_TOKEN
56+
env:
57+
DOCKER_BUILDKIT: "1"
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
name: Lint and Deploy Charts
2+
3+
on: pull_request
4+
5+
jobs:
6+
lint-and-deploy:
7+
runs-on: ubuntu-latest
8+
steps:
9+
- name: Checkout
10+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
11+
with:
12+
fetch-depth: 0
13+
14+
- name: Set up Helm
15+
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
16+
with:
17+
version: v3.14.4
18+
19+
# Python is required because ct lint runs Yamale and yamllint, both of which require Python.
20+
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
21+
with:
22+
python-version: '3.13'
23+
24+
- name: Set up chart-testing
25+
uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
26+
with:
27+
version: v3.10.1
28+
29+
- name: Run chart-testing (lint)
30+
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
31+
32+
- name: Setup minio
33+
run: |
34+
docker network create vllm-net
35+
docker run -d -p 9000:9000 --name minio --net vllm-net \
36+
-e "MINIO_ACCESS_KEY=minioadmin" \
37+
-e "MINIO_SECRET_KEY=minioadmin" \
38+
-v /tmp/data:/data \
39+
-v /tmp/config:/root/.minio \
40+
minio/minio server /data
41+
export AWS_ACCESS_KEY_ID=minioadmin
42+
export AWS_SECRET_ACCESS_KEY=minioadmin
43+
export AWS_EC2_METADATA_DISABLED=true
44+
mkdir opt-125m
45+
cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
46+
aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
47+
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
48+
49+
- name: Create kind cluster
50+
uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
51+
52+
- name: Build the Docker image vllm cpu
53+
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
54+
55+
- name: Configuration of docker images, network and namespace for the kind cluster
56+
run: |
57+
docker pull amazon/aws-cli:2.6.4
58+
kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
59+
kind load docker-image vllm-cpu-env:latest --name chart-testing
60+
docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
61+
kubectl create ns ns-vllm
62+
63+
- name: Run chart-testing (install)
64+
run: |
65+
export AWS_ACCESS_KEY_ID=minioadmin
66+
export AWS_SECRET_ACCESS_KEY=minioadmin
67+
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
68+
69+
- name: curl test
70+
run: |
71+
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
72+
sleep 10
73+
CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
74+
--header "Content-Type: application/json" \
75+
--data '{
76+
"model": "opt-125m",
77+
"prompt": "San Francisco is a",
78+
"max_tokens": 7,
79+
"temperature": 0
80+
}'):$CODE"
81+
echo "$CODE"

Dockerfile.neuron

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# default base image
2-
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
2+
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
3+
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
34

45
FROM $BASE_IMAGE
56

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone
1616
---
1717

1818
*Latest News* 🔥
19+
- [2024/12] vLLM joins the [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
1920
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
2021
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
2122
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!

csrc/cache_kernels.cu

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,20 @@ void reshape_and_cache_flash(
307307
torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
308308
torch::Tensor&
309309
value_cache, // [num_blocks, block_size, num_heads, head_size]
310-
torch::Tensor& slot_mapping, // [num_tokens]
310+
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
311311
const std::string& kv_cache_dtype, const double k_scale,
312312
const double v_scale) {
313-
int num_tokens = key.size(0);
313+
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
314+
// slot_mapping.size(0) because of padding for CUDA graphs.
315+
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
316+
// both include padding.
317+
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
318+
// since key includes padding for CUDA graphs, while slot_mapping does not.
319+
// In this case, slot_mapping.size(0) represents the actual number of tokens
320+
// before padding.
321+
// For compatibility with both cases, we use slot_mapping.size(0) as the
322+
// number of tokens.
323+
int num_tokens = slot_mapping.size(0);
314324
int num_heads = key.size(1);
315325
int head_size = key.size(2);
316326
int block_size = key_cache.size(1);

docs/requirements-docs.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@ mistral_common >= 1.5.0
1616
aiohttp
1717
starlette
1818
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
19+
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
1920
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
2021
requests

docs/source/index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ Documentation
8282
serving/openai_compatible_server
8383
serving/deploying_with_docker
8484
serving/deploying_with_k8s
85+
serving/deploying_with_helm
8586
serving/deploying_with_nginx
8687
serving/distributed_serving
8788
serving/metrics
@@ -102,6 +103,7 @@ Documentation
102103

103104
usage/lora
104105
usage/multimodal_inputs
106+
usage/tool_calling
105107
usage/structured_outputs
106108
usage/spec_decode
107109
usage/compatibility_matrix
968 KB
Loading

0 commit comments

Comments
 (0)