
feat: support runai streamer for vllm #423


Status: Open. Wants to merge 16 commits into base: main.
3 changes: 3 additions & 0 deletions api/inference/v1alpha1/playground_types.go
@@ -73,6 +73,9 @@ const (
    PlaygroundProgressing = "Progressing"
    // PlaygroundAvailable indicates the corresponding inference service is available now.
    PlaygroundAvailable string = "Available"
    // SkipModelLoaderAnnoKey indicates whether to skip the model loader,
    // enabling the inference engine to manage model loading directly.
    SkipModelLoaderAnnoKey = "llmaz.io/skip-model-loader"
)

// PlaygroundStatus defines the observed state of Playground
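The constant above only defines the annotation key; as `SkipModelLoader` in `pkg/controller_helper/helper.go` further down shows, the controller reacts only to the literal value `"true"`. An illustrative kubectl command with the same key and value (the examples in this PR set the annotation in the Playground manifest at creation time instead):

```bash
# Illustrative only: the exact key/value the controller checks for.
# The shipped examples set this under metadata.annotations of the Playground manifest;
# whether annotating an existing Playground retriggers a rebuild depends on the controller's reconciliation.
kubectl annotate playground deepseek-r1-distill-qwen-1-5b llmaz.io/skip-model-loader="true"
```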
4 changes: 4 additions & 0 deletions docs/examples/README.md
@@ -68,6 +68,10 @@ llama.cpp supports speculative decoding to significantly improve inference perfo

[Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.

### Loading models with Run:ai Model Streamer in vLLM

[Run:ai Model Streamer](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md) is a library that reads tensors concurrently and streams them to GPU memory. vLLM supports loading weights in the Safetensors format with the Run:ai Model Streamer. See the [example](./runai-streamer/) here.
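
For reference, the same load format can be exercised with vLLM directly, outside of llmaz; a minimal sketch, assuming vLLM is installed with its Run:ai streamer extra and using the example model from this PR:

```bash
# Sketch: plain vLLM with the Run:ai Model Streamer (the `runai` extra is assumed to be available).
pip install "vllm[runai]"
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --load-format runai_streamer
```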

### Multi-Host Inference

Model sizes keep growing: a Llama 3.1 405B FP16 LLM requires more than 750 GB of GPU memory for the weights alone, not counting the KV cache. Even 8 x H100 NVIDIA GPUs with 80 GB of HBM each cannot fit it on a single host, so a multi-host deployment is required; see the [example](./multi-nodes/) here.
@@ -0,0 +1,44 @@
# This example demonstrates how to use the Run:ai Model Streamer to load models from the local file system.
# The model-loader initContainer first downloads the model from Hugging Face.
# By using `--load-format runai_streamer`, vLLM leverages the Run:ai Model Streamer to stream models from the local file system.
# While this approach may be slightly slower than streaming directly from S3 (due to the initial download to local disk),
# it still offers faster model loading compared to not using the Streamer,
# as it utilizes multiple threads to concurrently read tensor data from files into a dedicated CPU buffer,
# and then transfers the tensors to GPU memory.
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: deepseek-r1-distill-qwen-1-5b
spec:
  familyName: deepseek
  source:
    modelHub:
      modelID: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
  inferenceConfig:
    flavors:
      - name: t4 # GPU type
        limits:
          nvidia.com/gpu: 1
---
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: deepseek-r1-distill-qwen-1-5b
spec:
  replicas: 1
  modelClaim:
    modelName: deepseek-r1-distill-qwen-1-5b
  backendRuntimeConfig:
    backendName: vllm # currently, only vllm supports runai streamer
    args:
      - --load-format
      - runai_streamer
    resources:
      limits:
        cpu: "4"
        memory: 16Gi
        nvidia.com/gpu: "1"
      requests:
        cpu: "4"
        memory: 16Gi
        nvidia.com/gpu: "1"
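
A minimal deployment sketch for the example above; the manifest path is a placeholder (the file name is not shown in this view), and the fully qualified resource name assumes the Playground CRD's plural is `playgrounds`:

```bash
# Placeholder path: substitute the actual file name of the manifest above.
kubectl apply -f docs/examples/runai-streamer/<local-filesystem-example>.yaml
# Wait for the Playground to report Available, then check the pods;
# the model-loader initContainer is expected here because the skip annotation is not set.
kubectl get playgrounds.inference.llmaz.io deepseek-r1-distill-qwen-1-5b
kubectl get pods | grep deepseek-r1-distill-qwen-1-5b
```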
54 changes: 54 additions & 0 deletions docs/examples/runai-streamer/playground-streaming-from-s3.yaml
@@ -0,0 +1,54 @@
# This example demonstrates how to use the Run:ai Model Streamer to load models directly from S3.
# Adding the annotation `llmaz.io/skip-model-loader: "true"` skips the model-loader initContainer,
# allowing the inference engine to load models directly from remote storage (e.g., S3).
# By using `--load-format runai_streamer`, vLLM leverages the Run:ai Model Streamer to stream models directly from S3.
# vLLM will load models into the CPU buffer and then into GPU memory, without the need to download them to local disk first.
# This can significantly improve model loading speed and reduce disk usage.
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: deepseek-r1-distill-qwen-1-5b
spec:
  familyName: deepseek
  source:
    # Note: replace <YOUR_S3_BUCKET> with your actual S3 bucket name.
    # If the S3 bucket needs AWS credentials for authentication, first run:
    # `kubectl create secret generic aws-access-secret --from-literal=AWS_ACCESS_KEY_ID=<YOUR_ACCESS_KEY_ID> --from-literal=AWS_SECRET_ACCESS_KEY=<YOUR_SECRET_ACCESS_KEY>`
    uri: s3://<YOUR_S3_BUCKET>/DeepSeek-R1-Distill-Qwen-1.5B
  inferenceConfig:
    flavors:
      - name: t4 # GPU type
        limits:
          nvidia.com/gpu: 1
---
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: deepseek-r1-distill-qwen-1-5b
  annotations:
    llmaz.io/skip-model-loader: "true"
spec:
  replicas: 1
  modelClaim:
    modelName: deepseek-r1-distill-qwen-1-5b
  backendRuntimeConfig:
    backendName: vllm # currently, only vllm supports runai streamer
    args:
      - --load-format
      - runai_streamer
    envs:
      # The default value is 1 second. Increase it to 10 seconds to avoid timeouts under slow network conditions.
      - name: RUNAI_STREAMER_S3_REQUEST_TIMEOUT_MS
        value: "10000"
      # Controls the level of concurrency and the number of OS threads reading tensors from files into the CPU buffer; the default value is 16.
      #- name: RUNAI_STREAMER_CONCURRENCY
      #  value: "32"
    resources:
      limits:
        cpu: "4"
        memory: 16Gi
        nvidia.com/gpu: "1"
      requests:
        cpu: "4"
        memory: 16Gi
        nvidia.com/gpu: "1"
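
A minimal sketch of preparing credentials and deploying the S3 example, reusing the secret name and keys from the manifest comments above (bucket name and keys are placeholders):

```bash
# Create the AWS credentials secret referenced in the OpenModel comment above.
kubectl create secret generic aws-access-secret \
  --from-literal=AWS_ACCESS_KEY_ID=<YOUR_ACCESS_KEY_ID> \
  --from-literal=AWS_SECRET_ACCESS_KEY=<YOUR_SECRET_ACCESS_KEY>
# Apply the manifest after replacing <YOUR_S3_BUCKET>.
kubectl apply -f docs/examples/runai-streamer/playground-streaming-from-s3.yaml
# With llmaz.io/skip-model-loader: "true", the pods should start without a model-loader initContainer.
kubectl get pods | grep deepseek-r1-distill-qwen-1-5b
```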
9 changes: 9 additions & 0 deletions pkg/controller/inference/playground_controller.go
@@ -201,6 +201,15 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf
    // Build metadata
    serviceApplyConfiguration := inferenceclientgo.Service(playground.Name, playground.Namespace)

    if annotations := playground.GetAnnotations(); annotations != nil {
        // Propagate llmaz.io/skip-model-loader annotation to Inference Service.
        if value, exists := annotations[inferenceapi.SkipModelLoaderAnnoKey]; exists {
            serviceApplyConfiguration.WithAnnotations(map[string]string{
                inferenceapi.SkipModelLoaderAnnoKey: value,
            })
        }
    }

    // Build spec.
    spec := inferenceclientgo.ServiceSpec()

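Once a Playground carrying the annotation is applied, the propagation added above can be checked on the generated inference Service; a sketch that assumes the Service CRD's plural is `services.inference.llmaz.io`:

```bash
# Sketch: confirm the annotation was copied from the Playground onto the inference Service.
kubectl get services.inference.llmaz.io deepseek-r1-distill-qwen-1-5b \
  -o jsonpath='{.metadata.annotations.llmaz\.io/skip-model-loader}'
# Expected output when the Playground has the annotation: true
```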
20 changes: 18 additions & 2 deletions pkg/controller/inference/service_controller.go
@@ -196,10 +196,26 @@ func injectModelProperties(template *applyconfigurationv1.LeaderWorkerTemplateAp

    for i, model := range models {
        source := modelSource.NewModelSourceProvider(model)
        // Skip model-loader initContainer if llmaz.io/skip-model-loader annotation is set.
        if !helper.SkipModelLoader(service) {
            if isMultiNodesInference {
                source.InjectModelLoader(template.LeaderTemplate, i)
            }
            source.InjectModelLoader(template.WorkerTemplate, i)
        } else {
            if isMultiNodesInference {
                source.InjectModelEnvVars(template.LeaderTemplate)
            }
            source.InjectModelEnvVars(template.WorkerTemplate)
        }
    }

    // If model-loader initContainer is injected, we should mount the model-volume to the model-runner container.
    if !helper.SkipModelLoader(service) {
        if isMultiNodesInference {
            modelSource.InjectModelVolume(template.LeaderTemplate)
        }
        modelSource.InjectModelVolume(template.WorkerTemplate)
    }

// We only consider the main model's requirements for now.
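The two branches above determine whether a pod gets the model-loader initContainer or only the hub/remote environment variables; a quick, illustrative way to see which path was taken (the pod name is a placeholder):

```bash
# Sketch: list the initContainers of an inference pod.
# With llmaz.io/skip-model-loader: "true" the list should be empty; otherwise it should include the model loader.
kubectl get pod <inference-pod-name> -o jsonpath='{.spec.initContainers[*].name}'
```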
4 changes: 2 additions & 2 deletions pkg/controller_helper/backendruntime/backendruntime.go
@@ -65,14 +65,14 @@ func (p *BackendRuntimeParser) Args() ([]string, error) {

    source := modelSource.NewModelSourceProvider(mainModel)
    modelInfo := map[string]string{
        "ModelPath": source.ModelPath(helper.SkipModelLoader(p.playground)),
        "ModelName": source.ModelName(),
    }

    // TODO: This is not that reliable because two models don't always mean speculative-decoding.
    // Revisit this later.
    if len(p.models) > 1 {
        modelInfo["DraftModelPath"] = modelSource.NewModelSourceProvider(p.models[1]).ModelPath(helper.SkipModelLoader(p.playground))
    }

    for _, recommend := range p.backendRuntime.Spec.RecommendedConfigs {
8 changes: 8 additions & 0 deletions pkg/controller_helper/helper.go
@@ -21,6 +21,7 @@ import (

    coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
    inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -122,3 +123,10 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play

    return nil
}

func SkipModelLoader(obj metav1.Object) bool {
    if annotations := obj.GetAnnotations(); annotations != nil {
        return annotations[inferenceapi.SkipModelLoaderAnnoKey] == "true"
    }
    return false
}
100 changes: 67 additions & 33 deletions pkg/controller_helper/modelsource/modelhub.go
@@ -50,7 +50,13 @@ func (p *ModelHubProvider) ModelName() string {
// - modelID: Qwen/Qwen2-0.5B-Instruct-GGUF
// fileName: qwen2-0_5b-instruct-q5_k_m.gguf
// modelPath: /workspace/models/qwen2-0_5b-instruct-q5_k_m.gguf
func (p *ModelHubProvider) ModelPath(skipModelLoader bool) string {
    // Skip the model loader to allow the inference engine to handle loading models directly from model hub (e.g., Hugging Face, ModelScope).
    // In this case, the model ID should be returned (e.g., facebook/opt-125m).
    if skipModelLoader {
        return p.modelID
    }

    if p.fileName != nil {
        return CONTAINER_MODEL_PATH + *p.fileName
    }
@@ -108,59 +114,87 @@ func (p *ModelHubProvider) InjectModelLoader(template *corev1.PodTemplateSpec, i
    // Both HUGGING_FACE_HUB_TOKEN and HF_TOKEN works.
    initContainer.Env = append(initContainer.Env,
        corev1.EnvVar{
            Name: HUGGING_FACE_HUB_TOKEN,
            ValueFrom: &corev1.EnvVarSource{
                SecretKeyRef: &corev1.SecretKeySelector{
                    LocalObjectReference: corev1.LocalObjectReference{
                        Name: MODELHUB_SECRET_NAME, // if secret not exists, the env is empty.
                    },
                    Key:      HUGGING_FACE_TOKEN_KEY,
                    Optional: ptr.To[bool](true),
                },
            },
        })

    initContainer.Env = append(initContainer.Env,
        corev1.EnvVar{
            Name: HUGGING_FACE_TOKEN_KEY,
            ValueFrom: &corev1.EnvVarSource{
                SecretKeyRef: &corev1.SecretKeySelector{
                    LocalObjectReference: corev1.LocalObjectReference{
                        Name: MODELHUB_SECRET_NAME,
                    },
                    Key:      HUGGING_FACE_TOKEN_KEY,
                    Optional: ptr.To[bool](true),
                },
            },
        },
    )
    template.Spec.InitContainers = append(template.Spec.InitContainers, *initContainer)
}

func spreadEnvToInitContainer(containerEnv []corev1.EnvVar, initContainer *corev1.Container) {
    initContainer.Env = append(initContainer.Env, containerEnv...)
}

Review comment (Member): Again would like to see same behaviors across all the models.

Review thread:
Member: I think we have already injected the HF token in above L115. Keep one is enough.
Contributor (author): The InjectModelEnvVars function is used to inject model credentials into the model-runner container instead of the model-loader initContainer, in case the model-runner container handles the model loading itself.

func (p *ModelHubProvider) InjectModelEnvVars(template *corev1.PodTemplateSpec) {
    for i := range template.Spec.Containers {
        // We only consider this container.
        if template.Spec.Containers[i].Name == MODEL_RUNNER_CONTAINER_NAME {
            // Check if HuggingFace token environment variables already exist
            hfHubTokenExists := false
            hfTokenExists := false
            for _, env := range template.Spec.Containers[i].Env {
                if env.Name == HUGGING_FACE_HUB_TOKEN {
                    hfHubTokenExists = true
                }
                if env.Name == HUGGING_FACE_TOKEN_KEY {
                    hfTokenExists = true
                }
            }

            // Add HUGGING_FACE_HUB_TOKEN if it doesn't exist
            if !hfHubTokenExists {
                template.Spec.Containers[i].Env = append(template.Spec.Containers[i].Env,
                    corev1.EnvVar{
                        Name: HUGGING_FACE_HUB_TOKEN,
                        ValueFrom: &corev1.EnvVarSource{
                            SecretKeyRef: &corev1.SecretKeySelector{
                                LocalObjectReference: corev1.LocalObjectReference{
                                    Name: MODELHUB_SECRET_NAME, // if secret not exists, the env is empty.
                                },
                                Key:      HUGGING_FACE_TOKEN_KEY,
                                Optional: ptr.To[bool](true),
                            },
                        },
                    })
            }

            // Add HF_TOKEN if it doesn't exist
            if !hfTokenExists {
                template.Spec.Containers[i].Env = append(template.Spec.Containers[i].Env,
                    corev1.EnvVar{
                        Name: HUGGING_FACE_TOKEN_KEY,
                        ValueFrom: &corev1.EnvVarSource{
                            SecretKeyRef: &corev1.SecretKeySelector{
                                LocalObjectReference: corev1.LocalObjectReference{
                                    Name: MODELHUB_SECRET_NAME,
                                },
                                Key:      HUGGING_FACE_TOKEN_KEY,
                                Optional: ptr.To[bool](true),
                            },
                        },
                    })
            }
        }
    }
}
