From 7602b3e9b71d308eadec35878348adc9b4c73539 Mon Sep 17 00:00:00 2001 From: Liz Johnson Date: Wed, 23 Apr 2025 13:20:54 -0700 Subject: [PATCH 1/6] fixed model deployment watch command --- ai-quick-actions/troubleshooting-tips.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-quick-actions/troubleshooting-tips.md b/ai-quick-actions/troubleshooting-tips.md index 78743502..a8914f7b 100644 --- a/ai-quick-actions/troubleshooting-tips.md +++ b/ai-quick-actions/troubleshooting-tips.md @@ -40,7 +40,7 @@ To successfully debug an issue, always select logging while creating model deplo Once the model deployment is intiated, you can monitor the logs by running on your notebook terminal- -`ads watch --auth resource_principal` +`ads opctl watch --auth resource_principal` To fetch the model deployment ocid - 1. Go to model deployments tab on AI Quick Actions From e0c0a16d05c1518ef92fd9e79a32cd2772a53df0 Mon Sep 17 00:00:00 2001 From: Liz Johnson Date: Mon, 28 Apr 2025 14:06:04 -0700 Subject: [PATCH 2/6] added documentation for embedding support --- .../multimodel-deployment-tips.md | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/ai-quick-actions/multimodel-deployment-tips.md b/ai-quick-actions/multimodel-deployment-tips.md index 08d5d677..0b0e1cbe 100644 --- a/ai-quick-actions/multimodel-deployment-tips.md +++ b/ai-quick-actions/multimodel-deployment-tips.md @@ -360,8 +360,13 @@ ads aqua deployment create [OPTIONS] `--models [str]` -The String representation of a JSON array, where each object defines a model’s OCID and the number of GPUs assigned to it. The gpu count should always be a **power of two (e.g., 1, 2, 4, 8)**.
-Example: `'[{"model_id":"", "gpu_count":1},{"model_id":"", "gpu_count":1}]'` for `VM.GPU.A10.2` shape.
+The String representation of a JSON array, where each object defines a model’s OCID, number of GPUs assigned to it.
The gpu count should always be a **power of two (e.g., 1, 2, 4, 8)**. + +Example: `'[{"model_id":"", "gpu_count":1},{"model_id":"", "gpu_count":1}]'` for `VM.GPU.A10.2` shape. + +For deploying embedding models, model_task must be specified. For best practice, model_task should be supplied. (Supported tasks: text_generation, image_text_to_text, code_synthesis, text_embedding) + +Example: `'[{"model_id":"", "gpu_count":1, "model_task": "text_generation"},{"model_id":"", "gpu_count":1, "model_task": "image_text_to_text"}]'` for `VM.GPU.A10.2` shape. `--instance_shape [str]` @@ -439,7 +444,8 @@ ads aqua deployment create \ --container_image_uri "dsmc://odsc-vllm-serving:0.6.4.post1.2" \ --models '[{"model_id":"ocid1.log.oc1.iad.", "gpu_count":1}, {"model_id":"ocid1.log.oc1.iad.", "gpu_count":1}]' \ --instance_shape "VM.GPU.A10.2" \ - --display_name "modelDeployment_multmodel_model1_model2" + --display_name "modelDeployment_multmodel_model1_model2" \ + --env_var '{"MODEL_DEPLOY_PREDICT_ENDPOINT": "/v1/completions"}' ``` @@ -499,7 +505,8 @@ ads aqua deployment create \ --models '[{"model_id":"ocid1.log.oc1.iad.", "gpu_count":1}, {"model_id":"ocid1.log.oc1.iad.", "gpu_count":1}]' \ --env-var '{"MODEL_DEPLOY_PREDICT_ENDPOINT":"/v1/chat/completions"}' \ --instance_shape "VM.GPU.A10.2" \ - --display_name "modelDeployment_multmodel_model1_model2" + --display_name "modelDeployment_multmodel_model1_model2" \ + --env_var '{"MODEL_DEPLOY_PREDICT_ENDPOINT": "/v1/chat/completions"}' ``` @@ -550,7 +557,23 @@ ads aqua deployment create \ "MULTI_MODEL_CONFIG": "{\"models\": [{\"params\": \"--served-model-name mistralai/Mistral-7B-v0.1 --seed 42 --tensor-parallel-size 1 --max-model-len 4096\", \"model_path\": \"service_models/Mistral-7B-v0.1/78814a9/artifact\"}, {\"params\": \"--served-model-name tiiuae/falcon-7b --seed 42 --tensor-parallel-size 1 --trust-remote-code\", \"model_path\": \"service_models/falcon-7b/f779652/artifact\"}]}", "MODEL_DEPLOY_ENABLE_STREAMING": "true", ``` +#### 
Create MultiModel (1 Embedding Model, 1 LLM) deployment with `/v1/completions` + +Note: will need to pass {"route": "v1/embeddings"} as a header for all inference requests to embedding model + +``` +headers={'route':'/v1/embeddings','Content-Type':'application/json'} +``` +- for /v1/chat/completions, modify "MODEL_DEPLOY_PREDICT_ENDPOINT" +```bash +ads aqua deployment create \ + --container_image_uri "dsmc://odsc-vllm-serving:0.6.4.post1.2" \ + --models '[{"model_id":"ocid1.log.oc1.iad.", "gpu_count":1, "model_task": "embedding"}, {"model_id":"ocid1.log.oc1.iad.", "gpu_count":1, "model_task": "text_generation"}]' \ + --instance_shape "VM.GPU.A10.2" \ + --display_name "modelDeployment_multmodel_model1_model2" \ + --env_var '{"MODEL_DEPLOY_PREDICT_ENDPOINT": "/v1/completions"}' +``` ## Manage MultiModel Deployments From 224ad7324e051d15d734f9a452f394cc407a0e37 Mon Sep 17 00:00:00 2001 From: Liz Johnson Date: Mon, 16 Jun 2025 12:00:15 -0700 Subject: [PATCH 3/6] init docs for FT MMD and UI for MMD --- .../multimodel-deployment-tips.md | 266 ++++++++++++++---- .../web_assets/create-deployment.png | Bin 0 -> 83897 bytes ai-quick-actions/web_assets/deploy-mmd.png | Bin 0 -> 174793 bytes ai-quick-actions/web_assets/mmd-details.png | Bin 0 -> 256382 bytes ai-quick-actions/web_assets/mmd-ex.png | Bin 0 -> 81932 bytes 5 files changed, 211 insertions(+), 55 deletions(-) create mode 100644 ai-quick-actions/web_assets/create-deployment.png create mode 100644 ai-quick-actions/web_assets/deploy-mmd.png create mode 100644 ai-quick-actions/web_assets/mmd-details.png create mode 100644 ai-quick-actions/web_assets/mmd-ex.png diff --git a/ai-quick-actions/multimodel-deployment-tips.md b/ai-quick-actions/multimodel-deployment-tips.md index c218fadd..6e674c22 100644 --- a/ai-quick-actions/multimodel-deployment-tips.md +++ b/ai-quick-actions/multimodel-deployment-tips.md @@ -1,40 +1,158 @@ -# **AI Quick Actions MultiModel Deployment (Available through CLI only)** - -# Table of Contents -- 
# Introduction to MultiModel Deployment and Serving -- [Models](#models) +# **Multi-Model Deployment with AI Quick Actions (AQUA)** +Multi-Model inference and serving refers to efficiently hosting and managing multiple large language models (LLMs) on a single compute shape on Oracle Cloud Infrastructure. This allows for inference requests to multiple models while being hosted on one compute instance, with one model deployment endpoint. + +The `model` parameter in the request payload selects which model handles an inference request. The Data Science service has a prebuilt **vLLM service container** that makes deploying and serving multiple large language models on a **single GPU Compute shape** very easy, simplifying the deployment process and reducing operational complexity. This container comes with a preinstalled [**LiteLLM proxy server**](https://docs.litellm.ai/docs/simple_proxy) which routes requests to the appropriate model, ensuring seamless prediction. + +![mmd-example](./web_assets/mmd-ex.png) + +# Models supported by Multi-Model Deployment + +**Multi-Model Deployment is currently in beta. At this time, the following is supported:** +- **[AQUA User Interface on Notebooks](#using-aqua-ui-interface-for-multi-model-deployment)** + - Only Multi-Model Deployments with **base service LLM models (text-generation)** can be deployed with AQUA UI. 
+ - fine-tuned/registered/custom models in Multi-Model Deployment must be deployed through [AQUA CLI](cli-tips.md) +- **[AQUA CLI](#using-aqua-cli-for-multi-model-deployment)** + - along w/ base service LLM models, **fine-tuned models, [custom models](#custom-models), multi-modal models** (image-text-to-text models) can be used in Multi-Model Deployments + +## Contents +- [**Multi-Model Deployment with AI Quick Actions (AQUA)**](#multi-model-deployment-with-ai-quick-actions-aqua) +- [Models supported by Multi-Model Deployment](#models-supported-by-multi-model-deployment) + - [Contents](#contents) + - [Setup](#setup) + - [For AQUA CLI](#for-aqua-cli) +- [Using AQUA UI Interface for Multi-Model Deployment](#using-aqua-ui-interface-for-multi-model-deployment) + - [Select the 'Create deployment' Button](#select-the-create-deployment-button) + - [Select 'Deploy Multi Model'](#select-deploy-multi-model) + - [Inferencing with Multi-Model Deployment](#inferencing-with-multi-model-deployment) +- [Using AQUA CLI for Multi-Model Deployment](#using-aqua-cli-for-multi-model-deployment) + - [1. Obtain Model OCIDs](#1-obtain-model-ocids) + - [Service Managed Models](#service-managed-models) + - [Fine-Tuned Models](#fine-tuned-models) - [Custom Models](#custom-models) -- [MultiModel Deployment](#multimodel-deployment) - - [List Available Shapes](#list-available-shapes) - - [Get MultiModel Configuration](#get-multimodel-configuration) - - [Create MultiModel Deployment](#create-multimodel-deployment) - - [Manage MultiModel Deployments](#manage-multimodel-deployments) -- [MultiModel Inferencing](#multimodel-inferencing) -- [MultiModel Evaluation](#multimodel-evaluation) - - [Create Model Evaluation](#create-model-evaluations) -- [Limitation](#limitations) + - [2. 
Before Deployment, Check Resource Limits](#2-before-deployment-check-resource-limits) + - [List Available Shapes](#list-available-shapes) + - [Usage](#usage) + - [Optional Parameters](#optional-parameters) + - [Example](#example) + - [CLI Output](#cli-output) + - [Obtain Model Configurations for Multi-Model Deployment](#obtain-model-configurations-for-multi-model-deployment) + - [Usage](#usage-1) + - [Required Parameters](#required-parameters) + - [Optional Parameters](#optional-parameters-1) + - [Example](#example-1) + - [CLI Output](#cli-output-1) + - [3. Create Multi-Model Deployment](#3-create-multi-model-deployment) + - [Description](#description) + - [Usage](#usage-2) + - [Required Parameters](#required-parameters-1) + - [Example with Base Models:](#example-with-base-models) + - [Example with Fine-Tuned-Model:](#example-with-fine-tuned-model) + - [Example with Multi-Modal \& Embedding Models:](#example-with-multi-modal--embedding-models) + - [Optional Parameters](#optional-parameters-2) + - [Example](#example-2) + - [Create Multi-Model deployment with `/v1/completions`](#create-multi-model-deployment-with-v1completions) + - [CLI Output](#cli-output-2) + - [Create Multi-Model deployment with `/v1/chat/completions`](#create-multi-model-deployment-with-v1chatcompletions) + - [CLI Output](#cli-output-3) + - [Create Multi-Model (1 Embedding Model, 1 LLM) deployment with `/v1/completions`](#create-multi-model-1-embedding-model-1-llm-deployment-with-v1completions) + - [Manage Multi-Model Deployments](#manage-multi-model-deployments) +- [Multi-Model Inferencing](#multi-model-inferencing) + - [Using oci-cli](#using-oci-cli) + - [Using Python SDK (without streaming)](#using-python-sdk-without-streaming) + - [Using Python SDK (with streaming)](#using-python-sdk-with-streaming) + - [Using Python SDK for /v1/chat/completions endpoint](#using-python-sdk-for-v1chatcompletions-endpoint) + - [Using Java (with streaming)](#using-java-with-streaming) + - [Multiple Inference 
endpoints](#multiple-inference-endpoints) +- [Multi-Model Evaluations](#multi-model-evaluations) + - [Create Model Evaluations](#create-model-evaluations) + - [Description](#description-1) + - [Usage](#usage-3) + - [Required Parameters](#required-parameters-2) + - [Optional Parameters](#optional-parameters-3) + - [Example](#example-3) + - [CLI Output](#cli-output-4) +- [Limitations](#limitations) - [Supported Service Models](#supported-service-models) -- [Tensor Parallelism VS Multi-Instance GPU (MIG)](#tensor-parallelism-vs-multi-instance-gpu) +- [Tensor Parallelism VS Multi-Instance GPU](#tensor-parallelism-vs-multi-instance-gpu) + - [Key Differences](#key-differences) + - [Summary](#summary) + + +## Setup + +- Ensure requirements (AQUA policies & OCI Bucket Versioning) are met [here](register-tips.md#Prerequisites). + +#### For AQUA CLI +- Install the latest version of Accelerated Data Science (ADS) to run Multi-Model Deployments; installation instructions are available [here](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/quickstart.html). + +# Using AQUA UI Interface for Multi-Model Deployment +Only Multi-Model Deployments with **base service LLM models (text-generation)** can be deployed with AQUA UI. +- For fine-tuned, registered, or custom models in Multi-Model Deployment, use the AQUA CLI as detailed [here](#using-aqua-cli-for-multi-model-deployment) + +### Select the 'Create deployment' Button +![create-deployment](./web_assets/create-deployment.png) + +### Select 'Deploy Multi Model' +- Based on the 'models' field, a Compute Shape will be recommended to accommodate the selected models. +- Select logging and endpoints (/v1/completions | /v1/chat/completions). +- Submit the form via the 'Deploy' button at the bottom. +![mmd-form](./web_assets/deploy-mmd.png) + +### Inferencing with Multi-Model Deployment + +There are two ways to send inference requests to models within a Multi-Model Deployment: + +1. 
Python SDK (recommended) - see [here](#multi-model-inferencing) +2. Using AQUA UI (see below, ok for testing) + +Once the Deployment is Active, view the model deployment details and inferencing form by clicking on the 'Deployments' Tab and selecting the model within the Model Deployment list. +Use the dropdown labeled 'Model parameters' to select a specific model for inference. -# Introduction to MultiModel Deployment and Serving +![mmd-details](./web_assets/mmd-details.png) -MultiModel inference and serving refers to efficiently hosting and managing multiple large language models simultaneously to serve inference requests using shared resources. The Data Science server has prebuilt **vLLM service container** that make deploying and serving multiple large language model on **single GPU Compute shape** very easy, simplifying the deployment process and reducing operational complexity. This container comes with preinstalled [**LiteLLM proxy server**](https://docs.litellm.ai/docs/simple_proxy) which routes requests to the appropriate model, ensuring seamless prediction. +# Using AQUA CLI for Multi-Model Deployment +This section describes how to use the ADS CLI to create a Multi-Model deployment with AI Quick Actions (AQUA) and evaluate the deployed models. There are three steps: -**MultiModel Deployment is currently in beta and is only available through the CLI. At this time, only base service LLM models are supported, and fine-tuned/registered models cannot be deployed.** +## 1. Obtain Model OCIDs -This document provides documentation on how to use ADS CLI to create MultiModel deployment using AI Quick Actions (AQUA) model deployments, and evaluate the models. you'll need the latest version of ADS to run these, installation instructions are available [here](https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/quickstart.html). 
+Again, only **base service managed LLM models (text-generation)** can be deployed through the AQUA User Interface on OCI Notebooks. +- Use the **[AQUA CLI](cli-tips.md)** for **[fine-tuned models](#fine-tuned-models), [custom models](#custom-models), multi-modal models (image-text-to-text models)** +### Service Managed Models -# Models +- Obtain the OCID from the AQUA user interface by clicking on the model card and selecting the `Copy OCID` button from the `More Options` dropdown in the top-right corner of the screen. +- Refer to [AQUA CLI tips](cli-tips.md#get-service-model-ocid) for detailed instructions on how to obtain the OCIDs of service-managed models via CLI. -First step in process is to get the OCIDs of the desired base service LLM AQUA models, which are required to initiate the MultiModel deployment process. Refer to [AQUA CLI tips](cli-tips.md) for detailed instructions on how to obtain the OCIDs of base service LLM AQUA models. +### Fine-Tuned Models +To use a fine-tuned model in a Multi-Model Deployment, we must create the fine-tuned model first as specified below: +- **Register the Base Model** -You can also obtain the OCID from the AQUA user interface by clicking on the model card and selecting the `Copy OCID` button from the `More Options` dropdown in the top-right corner of the screen. + Register the **base model** using either the **AI Quick Actions UI** or **ADS CLI**. This will upload the model to the Object Storage and associate it with a Model Catalog record. -## Custom Models +- **Fine-Tune the Model** -Out of the box, MultiModel Deployment currently supports only AI Quick Actions service LLM models (see [requirements](#introduction-to-multimodel-deployment-and-serving) above). However, it is also possible to enable support for *custom-registered* models by manually adding a deployment configuration to the model artifact folder in Object Storage. 
+ Fine-tune your model using **[AI Quick Actions UI](fine-tuning-tips.md#Fine-Tune-a-Model)** or **[ADS CLI](cli-tips.md#Model-Fine-Tuning)**. This will create a Data Science Job using your dataset (.jsonl file) and create a LoRA module uploaded to the specified bucket. + +- **Obtain the Base Model OCID** + - On the model tab, obtain the OCID from the AQUA user interface by clicking on the **base** model card and selecting the `Copy OCID` button from the `More Options` dropdown in the top-right corner of the screen. + - To obtain the OCIDs of Service/User-Registered Models **via CLI**: [docs](cli-tips.md#get-service-model-ocid) + +- **Obtain Fine-Tuned Model OCID, Fine-Tuned Model Name** + - On the model tab, select the Fine-Tuned tab, and obtain the OCID & Model Name from the AQUA user interface by clicking on the **Fine-Tuned** model card and selecting the `Copy OCID` button from the `More Options` dropdown in the top-right corner of the screen. + - To Obtain Fine-Tuned Model OCID via CLI: [docs](cli-tips.md#get-fine-tuned-model-ocid) + - To Obtain Fine-Tuned Model Name via CLI: [docs](cli-tips.md#get-model-details) + - The "name" field in the CLI output will be the Fine-Tuned Model Name + - Use the Fine-Tuned Model OCID for CLI command + +- **Use Fine-Tuned model within a Multi-Model Deployment via AQUA CLI** + - Complete the following two steps: + - [Ensure](#2-before-deployment-check-resource-limits) that the models within the Multi-Model Deployment are compatible with the selected compute shape and that the compute shape is available in your tenancy. + - [Using AQUA CLI](#3-create-multi-model-deployment), create a Multi-Model Deployment with a Fine-Tuned Model as shown [here](#example-with-fine-tuned-model) + +### Custom Models + +Using the AQUA CLI (not supported in AQUA UI), it is also possible to enable support for *custom-registered* models by manually adding a deployment configuration to the model artifact folder in Object Storage. 
Follow the steps below to enable MultiModel Deployment support for your custom models: @@ -131,27 +249,28 @@ Follow the steps below to enable MultiModel Deployment support for your custom m Use the [MultiModel Configuration command](#get-multimodel-configuration) to check whether the selected model is compatible with multi-model deployment now. -# MultiModel Deployment +## 2. Before Deployment, Check Resource Limits +Before starting a Multi-Model Deployment, we must check the following resource limits: +- Determine the compute shapes available for model deployment in our OCI Environment, seen [here](#list-available-shapes). +- After selecting an available compute shape, we check if the models selected are compatible with the selected shape, seen [here](#Obtain-Model-Configurations-for-Multi-Model-Deployment). -## List Available Shapes - -### Description +### List Available Shapes Lists the available **Compute Shapes** with basic information such as name, configuration, CPU/GPU specifications, and memory capacity for the shapes supported by the Model Deployment service in your compartment. -### Usage +#### Usage ```bash ads aqua deployment list_shapes ``` -### Optional Parameters +#### Optional Parameters `--compartment_id [str]` The compartment OCID where model deployment is to be created. If not provided, then it defaults to user's compartment. -### Example +#### Example ```bash ads aqua deployment list_shapes ``` @@ -184,25 +303,24 @@ ads aqua deployment list_shapes ``` -## Get MultiModel Configuration +### Obtain Model Configurations for Multi-Model Deployment -### Description +Retrieves the deployment configuration for multiple models and calculates the GPU allocations for all compatible shapes. +- **For Fine-Tuned Models**, use the OCID on the fine-tuned model card/ fine-tuned model catalog entry (not base model OCID) -Retrieves the deployment configuration for multiple base Aqua service models and calculates the GPU allocations for all compatible shapes. 
- -### Usage +#### Usage ```bash ads aqua deployment get_multimodel_deployment_config [OPTIONS] ``` -### Required Parameters +#### Required Parameters `--model_ids [list]` A list of OCIDs for the Aqua models
-### Optional Parameters +#### Optional Parameters `--primary_model_id [str]` @@ -221,7 +339,7 @@ If B is the primary model, the gpu allocation is [2, 4, 2] as B always gets the The compartment OCID to retrieve the models and available model deployment shapes. -### Example +#### Example ```bash ads aqua deployment get_multimodel_deployment_config --model_ids '["ocid1.datasciencemodel.oc1.iad.","ocid1.datasciencemodel.oc1.iad."]' @@ -343,7 +461,7 @@ ads aqua deployment get_multimodel_deployment_config --model_ids '["ocid1.datasc } ``` -## Create MultiModel Deployment +## 3. Create Multi-Model Deployment Only **base service LLM models** are supported for MultiModel Deployment. All selected models will run on the same **GPU shape**, sharing the available compute resources. Make sure to choose a shape that meets the needs of all models in your deployment using [MultiModel Configuration command](#get-multimodel-configuration) @@ -362,13 +480,53 @@ ads aqua deployment create [OPTIONS] `--models [str]` -The String representation of a JSON array, where each object defines a model’s OCID, number of GPUs assigned to it.
The gpu count should always be a **power of two (e.g., 1, 2, 4, 8)**. +The String representation of a JSON array, where each object defines: +- model’s OCID +- number of GPUs assigned to it +- Fine Tuned Weights (if Fine-Tuned Model only). + -Example: `'[{"model_id":"", "gpu_count":1},{"model_id":"", "gpu_count":1}]'` for `VM.GPU.A10.2` shape. +The gpu count should always be a **power of two (e.g., 1, 2, 4, 8)**. -For deploying embedding models, model_task must be specified. For best practice, model_task should be supplied. (Supported tasks: text_generation, image_text_to_text, code_synthesis, text_embedding) +#### Example with Base Models: + +`'[{"model_id":"", "gpu_count":1},{"model_id":"", "gpu_count":1}]'` for `VM.GPU.A10.2` shape. + +#### Example with Fine-Tuned-Model: + +For **fine-tuned models**, specify the OCID of the **base model** in the top-level "model_id" field. +List your fine-tuned models in the **"fine_tune_weights" array**, where each entry includes the **OCID and name of a fine-tuned model** derived from the base model. +``` +'[ # Fine-Tuned-Model + { + "model_id": "ocid1.datasciencemodel.oc1.iad.", + "gpu_count": 1, + "model_name": "meta-llama/Meta-Llama-3.1-8B", + "model_task": "text_generation", + "fine_tune_weights": [ + { + "model_id": "ocid1.datasciencemodel.oc1.iad.<>", + "model_name": "meta-llama/Meta-Llama-3.1-8B-FT1" + }, + { + "model_id": "ocid1.datasciencemodel.oc1.iad.<>", + "model_name": "meta-llama/Meta-Llama-3.1-8B-FT2" + } + ] + }, # 2nd Model + { + "model_task": "text_generation", + "model_id": "ocid1.datasciencemodel.oc1.iad.", + "gpu_count": 1 + } + ]' +``` +#### Example with Multi-Modal & Embedding Models: + +`'[{"model_id":"", "gpu_count":1, "model_task": "text_generation"},{"model_id":"", "gpu_count":1, "model_task": "image_text_to_text"}]'` for `VM.GPU.A10.2` shape. + +For deploying multi-modal and embedding models, model_task must be specified. For best practice, model_task should be supplied. 
(Supported tasks: text_generation, image_text_to_text, code_synthesis, text_embedding) -Example: `'[{"model_id":"", "gpu_count":1, "model_task": "text_generation"},{"model_id":"", "gpu_count":1, "model_task": "image_text_to_text"}]'` for `VM.GPU.A10.2` shape. `--instance_shape [str]` @@ -439,7 +597,7 @@ The private endpoint id of model deployment. ### Example -#### Create MultiModel deployment with `/v1/completions` +#### Create Multi-Model deployment with `/v1/completions` ```bash ads aqua deployment create \ @@ -499,7 +657,7 @@ ads aqua deployment create \ "MODEL_DEPLOY_ENABLE_STREAMING": "true", ``` -#### Create MultiModel deployment with `/v1/chat/completions` +#### Create Multi-Model deployment with `/v1/chat/completions` ```bash ads aqua deployment create \ @@ -559,7 +717,7 @@ ads aqua deployment create \ "MULTI_MODEL_CONFIG": "{\"models\": [{\"params\": \"--served-model-name mistralai/Mistral-7B-v0.1 --seed 42 --tensor-parallel-size 1 --max-model-len 4096\", \"model_path\": \"service_models/Mistral-7B-v0.1/78814a9/artifact\"}, {\"params\": \"--served-model-name tiiuae/falcon-7b --seed 42 --tensor-parallel-size 1 --trust-remote-code\", \"model_path\": \"service_models/falcon-7b/f779652/artifact\"}]}", "MODEL_DEPLOY_ENABLE_STREAMING": "true", ``` -#### Create MultiModel (1 Embedding Model, 1 LLM) deployment with `/v1/completions` +#### Create Multi-Model (1 Embedding Model, 1 LLM) deployment with `/v1/completions` Note: will need to pass {"route": "v1/embeddings"} as a header for all inference requests to embedding model @@ -577,17 +735,15 @@ ads aqua deployment create \ ``` -## Manage MultiModel Deployments - -### Description +## Manage Multi-Model Deployments -To list all AQUA deployments (both MultiModel and single-model) within a specified compartment or project, or to get detailed information on a specific MultiModel deployment, kindly refer to the [AQUA CLI tips](cli-tips.md) documentation. 
+To list all AQUA deployments (both Multi-Model and single-model) within a specified compartment or project, or to get detailed information on a specific Multi-Model deployment, kindly refer to the [AQUA CLI tips](cli-tips.md) documentation. -Note: MultiModel deployments are identified by the tag `"aqua_multimodel": "true",` associated with them. +Note: Multi-Model deployments are identified by the tag `"aqua_multimodel": "true",` associated with them. -# MultiModel Inferencing +# Multi-Model Inferencing -The only change required to infer a specific model from a MultiModel deployment is to update the value of `"model"` parameter in the request payload. The values for this parameter can be found in the Model Deployment details, under the field name `"model_name"`. This parameter segregates the request flow, ensuring that the inference request is directed to the correct model within the MultiModel deployment. +The only change required to infer a specific model from a Multi-Model deployment is to update the value of `"model"` parameter in the request payload. The values for this parameter can be found in the Model Deployment details, under the field name `"model_name"`. This parameter segregates the request flow, ensuring that the inference request is directed to the correct model within the MultiModel deployment. ## Using oci-cli @@ -895,7 +1051,7 @@ oci raw-request \ ``` -# MultiModel Evaluations +# Multi-Model Evaluations ## Create Model Evaluations diff --git a/ai-quick-actions/web_assets/create-deployment.png b/ai-quick-actions/web_assets/create-deployment.png new file mode 100644 index 0000000000000000000000000000000000000000..159b2673d11c4490a108b9d5ba9939485a520f0e GIT binary patch literal 83897 zcmZU)1yClz4z4jbyX&r zB)?=jJySE?T@xlRD+UjP4FduK0xuyhtOx=ES^On0LP2~T9pcVdK|tW7&4q;IC4__s z>a?QP&A!R*`tdrI<~zyhdXdEqGQIY! 
zXpT@0&0kF_hk<%Fg`JW(PVD^c8vvoS7Xono-QfoX6%}qT)AUDIX9qNrvFC(9+5ObV zr)FayaT^E-Da;N_o^~wkwE)N!xxaEUB!~ix`)!%KBAE_+T{IjQlzkG4X{lWTN=eyO z5-_6`KsP7{LRfM8Tn`7r(oQrrifP1;#}$deArWW*J?MkEuSzo1sE(>e}UoTr%5@uD*4Pge%d5s7h_}yVXZt#tB(a3k2BcH)l-E0 zsN)(WAT>vhL$U<(+cD73=|W*a&GUdAX^*H9>5`+O~B2-#)n8Jn*`J$ornlm zdJao5y%s7^{@C&q;Mn>F()$fYeXiFi_WNMf&-DITsc{3m+eUsMPlhh%z#eL;&?9;7B2mghB;`wbnr%`?{{|-0lMw zuuOOH!R)Q9r5(^jPW`Yi~*~i!t+5v@IACzwzE_K+j`UY7fi0% zvD*&<>A`h+dN>9%)I)e=hunf&_rtsb^Cv_T{>2dsPA+)Kyv>#3U#3T%3K`B1t&Xth z=THl6^<71eeHrmY0F?uJ@|V;WOe^R`|H2%oj-SmHf|n>#VBjc^9n7qm{a+P3qP z`5X2J(&cauvKhG0FW$a{UXn->Bcx)KaHyid^FX0M=BuA4gq}iC)bE@h>1a~n6lC$lh0-N71y%(tigZd`lsRRZ%9iEqtkLNrSz@#W zi;7e7mkOJuWo2n)?xi9XVG9T4NoD6M+%*sK=LPM}?O9xEydppIx|O{{UZC%}Pi)Ot z($ObklBLHAXp2E5<=-AHKYOFf5Kb)kvLV$5P{PaZxxw^%! z=87y;EN$st6D#TH=^?DP<_~6!jiTlv7W)>oGpl9J0;g?k5^Iv&@>-=`{GJsLKh8(s zBK0S?n7|{SBdsHuBN6cOqni-qD5Oe+OZi2Aa zGKaFITGY;LMG7q0Wo_#Wd>@Dixg_|=AhIws1sHM+*j4C9=*Ps>Db&Wzn`O=XD4;Ve zuFJ7czP8vk9L^$#LnT11`(u%?OYU3iq=Bo7P!Z{zzF=)(hHX&Juwk%j@v<_$O7pjJ z+A3zn+p(N;jc3EF$-Tk7B`9o2Jt7T*6{AN(k!B@1E!mQGN~61_zFD$S(@e$oeqD1a z-Sns4PebpiV3S=-=Aq)*gLc}PWa|v;j=qO*Jc~n%6mvCw*A@IG*9Ocr+ge*30l+q+ zAXC1>AT6XbWVBGXP}G2sSg~QNVV+^v?pPcKNrG54z};q#okFbyx5Qf3Rm?mF22X{f z+hZl-