From cecbf9b8321d73eec8af2887dbdf41cbb18622bd Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:22:30 +0200
Subject: [PATCH] docs: air-gapped deployments

---
 README.md                    | 44 ++++++++++++++++++++++++++----------
 docs/source/en/quick_tour.md | 23 +++++++++++++++++++
 router/src/main.rs           |  4 ++--
 3 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 837f3f54..8f9bf9d4 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,7 @@ length of 512 tokens:
   - [Docker Images](#docker-images)
   - [API Documentation](#api-documentation)
   - [Using a private or gated model](#using-a-private-or-gated-model)
+  - [Air-gapped deployment](#air-gapped-deployment)
   - [Using Re-rankers models](#using-re-rankers-models)
   - [Using Sequence Classification models](#using-sequence-classification-models)
   - [Using SPLADE pooling](#using-splade-pooling)
@@ -100,11 +101,10 @@ Below are some examples of the currently supported models:
 ### Docker
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id $model
 ```
 
 And then you can make requests like
@@ -347,6 +347,29 @@ token=<your cli READ token>
 docker run --gpus all -e HF_API_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id $model
 ```
 
+### Air-gapped deployment
+
+To deploy Text Embeddings Inference in an air-gapped environment, first download the weights on a machine with internet
+access, then mount them into the container with a volume.
+
+For example:
+
+```shell
+# (Optional) create a `models` directory
+mkdir models
+cd models
+
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5
+
+# Set the models directory as the volume path
+volume=$PWD
+
+# Mount the models directory inside the container with a volume and set the model ID
+docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id /data/gte-base-en-v1.5
+```
+
 ### Using Re-rankers models
 
 `text-embeddings-inference` v0.4.0 added support for CamemBERT, RoBERTa and XLM-RoBERTa Sequence Classification models.
@@ -428,11 +451,10 @@ found [here](https://github.com/huggingface/text-embeddings-inference/blob/main/
 You can use the gRPC API by adding the `-grpc` tag to any TEI Docker image. For example:
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4-grpc --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4-grpc --model-id $model
 ```
 
 ```shell
@@ -463,10 +485,9 @@ cargo install --path router -F metal
 You can now launch Text Embeddings Inference on CPU with:
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 
-text-embeddings-router --model-id $model --revision $revision --port 8080
+text-embeddings-router --model-id $model --port 8080
 ```
 
 **Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
@@ -502,10 +523,9 @@ cargo install --path router -F candle-cuda -F http --no-default-features
 You can now launch Text Embeddings Inference on GPU with:
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 
-text-embeddings-router --model-id $model --revision $revision --port 8080
+text-embeddings-router --model-id $model --port 8080
 ```
 
 ## Docker build
diff --git a/docs/source/en/quick_tour.md b/docs/source/en/quick_tour.md
index b557f267..74d68f0d 100644
--- a/docs/source/en/quick_tour.md
+++ b/docs/source/en/quick_tour.md
@@ -121,3 +121,26 @@ curl 127.0.0.1:8080/predict \
     -d '{"inputs":[["I like you."], ["I hate pineapples"]]}' \
     -H 'Content-Type: application/json'
 ```
+
+## Air-gapped deployment
+
+To deploy Text Embeddings Inference in an air-gapped environment, first download the weights on a machine with internet
+access, then mount them into the container with a volume.
+
+For example:
+
+```shell
+# (Optional) create a `models` directory
+mkdir models
+cd models
+
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5
+
+# Set the models directory as the volume path
+volume=$PWD
+
+# Mount the models directory inside the container with a volume and set the model ID
+docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id /data/gte-base-en-v1.5
+```
diff --git a/router/src/main.rs b/router/src/main.rs
index a72caa8b..e9cf084f 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -14,10 +14,10 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 struct Args {
     /// The name of the model to load.
     /// Can be a MODEL_ID as listed on <https://hf.co/models> like
-    /// `thenlper/gte-base`.
+    /// `Alibaba-NLP/gte-base-en-v1.5`.
     /// Or it can be a local directory containing the necessary files
     /// as saved by `save_pretrained(...)` methods of transformers
-    #[clap(default_value = "thenlper/gte-base", long, env)]
+    #[clap(default_value = "Alibaba-NLP/gte-base-en-v1.5", long, env)]
     #[redact(partial)]
     model_id: String,
 
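
For completeness, the sketch below walks the documented air-gapped flow end to end: exporting the server image, starting it on the isolated host from the locally cloned weights, and checking that it answers. It is a minimal sketch, not part of the patch itself; it assumes the `models/gte-base-en-v1.5` clone from the examples above, a free port 8080, and the illustrative archive name `tei.tar.gz`.

```shell
# On a machine with internet access: fetch the server image and export it
# so it can be carried onto the air-gapped host.
docker pull ghcr.io/huggingface/text-embeddings-inference:1.4
docker save ghcr.io/huggingface/text-embeddings-inference:1.4 | gzip > tei.tar.gz

# On the air-gapped host: import the image and start the server from the
# mounted weights; no registry or Hub access is needed at this point.
docker load < tei.tar.gz
volume=$PWD/models
docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id /data/gte-base-en-v1.5

# From another shell, embed a test sentence; a JSON array of floats confirms
# the weights were loaded from the volume rather than downloaded.
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```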