From cecbf9b8321d73eec8af2887dbdf41cbb18622bd Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:22:30 +0200
Subject: [PATCH] docs: air-gapped deployments

---
 README.md                    | 44 ++++++++++++++++++++++++++----------
 docs/source/en/quick_tour.md | 23 +++++++++++++++++++
 router/src/main.rs           |  4 ++--
 3 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 837f3f54..8f9bf9d4 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,7 @@ length of 512 tokens:
   - [Docker Images](#docker-images)
   - [API Documentation](#api-documentation)
   - [Using a private or gated model](#using-a-private-or-gated-model)
+  - [Air-gapped deployment](#air-gapped-deployment)
   - [Using Re-rankers models](#using-re-rankers-models)
   - [Using Sequence Classification models](#using-sequence-classification-models)
   - [Using SPLADE pooling](#using-splade-pooling)
@@ -100,11 +101,10 @@ Below are some examples of the currently supported models:
 ### Docker
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id $model
 ```
 
 And then you can make requests like
@@ -347,6 +347,29 @@ token=<your cli READ token>
 docker run --gpus all -e HF_API_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id $model
 ```
 
+### Air-gapped deployment
+
+To deploy Text Embeddings Inference in an air-gapped environment, first download the weights on a machine with internet
+access, then mount them into the container with a volume.
+
+For example:
+
+```shell
+# (Optional) create a `models` directory
+mkdir models
+cd models
+
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5
+
+# Set the models directory as the volume path
+volume=$PWD
+
+# Mount the models directory inside the container with a volume and set the model ID
+docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id /data/gte-base-en-v1.5
+```
+
 ### Using Re-rankers models
 
 `text-embeddings-inference` v0.4.0 added support for CamemBERT, RoBERTa and XLM-RoBERTa Sequence Classification models.
@@ -428,11 +451,10 @@ found [here](https://github.com/huggingface/text-embeddings-inference/blob/main/
 You can use the gRPC API by adding the `-grpc` tag to any TEI Docker image. For example:
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4-grpc --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.4-grpc --model-id $model
 ```
 
 ```shell
@@ -463,10 +485,9 @@ cargo install --path router -F metal
 You can now launch Text Embeddings Inference on CPU with:
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 
-text-embeddings-router --model-id $model --revision $revision --port 8080
+text-embeddings-router --model-id $model --port 8080
 ```
 
 **Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
@@ -502,10 +523,9 @@ cargo install --path router -F candle-cuda -F http --no-default-features
 You can now launch Text Embeddings Inference on GPU with:
 
 ```shell
-model=BAAI/bge-large-en-v1.5
-revision=refs/pr/5
+model=Alibaba-NLP/gte-base-en-v1.5
 
-text-embeddings-router --model-id $model --revision $revision --port 8080
+text-embeddings-router --model-id $model --port 8080
 ```
 
 ## Docker build
diff --git a/docs/source/en/quick_tour.md b/docs/source/en/quick_tour.md
index b557f267..74d68f0d 100644
--- a/docs/source/en/quick_tour.md
+++ b/docs/source/en/quick_tour.md
@@ -121,3 +121,26 @@ curl 127.0.0.1:8080/predict \
     -d '{"inputs":[["I like you."], ["I hate pineapples"]]}' \
     -H 'Content-Type: application/json'
 ```
+
+## Air-gapped deployment
+
+To deploy Text Embeddings Inference in an air-gapped environment, first download the weights on a machine with internet
+access, then mount them into the container with a volume.
+
+For example:
+
+```shell
+# (Optional) create a `models` directory
+mkdir models
+cd models
+
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5
+
+# Set the models directory as the volume path
+volume=$PWD
+
+# Mount the models directory inside the container with a volume and set the model ID
+docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id /data/gte-base-en-v1.5
+```
diff --git a/router/src/main.rs b/router/src/main.rs
index a72caa8b..e9cf084f 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -14,10 +14,10 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
 struct Args {
     /// The name of the model to load.
     /// Can be a MODEL_ID as listed on <https://hf.co/models> like
-    /// `thenlper/gte-base`.
+    /// `Alibaba-NLP/gte-base-en-v1.5`.
     /// Or it can be a local directory containing the necessary files
     /// as saved by `save_pretrained(...)` methods of transformers
-    #[clap(default_value = "thenlper/gte-base", long, env)]
+    #[clap(default_value = "Alibaba-NLP/gte-base-en-v1.5", long, env)]
     #[redact(partial)]
     model_id: String,
 
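
For completeness, the sketch below walks the documented air-gapped flow end to end: exporting the server image, starting it on the isolated host from the locally cloned weights, and checking that it answers. It is a minimal sketch, not part of the patch itself; it assumes the `models/gte-base-en-v1.5` clone from the examples above, a free port 8080, and the illustrative archive name `tei.tar.gz`.

```shell
# On a machine with internet access: fetch the server image and export it
# so it can be carried onto the air-gapped host.
docker pull ghcr.io/huggingface/text-embeddings-inference:1.4
docker save ghcr.io/huggingface/text-embeddings-inference:1.4 | gzip > tei.tar.gz

# On the air-gapped host: import the image and start the server from the
# mounted weights; no registry or Hub access is needed at this point.
docker load < tei.tar.gz
volume=$PWD/models
docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-embeddings-inference:1.4 --model-id /data/gte-base-en-v1.5

# From another shell, embed a test sentence; a JSON array of floats confirms
# the weights were loaded from the volume rather than downloaded.
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```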