huggingface · OlivierDehaene · Dec 12, 2024 · Jul 28, 2024 · Jul 29, 2024 · Jul 29, 2024
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ Ember, GTE and E5. TEI implements many features such as:
 #### Text Embeddings
 
 Text Embeddings Inference currently supports Nomic, BERT, CamemBERT, XLM-RoBERTa models with absolute positions, JinaBERT
-model with Alibi positions and Mistral, Alibaba GTE and Qwen2 models with Rope positions.
+model with Alibi positions, Mistral, Alibaba GTE and Qwen2 models with Rope positions, and MPNet models.
 
 Below are some examples of the currently supported models:
 
@@ -81,7 +81,7 @@ Below are some examples of the currently supported models:
 | N/A       | 0.1B                | NomicBert   | [nomic-ai/nomic-embed-text-v1.5](https://hf.co/nomic-ai/nomic-embed-text-v1.5)                   |
 | N/A       | 0.1B                | JinaBERT    | [jinaai/jina-embeddings-v2-base-en](https://hf.co/jinaai/jina-embeddings-v2-base-en)             |
 | N/A       | 0.1B                | JinaBERT    | [jinaai/jina-embeddings-v2-base-code](https://hf.co/jinaai/jina-embeddings-v2-base-code)         |
-
+| N/A       | 0.1B                | MPNet       | [sentence-transformers/all-mpnet-base-v2](https://hf.co/sentence-transformers/all-mpnet-base-v2)            |
 
 To explore the list of best performing text embeddings models, visit the
 [Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

diff --git a/backends/candle/src/lib.rs b/backends/candle/src/lib.rs
@@ -12,7 +12,7 @@ use crate::compute_cap::{
 };
 use crate::models::{
     BertConfig, BertModel, DistilBertConfig, DistilBertModel, GTEConfig, GTEModel, JinaBertModel,
-    JinaCodeBertModel, MistralConfig, Model, NomicBertModel, NomicConfig, Qwen2Config,
+    JinaCodeBertModel, MPNetConfig, MPNetModel, MistralConfig, Model, NomicBertModel, NomicConfig, Qwen2Config,
 };
 #[cfg(feature = "cuda")]
 use crate::models::{
@@ -60,6 +60,8 @@ enum Config {
     #[serde(rename = "new")]
     Gte(GTEConfig),
     Qwen2(Qwen2Config),
+    #[serde(rename = "mpnet")]
+    MPNet(MPNetConfig),
 }
 
 pub struct CandleBackend {
@@ -226,6 +228,10 @@ impl CandleBackend {
                 "Qwen2 is only supported on Cuda devices in fp16 with flash attention enabled"
                     .to_string(),
             )),
+            (Config::MPNet(config), Device::Cpu | Device::Metal(_)) => {
+                tracing::info!("Starting MPNet model on {:?}", device);
+                Ok(Box::new(MPNetModel::load(vb, &config, model_type).s()?))
+            }
             #[cfg(feature = "cuda")]
             (Config::Bert(config), Device::Cuda(_)) => {
                 if cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
@@ -368,6 +374,24 @@ impl CandleBackend {
                     FlashQwen2Model::load(vb, &config, model_type).s()?,
                 ))
             }
+            #[cfg(feature = "cuda")]
+            (Config::MPNet(config), Device::Cuda(_)) => {
+                if cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
+                    && dtype == DType::F16
+                    // Allow disabling because of flash attention v1 precision problems
+                    // See: https://github.com/huggingface/text-embeddings-inference/issues/37
+                    && &std::env::var("USE_FLASH_ATTENTION").unwrap_or("True".to_string()).to_lowercase() == "true"
+                {
+                    // TODO: flash attention does not support an (additive) attention bias yet, so MPNet cannot use it.
+                    // See: https://github.com/Dao-AILab/flash-attention/issues/342
+                    return Err(BackendError::Start(
+                        "MPNet is only supported on Cuda devices in fp32.".to_string(),
+                    ));
+                } else {
+                    tracing::info!("Starting MPNet model on {:?}", device);
+                    Ok(Box::new(MPNetModel::load(vb, &config, model_type).s()?))
+                }
+            }
         };
 
         Ok(Self {

diff --git a/backends/candle/src/models/mod.rs b/backends/candle/src/models/mod.rs
@@ -34,6 +34,7 @@ mod flash_mistral;
 #[cfg(feature = "cuda")]
 mod flash_qwen2;
 mod gte;
+mod mpnet;
 mod qwen2;
 
 pub use bert::{BertConfig, BertModel, PositionEmbeddingType};
@@ -44,6 +45,7 @@ pub use gte::{GTEClassificationHead, GTEConfig, GTEModel, GTEMLP};
 pub use jina::JinaBertModel;
 pub use jina_code::JinaCodeBertModel;
 pub use mistral::MistralConfig;
+pub use mpnet::{MPNetConfig, MPNetModel};
 pub use nomic::{NomicBertModel, NomicConfig};
 pub use qwen2::Qwen2Config;
 use text_embeddings_backend_core::Batch;