diff --git a/.github/workflows/sagemaker_build_documentation.yml b/.github/workflows/sagemaker_build_documentation.yml index 2911887ee4..27adac2209 100644 --- a/.github/workflows/sagemaker_build_documentation.yml +++ b/.github/workflows/sagemaker_build_documentation.yml @@ -1,11 +1,13 @@ -name: Build sagemaker documentation +name: Build SageMaker Documentation on: push: - paths: - - "docs/sagemaker/**" branches: - main + - doc-builder* + paths: + - docs/sagemaker/** + - .github/workflows/sagemaker_build_documentation.yaml jobs: build: @@ -14,7 +16,9 @@ jobs: commit_sha: ${{ github.sha }} package: hub-docs package_name: sagemaker - path_to_docs: hub-docs/docs/sagemaker/ + path_to_docs: hub-docs/docs/sagemaker/source additional_args: --not_python_module + pre_command: cd hub-docs/docs/sagemaker && make docs secrets: + token: ${{ secrets.HUGGINGFACE_PUSH }} hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/sagemaker_build_pr_documentation.yml b/.github/workflows/sagemaker_build_pr_documentation.yml index f7b289b651..2f4d11cb34 100644 --- a/.github/workflows/sagemaker_build_pr_documentation.yml +++ b/.github/workflows/sagemaker_build_pr_documentation.yml @@ -1,9 +1,10 @@ -name: Build sagemaker PR Documentation +name: Build SageMaker PR Documentation on: pull_request: paths: - - "docs/sagemaker/**" + - docs/sagemaker/** + - .github/workflows/sagemaker_build_pr_documentation.yaml concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,5 +18,6 @@ jobs: pr_number: ${{ github.event.number }} package: hub-docs package_name: sagemaker - path_to_docs: hub-docs/docs/sagemaker/ + path_to_docs: hub-docs/docs/sagemaker/source additional_args: --not_python_module + pre_command: cd hub-docs/docs/sagemaker && make docs diff --git a/.github/workflows/sagemaker_delete_doc_comment.yml b/.github/workflows/sagemaker_delete_doc_comment.yml index 2eca708a27..57872ac6ff 100644 --- a/.github/workflows/sagemaker_delete_doc_comment.yml +++ b/.github/workflows/sagemaker_delete_doc_comment.yml @@ -1,10 +1,9 @@ -name: Delete sagemaker doc comment trigger +name: Delete SageMaker PR Documentation Comment on: pull_request: types: [ closed ] - jobs: delete: uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main diff --git a/.github/workflows/sagemaker_upload_pr_documentation.yml b/.github/workflows/sagemaker_upload_pr_documentation.yml index 3d8a3717c1..cfc441c2a1 100644 --- a/.github/workflows/sagemaker_upload_pr_documentation.yml +++ b/.github/workflows/sagemaker_upload_pr_documentation.yml @@ -1,8 +1,8 @@ -name: Upload sagemaker PR Documentation +name: Upload SageMaker PR Documentation on: workflow_run: - workflows: ["Build sagemaker PR Documentation"] + workflows: ["Build SageMaker PR Documentation"] types: - completed @@ -13,4 +13,4 @@ jobs: package_name: sagemaker secrets: hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} - comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} diff --git a/docs/sagemaker/Makefile b/docs/sagemaker/Makefile new file mode 100644 index 0000000000..ea17c64619 --- /dev/null +++ b/docs/sagemaker/Makefile @@ -0,0 +1,34 @@ +.PHONY: docs clean help + +docs: clean + @echo "Processing README.md files from examples/gke, examples/cloud-run, and examples/vertex-ai..." + @mkdir -p source/examples + @echo "Converting Jupyter Notebooks to MDX..." 
+ @doc-builder notebook-to-mdx notebooks/sagemaker-sdk/ + @echo "Auto-generating example files for documentation..." + @python scripts/auto-generate-examples.py + @echo "Cleaning up generated Markdown Notebook files..." + @find notebooks/sagemaker-sdk -name "sagemaker-notebook.md" -type f -delete + @echo "Generating YAML tree structure and appending to _toctree.yml..." + @python scripts/auto-update-toctree.py + @echo "YAML tree structure appended to docs/source/_toctree.yml" + @echo "Documentation setup complete." + +clean: + @echo "Cleaning up generated documentation..." + @rm -rf source/examples + @awk '/^# GENERATED CONTENT DO NOT EDIT!/,/^# END GENERATED CONTENT/{next} {print}' source/_toctree.yml > source/_toctree.yml.tmp && mv source/_toctree.yml.tmp source/_toctree.yml + @echo "Cleaning up generated Markdown Notebook files (if any)..." + @find notebooks/sagemaker-sdk -name "sagemaker-notebook.md" -type f -delete + @echo "Cleanup complete." + +serve: + @echo "Serving documentation via doc-builder" + doc-builder preview sagemaker source/ --not_python_module + +help: + @echo "Usage:" + @echo " make docs - Auto-generate the examples for the docs" + @echo " make clean - Remove the auto-generated docs" + @echo " make help - Display this help message" + diff --git a/docs/sagemaker/_toctree.yml b/docs/sagemaker/_toctree.yml deleted file mode 100644 index d5464737d4..0000000000 --- a/docs/sagemaker/_toctree.yml +++ /dev/null @@ -1,10 +0,0 @@ -- local: index - title: Hugging Face on Amazon SageMaker -- local: getting-started - title: Get started -- local: train - title: Run training on Amazon SageMaker -- local: inference - title: Deploy models to Amazon SageMaker -- local: reference - title: Reference \ No newline at end of file diff --git a/docs/sagemaker/index.md b/docs/sagemaker/index.md deleted file mode 100644 index b887aee9ce..0000000000 --- a/docs/sagemaker/index.md +++ /dev/null @@ -1,84 +0,0 @@ -# Hugging Face on Amazon SageMaker - -![cover](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sagemaker/cover.png) - -## Deep Learning Containers - -Deep Learning Containers (DLCs) are Docker images pre-installed with deep learning frameworks and libraries such as 🤗 Transformers, 🤗 Datasets, and 🤗 Tokenizers. The DLCs allow you to start training models immediately, skipping the complicated process of building and optimizing your training environments from scratch. Our DLCs are thoroughly tested and optimized for deep learning environments, requiring no configuration or maintenance on your part. In particular, the Hugging Face Inference DLC comes with a pre-written serving stack which drastically lowers the technical bar of deep learning serving. - -Our DLCs are available everywhere [Amazon SageMaker](https://aws.amazon.com/sagemaker/) is [available](https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/). While it is possible to use the DLCs without the SageMaker Python SDK, there are many advantages to using SageMaker to train your model: - -- Cost-effective: Training instances are only live for the duration of your job. Once your job is complete, the training cluster stops, and you won't be billed anymore. SageMaker also supports [Spot instances](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html), which can reduce costs up to 90%. 
-- Built-in automation: SageMaker automatically stores training metadata and logs in a serverless managed metastore and fully manages I/O operations with S3 for your datasets, checkpoints, and model artifacts. -- Multiple security mechanisms: SageMaker offers [encryption at rest](https://docs.aws.amazon.com/sagemaker/latest/dg/encryption-at-rest-nbi.html), [in transit](https://docs.aws.amazon.com/sagemaker/latest/dg/encryption-in-transit.html), [Virtual Private Cloud](https://docs.aws.amazon.com/sagemaker/latest/dg/interface-vpc-endpoint.html) connectivity, and [Identity and Access Management](https://docs.aws.amazon.com/sagemaker/latest/dg/security_iam_service-with-iam.html) to secure your data and code. - -Hugging Face DLCs are open source and licensed under Apache 2.0. Feel free to reach out on our [community forum](https://discuss.huggingface.co/c/sagemaker/17) if you have any questions. For premium support, our [Expert Acceleration Program](https://huggingface.co/support) gives you direct dedicated support from our team. - -## Features & benefits 🔥 - -Hugging Face DLCs make it easier than ever to train Transformer models in SageMaker. Here is why you should consider using Hugging Face DLCs to train and deploy your next machine learning models: - -**One command is all you need** - -With the new Hugging Face DLCs, train cutting-edge Transformers-based NLP models in a single line of code. Choose from multiple DLC variants, each one optimized for TensorFlow and PyTorch, single-GPU, single-node multi-GPU, and multi-node clusters. - -**Accelerate machine learning from science to production** - -In addition to Hugging Face DLCs, we created a first-class Hugging Face extension for the SageMaker Python SDK to accelerate data science teams, reducing the time required to set up and run experiments from days to minutes. - -You can use the Hugging Face DLCs with SageMaker's automatic model tuning to optimize your training hyperparameters and increase the accuracy of your models. - -Deploy your trained models for inference with just one more line of code or select any of the 10,000+ publicly available models from the [model Hub](https://huggingface.co/models) and deploy them with SageMaker. - -Easily track and compare your experiments and training artifacts in SageMaker Studio's web-based integrated development environment (IDE). - -**Built-in performance** - -Hugging Face DLCs feature built-in performance optimizations for PyTorch and TensorFlow to train NLP models faster. The DLCs also give you the flexibility to choose a training infrastructure that best aligns with the price/performance ratio for your workload. - -The Hugging Face Training DLCs are fully integrated with SageMaker distributed training libraries to train models faster than ever, using the latest generation of instances available on Amazon Elastic Compute Cloud. - -Hugging Face Inference DLCs provide you with production-ready endpoints that scale quickly with your AWS environment, built-in monitoring, and a ton of enterprise features. - ---- - -## Resources, Documentation & Samples 📄 - -Take a look at our published blog posts, videos, documentation, sample notebooks and scripts for additional help and more context about Hugging Face DLCs on SageMaker. 
- -### Blogs and videos - -- [AWS: Embracing natural language processing with Hugging Face](https://aws.amazon.com/de/blogs/opensource/embracing-natural-language-processing-with-hugging-face/) -- [Deploy Hugging Face models easily with Amazon SageMaker](https://huggingface.co/blog/deploy-hugging-face-models-easily-with-amazon-sagemaker) -- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing models](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/) -- [Walkthrough: End-to-End Text Classification](https://youtu.be/ok3hetb42gU) -- [Working with Hugging Face models on Amazon SageMaker](https://youtu.be/leyrCgLAGjMn) -- [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq) -- [Deploy a Hugging Face Transformers Model from S3 to Amazon SageMaker](https://youtu.be/pfBGgSGnYLs) -- [Deploy a Hugging Face Transformers Model from the Model Hub to Amazon SageMaker](https://youtu.be/l9QZuazbzWM) - -### Documentation - -- [Run training on Amazon SageMaker](/docs/sagemaker/train) -- [Deploy models to Amazon SageMaker](/docs/sagemaker/inference) -- [Reference](/docs/sagemaker/reference) -- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html) -- [Python SDK SageMaker documentation for Hugging Face](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html) -- [Deep Learning Container](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) -- [SageMaker's Distributed Data Parallel Library](https://docs.aws.amazon.com/sagemaker/latest/dg/data-parallel.html) -- [SageMaker's Distributed Model Parallel Library](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html) - -### Sample notebooks - -- [All notebooks](https://github.com/huggingface/notebooks/tree/master/sagemaker) -- [Getting Started with Pytorch](https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb) -- [Getting Started with Tensorflow](https://github.com/huggingface/notebooks/blob/main/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb) -- [Distributed Training Data Parallelism](https://github.com/huggingface/notebooks/blob/main/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb) -- [Distributed Training Model Parallelism](https://github.com/huggingface/notebooks/blob/main/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb) -- [Spot Instances and continue training](https://github.com/huggingface/notebooks/blob/main/sagemaker/05_spot_instances/sagemaker-notebook.ipynb) -- [SageMaker Metrics](https://github.com/huggingface/notebooks/blob/main/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb) -- [Distributed Training Data Parallelism Tensorflow](https://github.com/huggingface/notebooks/blob/main/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) -- [Distributed Training Summarization](https://github.com/huggingface/notebooks/blob/main/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) -- [Image Classification with Vision 
Transformer](https://github.com/huggingface/notebooks/blob/main/sagemaker/09_image_classification_vision_transformer/sagemaker-notebook.ipynb) -- [Deploy one of the 10 000+ Hugging Face Transformers to Amazon SageMaker for Inference](https://github.com/huggingface/notebooks/blob/main/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb) -- [Deploy a Hugging Face Transformer model from S3 to SageMaker for inference](https://github.com/huggingface/notebooks/blob/main/sagemaker/10_deploy_model_from_s3/deploy_transformer_model_from_s3.ipynb) diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/deploy-embedding-models/assets/cw.png b/docs/sagemaker/notebooks/sagemaker-sdk/deploy-embedding-models/assets/cw.png new file mode 100644 index 0000000000..4762d9906d Binary files /dev/null and b/docs/sagemaker/notebooks/sagemaker-sdk/deploy-embedding-models/assets/cw.png differ diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/deploy-embedding-models/sagemaker-notebook.ipynb b/docs/sagemaker/notebooks/sagemaker-sdk/deploy-embedding-models/sagemaker-notebook.ipynb new file mode 100644 index 0000000000..8d6ff6f0d4 --- /dev/null +++ b/docs/sagemaker/notebooks/sagemaker-sdk/deploy-embedding-models/sagemaker-notebook.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to deploy Embedding Models to Amazon SageMaker using new Hugging Face Embedding DLC\n", + "\n", + "This is an example on how to deploy the open Embedding Models, like [Snowflake/snowflake-arctic-embed-l](https://huggingface.co/Snowflake/snowflake-arctic-embed-l), [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) or [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) to Amazon SageMaker for inference using the new Hugging Face Embedding Inference Container. We will deploy the [Snowflake/snowflake-arctic-embed-m](https://huggingface.co/Snowflake/snowflake-arctic-embed-m) one of the best open Embedding Models for retrieval and ranking on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard). \n", + "\n", + "The example covers:\n", + "1. [Setup development environment](#1-setup-development-environment)\n", + "2. [Retrieve the new Hugging Face Embedding Container](#2-retrieve-the-new-hugging-face-embedding-container)\n", + "3. [Deploy Snowflake Arctic to Amazon SageMaker](#3-deploy-snowflake-arctic-to-amazon-sagemaker)\n", + "4. [Run and evaluate Inference performance](#4-run-and-evaluate-inference-performance)\n", + "5. [Delete model and endpoint](#5-delete-model-and-endpoint)\n", + "\n", + "## What is Hugging Face Embedding DLC?\n", + "\n", + "The Hugging Face Embedding DLC is a new purpose-built Inference Container to easily deploy Embedding Models in a secure and managed environment. The DLC is powered by [Text Embedding Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) a blazing fast and memory efficient solution for deploying and serving Embedding Models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5. 
TEI implements many features such as:\n", + "\n", + "* No model graph compilation step\n", + "* Small docker images and fast boot times\n", + "* Token based dynamic batching\n", + "* Optimized transformers code for inference using Flash Attention, Candle and cuBLASLt\n", + "* Safetensors weight loading\n", + "* Production ready (distributed tracing with Open Telemetry, Prometheus metrics)\n", + "\n", + "TEI supports the following model architectures\n", + "* BERT/CamemBERT, e.g. [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) or [Snowflake/snowflake-arctic-embed-m](https://huggingface.co/Snowflake/snowflake-arctic-embed-m)\n", + "* RoBERTa, [sentence-transformers/all-roberta-large-v1](https://huggingface.co/sentence-transformers/all-roberta-large-v1) \n", + "* XLM-RoBERTa, e.g. [sentence-transformers/paraphrase-xlm-r-multilingual-v1](https://huggingface.co/sentence-transformers/paraphrase-xlm-r-multilingual-v1)\n", + "* NomicBert, e.g. [jinaai/jina-embeddings-v2-base-en](https://huggingface.co/jinaai/jina-embeddings-v2-base-en)\n", + "* JinaBert, e.g. [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)\n", + "\n", + "Lets get started!\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Setup development environment\n", + "\n", + "We are going to use the `sagemaker` python SDK to deploy Snowflake Arctic to Amazon SageMaker. We need to make sure to have an AWS account configured and the `sagemaker` python SDK installed. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"sagemaker>=2.221.1\" --upgrade --quiet\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are going to use Sagemaker in a local environment. You need access to an IAM Role with the required permissions for Sagemaker. You can find [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) more about it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import boto3\n", + "sess = sagemaker.Session()\n", + "# sagemaker session bucket -> used for uploading data, models and logs\n", + "# sagemaker will automatically create this bucket if it not exists\n", + "sagemaker_session_bucket=None\n", + "if sagemaker_session_bucket is None and sess is not None:\n", + " # set to default bucket if a bucket name is not given\n", + " sagemaker_session_bucket = sess.default_bucket()\n", + "\n", + "try:\n", + " role = sagemaker.get_execution_role()\n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n", + "\n", + "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", + "\n", + "print(f\"sagemaker role arn: {role}\")\n", + "print(f\"sagemaker session region: {sess.boto_region_name}\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Retrieve the new Hugging Face Embedding Container\n", + "\n", + "Compared to deploying regular Hugging Face models we first need to retrieve the container uri and provide it to our `HuggingFaceModel` model class with a `image_uri` pointing to the image. 
To retrieve the new Hugging Face Embedding Container in Amazon SageMaker, we can use the `get_huggingface_llm_image_uri` method provided by the `sagemaker` SDK. This method allows us to retrieve the URI for the desired Hugging Face Embedding Container. Important to note is that TEI has 2 different versions for cpu and gpu, so we create a helper function to retrieve the correct image uri based on the instance type. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.huggingface import get_huggingface_llm_image_uri\n", + "\n", + "# retrieve the image uri based on instance type\n", + "def get_image_uri(instance_type):\n", + " key = \"huggingface-tei\" if instance_type.startswith(\"ml.g\") or instance_type.startswith(\"ml.p\") else \"huggingface-tei-cpu\"\n", + " return get_huggingface_llm_image_uri(key, version=\"1.4.0\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Deploy Snowflake Arctic to Amazon SageMaker\n", + "\n", + "To deploy [Snowflake/snowflake-arctic-embed-m](https://huggingface.co/Snowflake/snowflake-arctic-embed-m) to Amazon SageMaker we create a `HuggingFaceModel` model class and define our endpoint configuration including the `HF_MODEL_ID`, `instance_type` etc. We will use a `c6i.2xlarge` instance type, which has 4 Intel Ice-Lake vCPUs, 8GB of memory and costs around $0.204 per hour. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# sagemaker config\n", + "instance_type = \"ml.g5.xlarge\"\n", + "\n", + "# Define Model and Endpoint configuration parameter\n", + "config = {\n", + " 'HF_MODEL_ID': \"Snowflake/snowflake-arctic-embed-m\", # model_id from hf.co/models\n", + "}\n", + "\n", + "# create HuggingFaceModel with the image uri\n", + "emb_model = HuggingFaceModel(\n", + " role=role,\n", + " image_uri=get_image_uri(instance_type),\n", + " env=config\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the `deploy` method. We will deploy the model with the `ml.c6i.2xlarge` instance type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Deploy model to an endpoint\n", + "# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy\n", + "emb = emb_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_type,\n", + ")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SageMaker will now create our endpoint and deploy the model to it. This can takes ~5 minutes. " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Run and evaluate Inference performance\n", + "\n", + "After our endpoint is deployed we can run inference on it. 
We will use the `predict` method from the `predictor` to run inference on our endpoint.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " \"inputs\": \"the mesmerizing performances of the leads keep the film grounded and keep the audience riveted .\",\n", + "}\n", + " \n", + "res = emb.predict(data=data)\n", + " \n", + " \n", + "# print some results\n", + "print(f\"length of embeddings: {len(res[0])}\")\n", + "print(f\"first 10 elements of embeddings: {res[0][:10]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome we can now generate embeddings with our model, Lets test the performance of our model.\n", + "\n", + "We will send 3,900 requests to our endpoint use threading with 10 concurrent threads. We will measure the average latency and throughput of our endpoint. We are going to sent an input of 256 tokens to have a total of ~1 Million tokens. We decided to use 256 tokens as input length to find the balance between shorter and longer inputs.\n", + "\n", + "Note: When running the load test, the requests are sent from europe and the endpoint is deployed in us-east-1. This adds a network overhead to it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import threading\n", + "import time\n", + "number_of_threads = 10\n", + "number_of_requests = int(3900 // number_of_threads)\n", + "print(f\"number of threads: {number_of_threads}\")\n", + "print(f\"number of requests per thread: {number_of_requests}\")\n", + " \n", + "def send_rquests():\n", + " for _ in range(number_of_requests):\n", + " # input counted at https://huggingface.co/spaces/Xenova/the-tokenizer-playground for 100 tokens\n", + " emb.predict(data={\"inputs\": \"Hugging Face is a company and a popular platform in the field of natural language processing (NLP) and machine learning. They are known for their contributions to the development of state-of-the-art models for various NLP tasks and for providing a platform that facilitates the sharing and usage of pre-trained models. One of the key offerings from Hugging Face is the Transformers library, which is an open-source library for working with a variety of pre-trained transformer models, including those for text generation, translation, summarization, question answering, and more. The library is widely used in the research and development of NLP applications and is supported by a large and active community. Hugging Face also provides a model hub where users can discover, share, and download pre-trained models. Additionally, they offer tools and frameworks to make it easier for developers to integrate and use these models in their own projects. The company has played a significant role in advancing the field of NLP and making cutting-edge models more accessible to the broader community. Hugging Face also provides a model hub where users can discover, share, and download pre-trained models. 
Additionally, they offer tools and frameworks to make it easier for developers and ma\"})\n", + " \n", + "# Create multiple threads\n", + "threads = [threading.Thread(target=send_rquests) for _ in range(number_of_threads) ]\n", + "# start all threads\n", + "start = time.time()\n", + "[t.start() for t in threads]\n", + "# wait for all threads to finish\n", + "[t.join() for t in threads]\n", + "print(f\"total time: {round(time.time() - start)} seconds\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sending 3,900 requests or embedding 1 million tokens took around 841 seconds. This means we can run around ~5 requests per second. But keep in mind that includes the network latency from europe to us-east-1. When we inspect the latency of the endpoint through cloudwatch we can see that latency for our Embeddings model is 2s at 10 concurrent requests. This is very impressive for a small & old CPU instance, which cost ~150$ per month. You can deploy the model to a GPU instance to get faster inference times.\n", + "\n", + "_Note: We ran the same test on a `ml.g5.xlarge` with 1x NVIDIA A10G GPU. Embedding 1 million tokens took around 30 seconds. This means we can run around ~130 requests per second. The latency for the endpoint is 4ms at 10 concurrent requests. The `ml.g5.xlarge` costs around $1.408 per hour on Amazon SageMaker._\n", + "\n", + "GPU instance are much faster than CPU instances, but they are also more expensive. If you want to bulk process embeddings, you can use a GPU instance. If you want to run a small endpoint with low costs, you can use a CPU instance. We plan to work on a dedicated benchmark for the Hugging Face Embedding DLC in the future." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"https://console.aws.amazon.com/cloudwatch/home?region={sess.boto_region_name}#metricsV2:graph=~(metrics~(~(~'AWS*2fSageMaker~'ModelLatency~'EndpointName~'{emb.endpoint_name}~'VariantName~'AllTraffic))~view~'timeSeries~stacked~false~region~'{sess.boto_region_name}~start~'-PT5M~end~'P0D~stat~'Average~period~30);query=~'*7bAWS*2fSageMaker*2cEndpointName*2cVariantName*7d*20{emb.endpoint_name}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![cw](./assets/cw.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Delete model and endpoint\n", + "\n", + "To clean up, we can delete the model and endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "emb.delete_model()\n", + "emb.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hf", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5fcf248a74081676ead7e77f54b2c239ba2921b952f7cbcdbbe5427323165924" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/deploy-llama-3-3-70b-inferentia2/sagemaker-notebook.ipynb b/docs/sagemaker/notebooks/sagemaker-sdk/deploy-llama-3-3-70b-inferentia2/sagemaker-notebook.ipynb new file mode 100644 index 0000000000..2f28d0b995 --- /dev/null +++ b/docs/sagemaker/notebooks/sagemaker-sdk/deploy-llama-3-3-70b-inferentia2/sagemaker-notebook.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deploy Llama 3.3 70B on AWS Inferentia2\n", + "\n", + "In this tutorial you will learn how to deploy [/meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model on AWS Inferentia2 with Hugging Face Optimum on Amazon SageMaker. We are going to use the Hugging Face TGI Neuron Container, a purpose-built Inference Container to easily deploy LLMs on AWS Inferentia2 powered by[ Text Generation Inference](https://huggingface.co/docs/text-generation-inference/index) and [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index).\n", + "\n", + "\n", + "We will cover how to:\n", + "1. [Setup development environment](#1-setup-development-environment)\n", + "2. [Retrieve the new Hugging Face TGI Neuron DLC](#2-retrieve-the-new-hugging-face-tgi-neuron-dlc)\n", + "3. [Deploy Llama 3.3 70B to inferentia2](#3-deploy-llama-33-70b-to-inferentia2)\n", + "4. [Clean up](#4-clean-up)\n", + "\n", + "Lets get started! 🚀\n", + "\n", + "[AWS inferentia (Inf2)](https://aws.amazon.com/ec2/instance-types/inf2/) are purpose-built EC2 for deep learning (DL) inference workloads. Here are the different instances of the Inferentia2 family.\n", + "\n", + "| instance size | accelerators | Neuron Cores | accelerator memory | vCPU | CPU Memory | on-demand price ($/h) |\n", + "| ------------- | ------------ | ------------ | ------------------ | ---- | ---------- | --------------------- |\n", + "| inf2.xlarge | 1 | 2 | 32 | 4 | 16 | 0.76 |\n", + "| inf2.8xlarge | 1 | 2 | 32 | 32 | 128 | 1.97 |\n", + "| inf2.24xlarge | 6 | 12 | 192 | 96 | 384 | 6.49 |\n", + "| inf2.48xlarge | 12 | 24 | 384 | 192 | 768 | 12.98 |\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Setup development environment\n", + "\n", + "For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) and the `sagemaker` python SDK to deploy Llama 3.3 70B to a SageMaker inference endpoint.\n", + "\n", + "Make sur you have the latest version of the SageMaker SDK installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install sagemaker --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, instantiate the sagemaker role and session." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "import sagemaker\n", + "import boto3\n", + "sess = sagemaker.Session()\n", + "# sagemaker session bucket -> used for uploading data, models and logs\n", + "# sagemaker will automatically create this bucket if it not exists\n", + "sagemaker_session_bucket=None\n", + "if sagemaker_session_bucket is None and sess is not None:\n", + " # set to default bucket if a bucket name is not given\n", + " sagemaker_session_bucket = sess.default_bucket()\n", + "\n", + "try:\n", + " role = sagemaker.get_execution_role()\n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n", + "\n", + "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", + "\n", + "print(f\"sagemaker role arn: {role}\")\n", + "print(f\"sagemaker session region: {sess.boto_region_name}\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Retrieve the latest Hugging Face TGI Neuron DLC\n", + "\n", + "The latest Hugging Face TGI Neuron DLCs can be used to run inference on AWS Inferentia2. You can use the `get_huggingface_llm_image_uri` method of the `sagemaker` SDK to retrieve the appropriate Hugging Face TGI Neuron DLC URI based on your desired `backend`, `session`, `region`, and `version`. You can find the latest version of the container [here](https://huggingface.co/docs/optimum-neuron/containers), if not yet added to the SageMaker SDK.\n", + "\n", + "At the time of the tutorial, the latest version of the container is not yet added to the Sagemaker SDK so we will not use `get_huggingface_llm_image_uri`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "# pulled from https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json\n", + "account_id_dict= {\n", + " \"ap-northeast-1\": \"763104351884\",\n", + " \"ap-south-1\": \"763104351884\",\n", + " \"ap-south-2\": \"772153158452\",\n", + " \"ap-southeast-1\": \"763104351884\",\n", + " \"ap-southeast-2\": \"763104351884\",\n", + " \"ap-southeast-4\": \"457447274322\",\n", + " \"ap-southeast-5\": \"550225433462\",\n", + " \"ap-southeast-7\": \"590183813437\",\n", + " \"cn-north-1\": \"727897471807\",\n", + " \"cn-northwest-1\": \"727897471807\",\n", + " \"eu-central-1\": \"763104351884\",\n", + " \"eu-central-2\": \"380420809688\",\n", + " \"eu-south-2\": \"503227376785\",\n", + " \"eu-west-1\": \"763104351884\",\n", + " \"eu-west-3\": \"763104351884\",\n", + " \"il-central-1\": \"780543022126\",\n", + " \"mx-central-1\":\"637423239942\",\n", + " \"sa-east-1\": \"763104351884\",\n", + " \"us-east-1\": \"763104351884\",\n", + " \"us-east-2\": \"763104351884\",\n", + " \"us-gov-east-1\": \"446045086412\",\n", + " \"us-gov-west-1\": \"442386744353\",\n", + " \"us-west-2\": \"763104351884\",\n", + " \"ca-west-1\": \"204538143572\"\n", + "}\n", + "\n", + "region = boto3.Session().region_name\n", + "llm_image = f\"{account_id_dict[region]}.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.2-optimum0.0.28-neuronx-py310-ubuntu22.04\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Deploy Llama 3.3 70B to Inferentia2\n", + "\n", + "At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes), which means that we need to specify our sequence length and batch size ahead of time.\n", + "To make it easier for customers to utilize the full power of Inferentia2, we created a [neuron model cache](https://huggingface.co/docs/optimum-neuron/guides/cache_system), which contains pre-compiled configurations for the most popular LLMs, including Llama 3.3 70B. \n", + "\n", + "This means we don't need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find compiled/cached configurations on the [Hugging Face Hub](https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config). If your desired configuration is not yet cached, you can compile it yourself using the [Optimum CLI](https://huggingface.co/docs/optimum-neuron/guides/export_model) or open a request at the [Cache repository](https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions).\n", + "\n", + "**Deploying Llama 3.3 70B to a SageMaker Endpoint** \n", + "\n", + "Before deploying the model to Amazon SageMaker, we must define the TGI Neuron endpoint configuration. 
We need to make sure the following additional parameters are defined: \n", + "\n", + "- `HF_NUM_CORES`: Number of Neuron Cores used for the compilation.\n", + "- `HF_BATCH_SIZE`: The batch size that was used to compile the model.\n", + "- `HF_SEQUENCE_LENGTH`: The sequence length that was used to compile the model.\n", + "- `HF_AUTO_CAST_TYPE`: The auto cast type that was used to compile the model.\n", + "\n", + "We still need to define traditional TGI parameters with:\n", + "\n", + "- `HF_MODEL_ID`: The Hugging Face model ID.\n", + "- `HF_TOKEN`: The Hugging Face API token to access gated models.\n", + "- `MAX_BATCH_SIZE`: The maximum batch size that the model can handle, equal to the batch size used for compilation.\n", + "- `MAX_INPUT_TOKEN`: The maximum input length that the model can handle. \n", + "- `MAX_TOTAL_TOKENS`: The maximum total tokens the model can generate, equal to the sequence length used for compilation.\n", + "\n", + "Optionnaly, you can configure the endpoint to support chat templates:\n", + "- `MESSAGES_API_ENABLED`: Enable Messages API \n", + "\n", + "**Select the right instance type**\n", + "\n", + "Llama 3.3 70B is a large model and requires a lot of memory. We are going to use the `inf2.48xlarge` instance type, which has 192 vCPUs and 384 GB of accelerator memory. The `inf2.48xlarge` instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. If you want to find the cached configurations for Llama 3.3 70B, you can find them [here](https://huggingface.co/aws-neuron/optimum-neuron-cache/blob/main/inference-cache-config/llama3-70b.json#L16). In our case we will use a batch size of 4 and a sequence length of 4096. \n", + "\n", + "\n", + "Before we can deploy Llama 3.3 70B to Inferentia2, we need to make sure we have the necessary permissions to access the model. You can request access to the model [here](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) and create a User access token following this [guide](https://huggingface.co/docs/hub/en/security-tokens).\n", + "\n", + "\n", + "After that we can create our endpoint configuration and deploy the model to Amazon SageMaker. We will deploy the endpoint with the Messages API enabled, so that it is fully compatible with the OpenAI Chat Completion API." 
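As an aside (not part of the original notebook), it can help to verify that the batch size and sequence length you plan to use are actually covered by the public neuron cache before launching the endpoint. The repository and file path below are the ones linked earlier in this notebook, but the exact layout of the cache-config JSON is an assumption, so treat this as an illustrative sketch only.

```python
# Illustrative sketch: look up pre-compiled Llama 3.3 70B configurations in the
# public neuron cache. The JSON schema (keys such as "batch_size" and
# "sequence_length") is assumed here; inspect the file and adjust if it differs.
import json
from huggingface_hub import hf_hub_download

config_path = hf_hub_download(
    repo_id="aws-neuron/optimum-neuron-cache",
    filename="inference-cache-config/llama3-70b.json",
)
with open(config_path) as f:
    cached = json.load(f)

# The file may be a plain list of configurations or a dict keyed by model id.
entries = cached if isinstance(cached, list) else cached.get(
    "meta-llama/Llama-3.3-70B-Instruct", []
)
for entry in entries:
    if entry.get("batch_size") == 4 and entry.get("sequence_length") == 4096:
        print("Found a matching cached configuration:", entry)
```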
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# sagemaker config\n", + "instance_type = \"ml.inf2.48xlarge\"\n", + "health_check_timeout=3600 # additional time to load the model\n", + "volume_size=512 # size in GB of the EBS volume\n", + "\n", + "# Define Model and Endpoint configuration parameter\n", + "config = {\n", + " \"HF_MODEL_ID\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n", + " \"HF_NUM_CORES\": \"24\", # number of neuron cores\n", + " \"HF_AUTO_CAST_TYPE\": \"bf16\", # dtype of the model\n", + " \"MAX_BATCH_SIZE\": \"4\", # max batch size for the model\n", + " \"MAX_INPUT_TOKENS\": \"4000\", # max length of input text\n", + " \"MAX_TOTAL_TOKENS\": \"4096\", # max length of generated text\n", + " \"MESSAGES_API_ENABLED\": \"true\", # Enable the messages API\n", + " \"HF_TOKEN\": \"\",\n", + "}\n", + "\n", + "assert config[\"HF_TOKEN\"] != \"\", \"Please replace '' with your Hugging Face Hub API token\"\n", + "\n", + "\n", + "# create HuggingFaceModel with the image uri\n", + "llm_model = HuggingFaceModel(\n", + " role=role,\n", + " image_uri=llm_image,\n", + " env=config\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the `deploy` method. We will deploy the model with the `ml.inf2.48xlarge` instance type. TGI will automatically distribute and shard the model across all Inferentia devices." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "# deactivate warning since model is compiled\n", + "llm_model._is_compiled_model = True\n", + "\n", + "llm = llm_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_type,\n", + " container_startup_health_check_timeout=health_check_timeout,\n", + " volume_size=volume_size\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SageMaker will now create our endpoint and deploy the model to it. It takes around 30 minutes for deployment." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After our endpoint is deployed we can run inference on it. We will use the `predict` method from the `predictor` to run inference on our endpoint. \n", + "\n", + "The endpoint supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. The Messages API allows us to interact with the model in a conversational way. We can define the role of the message and the content. The role can be either `system`,`assistant` or `user`. The `system` role is used to provide context to the model and the `user` role is used to ask questions or provide input to the model.\n", + "\n", + "Parameters can be defined as in the `parameters` attribute of the payload. 
Check out the chat completion [documentation](https://platform.openai.com/docs/api-reference/chat/create) to find supported parameters.\n", + "\n", + "```json\n", + "{\n", + " \"messages\": [\n", + " { \"role\": \"system\", \"content\": \"You are a helpful assistant.\" },\n", + " { \"role\": \"user\", \"content\": \"What is deep learning?\" }\n", + " ]\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "# Prompt to generate\n", + "messages=[\n", + " { \"role\": \"system\", \"content\": \"You are a helpful assistant.\" },\n", + " { \"role\": \"user\", \"content\": \"What is deep learning in one sentence?\" }\n", + "]\n", + "\n", + "# Generation arguments https://platform.openai.com/docs/api-reference/chat/create\n", + "parameters = {\n", + " \"max_tokens\":100,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Okay lets test it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "chat = llm.predict({\"messages\" :messages, **parameters,\"steam\":True})\n", + "\n", + "print(chat[\"choices\"][0][\"message\"][\"content\"].strip())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Clean up\n", + "\n", + "To clean up, we can delete the model and endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm.delete_model()\n", + "llm.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hf", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5fcf248a74081676ead7e77f54b2c239ba2921b952f7cbcdbbe5427323165924" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/evaluate-llm-lighteval/sagemaker-notebook.ipynb b/docs/sagemaker/notebooks/sagemaker-sdk/evaluate-llm-lighteval/sagemaker-notebook.ipynb new file mode 100644 index 0000000000..07992715c7 --- /dev/null +++ b/docs/sagemaker/notebooks/sagemaker-sdk/evaluate-llm-lighteval/sagemaker-notebook.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate LLMs with Hugging Face Lighteval on Amazon SageMaker\n", + "\n", + "In this sagemaker example, we are going to learn how to evaluate LLMs using Hugging Face [lighteval](https://github.com/huggingface/lighteval/tree/main). LightEval is a lightweight LLM evaluation suite that powers [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). \n", + "\n", + "\n", + "Evaluating LLMs is crucial for understanding their capabilities and limitations, yet it poses significant challenges due to their complex and opaque nature. LightEval facilitates this evaluation process by enabling LLMs to be assessed on acamedic benchmarks like MMLU or IFEval, providing a structured approach to gauge their performance across diverse tasks.\n", + "\n", + "\n", + "In Detail you will learn how to:\n", + "1. Setup Development Environment\n", + "2. 
Prepare the evaluation configuration\n", + "3. Evaluate Zephyr 7B on TruthfulQA on Amazon SageMaker\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install sagemaker --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are going to use SageMaker in a local environment, you need access to an IAM role with the required permissions for SageMaker. You can find more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import boto3\n", + "sess = sagemaker.Session()\n", + "# sagemaker session bucket -> used for uploading data, models and logs\n", + "# sagemaker will automatically create this bucket if it does not exist\n", + "sagemaker_session_bucket=None\n", + "if sagemaker_session_bucket is None and sess is not None:\n", + " # set to default bucket if a bucket name is not given\n", + " sagemaker_session_bucket = sess.default_bucket()\n", + "\n", + "try:\n", + " role = sagemaker.get_execution_role()\n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n", + "\n", + "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", + "\n", + "print(f\"sagemaker role arn: {role}\")\n", + "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", + "print(f\"sagemaker session region: {sess.boto_region_name}\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Prepare the evaluation configuration\n", + "\n", + "[LightEval](https://github.com/huggingface/lighteval/tree/main) includes scripts to evaluate LLMs on common benchmarks like MMLU, TruthfulQA, IFEval, and more. It is used to evaluate models on the [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). lighteval is built on top of the [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) with some additional features and improvements. \n", + "\n", + "You can find all available benchmarks [here](https://github.com/huggingface/lighteval/blob/main/examples/tasks/all_tasks.txt). \n", + "\n", + "We are going to use Amazon SageMaker Managed Training to evaluate the model, leveraging the script available in [lighteval](https://github.com/huggingface/lighteval/blob/main/run_evals_accelerate.py). The Hugging Face DLC does not come with lighteval installed, so we need to provide a `requirements.txt` file to install the required dependencies.\n", + "\n", + "First, let's load the `run_evals_accelerate.py` script and create a `requirements.txt` file with the required dependencies."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "import requests as r\n", + "\n", + "lighteval_version = \"0.2.0\"\n", + "\n", + "# create scripts directory if it does not exist\n", + "os.makedirs(\"scripts\", exist_ok=True)\n", + "\n", + "# load custom scripts from git\n", + "raw_github_url = f\"https://raw.githubusercontent.com/huggingface/lighteval/v{lighteval_version}/run_evals_accelerate.py\"\n", + "res = r.get(raw_github_url)\n", + "with open(\"scripts/run_evals_accelerate.py\", \"w\") as f:\n", + " f.write(res.text)\n", + " \n", + "# write requirements.txt \n", + "with open(\"scripts/requirements.txt\", \"w\") as f:\n", + " f.write(f\"lighteval=={lighteval_version}\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In lighteval, the evaluation is done by running the `run_evals_accelerate.py` script. The script takes a `task` argument which is defined as `suite|task|num_few_shot|{0 or 1 to automatically reduce num_few_shot if prompt is too long}`. Alternatively, you can also provide a path to a txt file with the tasks you want to evaluate the model on, which is what we are going to do. This makes it easier for you to extend the evaluation to other benchmarks.\n", + "\n", + "We are going to evaluate the model on the TruthfulQA benchmark with 0 few-shot examples. [TruthfulQA](https://paperswithcode.com/dataset/truthfulqa) is a benchmark designed to measure whether a language model generates truthful answers to questions, encompassing 817 questions across 38 categories including health, law, finance, and politics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"scripts/tasks.txt\", \"w\") as f:\n", + " f.write(f\"lighteval|truthfulqa:mc|0|0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To evaluate a model on all the benchmarks of the Open LLM Leaderboard you can copy this [file](https://github.com/huggingface/lighteval/blob/v0.2.0/tasks_examples/open_llm_leaderboard_tasks.txt)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Evaluate Zephyr 7B on TruthfulQA on Amazon SageMaker\n", + "\n", + "In this example we are going to evaluate [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the TruthfulQA benchmark, which is part of the Open LLM Leaderboard. \n", + "\n", + "In addition to the `task` argument we need to define: \n", + "* `model_args`: Hugging Face Model ID or path, defined as `pretrained=HuggingFaceH4/zephyr-7b-beta`\n", + "* `model_dtype`: The model data type, defined as `bfloat16`, `float16` or `float32`\n", + "* `output_dir`: The directory where the evaluation results will be saved, e.g. `/opt/ml/model` \n", + "\n", + "Lighteval can also evaluate PEFT models or use `chat_templates`; you can find more about it [here](https://github.com/huggingface/lighteval/blob/v0.2.0/run_evals_accelerate.py). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFace\n", + "\n", + "# hyperparameters, which are passed into the training job\n", + "hyperparameters = {\n", + " 'model_args': \"pretrained=HuggingFaceH4/zephyr-7b-beta\", # Hugging Face Model ID\n", + " 'task': 'tasks.txt', # 'lighteval|truthfulqa:mc|0|0', \n", + " 'model_dtype': 'bfloat16', # Torch dtype to load model weights\n", + " 'output_dir': '/opt/ml/model' # Directory, which sagemaker uploads to s3 after training\n", + "}\n", + "\n", + "# create the Estimator\n", + "huggingface_estimator = HuggingFace(\n", + " entry_point = 'run_evals_accelerate.py', # train script\n", + " source_dir = 'scripts', # directory which includes all the files needed for training\n", + " instance_type = 'ml.g5.4xlarge', # instances type used for the training job\n", + " instance_count = 1, # the number of instances used for training\n", + " base_job_name = \"lighteval\", # the name of the training job\n", + " role = role, # Iam role used in training job to access AWS ressources, e.g. S3\n", + " volume_size = 300, # the size of the EBS volume in GB\n", + " transformers_version = '4.36', # the transformers version used in the training job\n", + " pytorch_version = '2.1', # the pytorch_version version used in the training job\n", + " py_version = 'py310', # the python version used in the training job\n", + " hyperparameters = hyperparameters,\n", + " environment = { \n", + " \"HUGGINGFACE_HUB_CACHE\": \"/tmp/.cache\",\n", + " # \"HF_TOKEN\": \"REPALCE_WITH_YOUR_TOKEN\" # needed for private models\n", + " }, # set env variable to cache models in /tmp\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now start our evaluation job, with the `.fit()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# starting the train job with our uploaded datasets as input\n", + "huggingface_estimator.fit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the evaluation job is finished, we can download the evaluation results from the S3 bucket. Lighteval will save the results and generations in the `output_dir`. The results are savedas json and include detailed information about each task and the model's performance. The results are available in the `results` key. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tarfile\n", + "import json\n", + "import io\n", + "import os\n", + "from sagemaker.s3 import S3Downloader\n", + "\n", + "\n", + "# download results from s3\n", + "results_tar = S3Downloader.read_bytes(huggingface_estimator.model_data)\n", + "model_id = hyperparameters[\"model_args\"].split(\"=\")[1]\n", + "result={}\n", + "\n", + "# Use tarfile to open the tar content directly from bytes\n", + "with tarfile.open(fileobj=io.BytesIO(results_tar), mode=\"r:gz\") as tar:\n", + " # Iterate over items in tar archive to find your json file by its path\n", + " for member in tar.getmembers():\n", + " # get path of results based on model id used to evaluate\n", + " if os.path.join(\"details\", model_id) in member.name and member.name.endswith('.json'):\n", + " # Extract the file content\n", + " f = tar.extractfile(member)\n", + " if f is not None:\n", + " content = f.read()\n", + " result = json.loads(content)\n", + " break\n", + " \n", + "# print results\n", + "print(result[\"results\"])\n", + "# {'lighteval|truthfulqa:mc|0': {'truthfulqa_mc1': 0.40636474908200737, 'truthfulqa_mc1_stderr': 0.017193835812093897, 'truthfulqa_mc2': 0.5747003398184238, 'truthfulqa_mc2_stderr': 0.015742356478301463}}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our test we achieved a `mc1` score of 40.6% and an `mc2` score of 57.47%. The `mc2` is the score used in the Open LLM Leaderboard. Zephyr 7B achieved a `mc2` score of 57.47% on the TruthfulQA benchmark, which is identical to the score on the Open LLM Leaderboard. \n", + "The evaluation on Truthfulqa took `999 seconds`. The ml.g5.4xlarge instance we used costs `$2.03 per hour` for on-demand usage. As a result, the total cost for evaluating Zephyr 7B on Truthfulqa was `$0.56`." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/assets/requirements.txt b/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/assets/requirements.txt new file mode 100644 index 0000000000..dd9f603154 --- /dev/null +++ b/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/assets/requirements.txt @@ -0,0 +1,5 @@ +transformers==4.41.2 +accelerate==0.31.0 +sentence-transformers==3.0.0 +datasets==2.18.0 +torch==2.1.2 diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/assets/run_mnr.py b/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/assets/run_mnr.py new file mode 100644 index 0000000000..8d531e6aa9 --- /dev/null +++ b/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/assets/run_mnr.py @@ -0,0 +1,185 @@ +from dataclasses import dataclass, field +import os +from sentence_transformers import ( + SentenceTransformerModelCardData, + SentenceTransformer, + SentenceTransformerTrainer, + SentenceTransformerTrainingArguments, +) +from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss +from sentence_transformers.training_args import BatchSamplers +from transformers import set_seed, HfArgumentParser + + +from sentence_transformers.evaluation import ( + InformationRetrievalEvaluator, + SequentialEvaluator, +) +from sentence_transformers.util import cos_sim +from datasets import load_dataset, concatenate_datasets + + +@dataclass +class ScriptArguments: + train_dataset_path: str = field( + default="/opt/ml/input/data/train/", + metadata={"help": "Path to the dataset, e.g. /opt/ml/input/data/train/"}, + ) + test_dataset_path: str = field( + default="/opt/ml/input/data/test/", + metadata={"help": "Path to the dataset, e.g. 
/opt/ml/input/data/test/"}, + ) + model_id: str = field( + default=None, metadata={"help": "Model ID to use for Embedding training"} + ) + num_train_epochs: int = field( + default=1, metadata={"help": "Number of training epochs"} + ) + per_device_train_batch_size: int = field( + default=32, metadata={"help": "Training batch size"} + ) + per_device_eval_batch_size: int = field( + default=16, metadata={"help": "Evaluation batch size"} + ) + gradient_accumulation_steps: int = field( + default=16, metadata={"help": "Gradient accumulation steps"} + ) + learning_rate: float = field( + default=2e-5, metadata={"help": "Learning rate for the optimizer"} + ) + + +def create_evaluator( + train_dataset, test_dataset, matryoshka_dimensions=[768, 512, 256, 128, 64] +): + corpus_dataset = concatenate_datasets([train_dataset, test_dataset]) + + # Convert the datasets to dictionaries + corpus = dict( + zip(corpus_dataset["id"], corpus_dataset["positive"]) + ) # Our corpus (cid => document) + queries = dict( + zip(test_dataset["id"], test_dataset["anchor"]) + ) # Our queries (qid => question) + + # Create a mapping of relevant document (1 in our case) for each query + relevant_docs = {} # Query ID to relevant documents (qid => set([relevant_cids]) + for q_id in queries: + relevant_docs[q_id] = [q_id] + + matryoshka_evaluators = [] + # Iterate over the different dimensions + for dim in matryoshka_dimensions: + ir_evaluator = InformationRetrievalEvaluator( + queries=queries, + corpus=corpus, + relevant_docs=relevant_docs, + name=f"dim_{dim}", + truncate_dim=dim, # Truncate the embeddings to a certain dimension + score_functions={"cosine": cos_sim}, + ) + matryoshka_evaluators.append(ir_evaluator) + + # Create a sequential evaluator + return SequentialEvaluator(matryoshka_evaluators) + + +def training_function(script_args): + ################ + # Dataset + ################ + + train_dataset = load_dataset( + "json", + data_files=os.path.join(script_args.train_dataset_path, "dataset.json"), + split="train", + ) + test_dataset = load_dataset( + "json", + data_files=os.path.join(script_args.test_dataset_path, "dataset.json"), + split="train", + ) + + ################### + # Model & Evaluator + ################### + + matryoshka_dimensions = [768, 512, 256, 128, 64] # Important: large to small + + model = SentenceTransformer( + script_args.model_id, + device="cuda", + model_kwargs={"attn_implementation": "sdpa"}, # needs Ampere GPU or newer + model_card_data=SentenceTransformerModelCardData( + language="en", + license="apache-2.0", + model_name="BGE base Financial Matryoshka", + ), + ) + evaluator = create_evaluator( + train_dataset, test_dataset, matryoshka_dimensions=matryoshka_dimensions + ) + + ################### + # Loss Function + ################### + + # create Matryoshka loss function with MultipleNegativesRankingLoss + inner_train_loss = MultipleNegativesRankingLoss(model) + train_loss = MatryoshkaLoss( + model, inner_train_loss, matryoshka_dims=matryoshka_dimensions + ) + + ################ + # Training + ################ + training_args = SentenceTransformerTrainingArguments( + output_dir="/opt/ml/model", # output directory for sagemaker to upload to s3 + num_train_epochs=script_args.num_train_epochs, # number of epochs + per_device_train_batch_size=script_args.per_device_train_batch_size, # training batch size + per_device_eval_batch_size=script_args.per_device_eval_batch_size, # evaluation batch size + gradient_accumulation_steps=script_args.gradient_accumulation_steps, # gradient accumulation steps 
+ warmup_ratio=0.1, # warmup ratio + learning_rate=script_args.learning_rate, # learning rate + lr_scheduler_type="cosine", # use constant learning rate scheduler + optim="adamw_torch_fused", # use fused adamw optimizer + tf32=True, # use tf32 precision + bf16=True, # use bf16 precision + batch_sampler=BatchSamplers.NO_DUPLICATES, # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch + eval_strategy="epoch", # evaluate after each epoch + save_strategy="epoch", # save after each epoch + logging_steps=10, # log every 10 steps + save_total_limit=3, # save only the last 3 models + load_best_model_at_end=True, # load the best model when training ends + metric_for_best_model="eval_dim_128_cosine_ndcg@10", # Optimizing for the best ndcg@10 score for the 128 dimension + ) + + trainer = SentenceTransformerTrainer( + model=model, # bg-base-en-v1 + args=training_args, # training arguments + train_dataset=train_dataset.select_columns( + ["positive", "anchor"] + ), # training dataset + loss=train_loss, + evaluator=evaluator, + ) + + ########################## + # Train model + ########################## + # start training, the model will be automatically saved to the hub and the output directory + trainer.train() + + # save the best model + trainer.save_model() + + +if __name__ == "__main__": + parser = HfArgumentParser((ScriptArguments)) + script_args = parser.parse_args_into_dataclasses()[0] + + # set seed + set_seed(42) + + # launch training + training_function(script_args) diff --git a/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/sagemaker-notebook.ipynb b/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/sagemaker-notebook.ipynb new file mode 100644 index 0000000000..d913daa555 --- /dev/null +++ b/docs/sagemaker/notebooks/sagemaker-sdk/fine-tune-embedding-models/sagemaker-notebook.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-tune and deploy embedding models with Amazon SageMaker\n", + "\n", + "Embedding models are crucial for successful RAG applications, but they're often trained on general knowledge, which limits their effectiveness for company or domain specific adoption. Customizing embedding for your domain specific data can significantly boost the retrieval performance of your RAG Application. With the new release of [Sentence Transformers 3](https://huggingface.co/blog/train-sentence-transformers) and the [Hugging Face Embedding Container](https://huggingface.co/blog/sagemaker-huggingface-embedding), it's easier than ever to fine-tune and deploy embedding models.\n", + "\n", + "In this example, we'll show you how to fine-tune and deploy a custom embedding model on Amazon SageMaker using the new Hugging Face Embedding Container. We'll use the [Sentence Transformers 3](https://huggingface.co/blog/train-sentence-transformers) library to fine-tune a model on a custom dataset and deploy it on Amazon SageMaker for inference. We will fine-tune [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) for financial RAG applications using a synthetic dataset from the [2023_10 NVIDIA SEC Filing](https://stocklight.com/stocks/us/nasdaq-nvda/nvidia/annual-reports/nasdaq-nvda-2023-10K-23668751.pdf). \n", + "\n", + "1. [Setup development environment](#2-setup-development-environment)\n", + "2. [Create and prepare the dataset](#3-create-and-prepare-the-dataset)\n", + "3. 
[Fine-tune Embedding model on Amazon SageMaker](#3-fine-tune-embedding-model-on-amazon-sagemaker)\n", + "4. [Deploy & Test fine-tuned Embedding Model on Amazon SageMaker](#4-deploy--test-fine-tuned-embedding-model-on-amazon-sagemaker)\n", + "\n", + "**What is new with Sentence Transformers 3?**\n", + "\n", + "Sentence Transformers v3 introduces a new trainer that makes it easier to fine-tune and train embedding models. This update includes enhanced components like diverse datasets, updated loss functions, and a streamlined training process, improving the efficiency and flexibility of model development.\n", + "\n", + "\n", + "**What is the Hugging Face Embedding Container?**\n", + "\n", + "The Hugging Face Embedding Container is a new purpose-built Inference Container to easily deploy Embedding Models in a secure and managed environment. The DLC is powered by [Text Embedding Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) a blazing fast and memory efficient solution for deploying and serving Embedding Models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5. TEI implements many features such as:\n", + "\n", + "_Note: This blog was created and validated on `ml.g5.xlarge` for training and `ml.c6i.2xlarge` for inference instance._\n", + "\n", + "\n", + "## 1. Setup Development Environment\n", + "\n", + "Our first step is to install Hugging Face Libraries we need on the client to correctly prepare our dataset and start our training/evaluations jobs. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install transformers \"datasets[s3]==2.18.0\" \"sagemaker>=2.190.0\" \"huggingface_hub[cli]\" --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are going to use Sagemaker in a local environment. You need access to an IAM Role with the required permissions for Sagemaker. You can find [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) more about it.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import boto3\n", + "sess = sagemaker.Session()\n", + "# sagemaker session bucket -> used for uploading data, models and logs\n", + "# sagemaker will automatically create this bucket if it not exists\n", + "sagemaker_session_bucket=None\n", + "if sagemaker_session_bucket is None and sess is not None:\n", + " # set to default bucket if a bucket name is not given\n", + " sagemaker_session_bucket = sess.default_bucket()\n", + "\n", + "try:\n", + " role = sagemaker.get_execution_role()\n", + "except ValueError:\n", + " iam = boto3.client('iam')\n", + " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n", + "\n", + "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", + "\n", + "print(f\"sagemaker role arn: {role}\")\n", + "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", + "print(f\"sagemaker session region: {sess.boto_region_name}\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Create and prepare the dataset\n", + "\n", + "An embedding dataset typically consists of text pairs (question, answer/context) or triplets that represent relationships or similarities between sentences. The dataset format you choose or have available will also impact the loss function you can use. 
Common formats for embedding datasets:\n", + "\n", + "- **Positive Pair**: Text Pairs of related sentences (query, context | query, answer), suitable for tasks like similarity or semantic search, example datasets: `sentence-transformers/sentence-compression`, `sentence-transformers/natural-questions`.\n", + "- **Triplets**: Text triplets consisting of (anchor, positive, negative), example datasets `sentence-transformers/quora-duplicates`, `nirantk/triplets`.\n", + "- **Pair with Similarity Score**: Sentence pairs with a similarity score indicating how related they are, example datasets: `sentence-transformers/stsb`, `PhilipMay/stsb_multi_mt`\n", + "\n", + "Learn more at [Dataset Overview](https://sbert.net/docs/sentence_transformer/dataset_overview.html).\n", + "\n", + "We are going to use [philschmid/finanical-rag-embedding-dataset](https://huggingface.co/datasets/philschmid/finanical-rag-embedding-dataset), which includes 7,000 positive text pairs of questions and corresponding context from the [2023_10 NVIDIA SEC Filing](https://stocklight.com/stocks/us/nasdaq-nvda/nvidia/annual-reports/nasdaq-nvda-2023-10K-23668751.pdf).\n", + "\n", + "The dataset has the following format\n", + "```json\n", + "{\"question\": \"\", \"context\": \"\"}\n", + "{\"question\": \"\", \"context\": \"\"}\n", + "{\"question\": \"\", \"context\": \"\"}\n", + "```\n", + "\n", + "We are going to use the [FileSystem integration](https://huggingface.co/docs/datasets/filesystems) to upload our dataset to S3. We are using the `sess.default_bucket()`, adjust this if you want to store the dataset in a different S3 bucket. We will use the S3 path later in our training script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# Load dataset from the hub\n", + "dataset = load_dataset(\"philschmid/finanical-rag-embedding-dataset\", split=\"train\")\n", + "input_path = f's3://{sess.default_bucket()}/datasets/rag-embedding'\n", + "\n", + "# rename columns\n", + "dataset = dataset.rename_column(\"question\", \"anchor\")\n", + "dataset = dataset.rename_column(\"context\", \"positive\")\n", + "\n", + "# Add an id column to the dataset\n", + "dataset = dataset.add_column(\"id\", range(len(dataset)))\n", + "\n", + "# split dataset into a 10% test set\n", + "dataset = dataset.train_test_split(test_size=0.1)\n", + "\n", + "# save train_dataset to s3 using our SageMaker session\n", + "\n", + "# save datasets to s3\n", + "dataset[\"train\"].to_json(f\"{input_path}/train/dataset.json\", orient=\"records\")\n", + "train_dataset_s3_path = f\"{input_path}/train/dataset.json\"\n", + "dataset[\"test\"].to_json(f\"{input_path}/test/dataset.json\", orient=\"records\")\n", + "test_dataset_s3_path = f\"{input_path}/test/dataset.json\"\n", + "\n", + "print(f\"Training data uploaded to:\")\n", + "print(train_dataset_s3_path)\n", + "print(test_dataset_s3_path)\n", + "print(f\"https://s3.console.aws.amazon.com/s3/buckets/{sess.default_bucket()}/?region={sess.boto_region_name}&prefix={input_path.split('/', 3)[-1]}/\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Fine-tune Embedding model on Amazon SageMaker\n", + "\n", + "We are now ready to fine-tune our model. We will use the [SentenceTransformerTrainer](https://www.sbert.net/docs/package_reference/sentence_transformer/trainer.html) from `sentence-transformers` to fine-tune our model. 
The `SentenceTransformerTrainer` makes it straightforward to run supervised fine-tuning of open embedding models, as it is a subclass of the `Trainer` from `transformers`. We prepared a script [run_mnr.py](assets/run_mnr.py) which loads the dataset from disk, prepares the model and tokenizer, and starts the training.\n",
+    "The `SentenceTransformerTrainer` supports:\n",
+    "- **Integrated Components**: Combines datasets, loss functions, and evaluators into a unified training framework.\n",
+    "- **Flexible Data Handling**: Supports various data formats and easy integration with Hugging Face datasets.\n",
+    "- **Versatile Loss Functions**: Offers multiple loss functions for different training tasks.\n",
+    "- **Multi-Dataset Training**: Facilitates simultaneous training with multiple datasets and different loss functions.\n",
+    "- **Seamless Integration**: Easy saving, loading, and sharing of models within the Hugging Face ecosystem.\n",
+    "\n",
+    "In order to create a SageMaker training job we need a `HuggingFace` Estimator. The Estimator handles end-to-end Amazon SageMaker training and deployment tasks and manages the infrastructure for us: Amazon SageMaker takes care of starting and managing all the required EC2 instances, provides the correct Hugging Face container, uploads the provided scripts, downloads the data from our S3 bucket into the container at `/opt/ml/input/data`, and then starts the training job.\n",
+    "\n",
+    "> Note: Make sure that you include the `requirements.txt` in the `source_dir` if you are using a custom training script. We recommend just cloning the whole repository.\n",
+    "\n",
+    "Let's first define our training parameters. They are passed as CLI arguments to our training script. We are going to use the `BAAI/bge-base-en-v1.5` model, which is pre-trained on a large corpus of English text. We will use the `MultipleNegativesRankingLoss` in combination with the `MatryoshkaLoss`. This approach allows us to leverage the efficiency and flexibility of Matryoshka embeddings, enabling different embedding dimensions to be utilized without significant performance trade-offs. The `MultipleNegativesRankingLoss` is a great loss function if you only have positive pairs, as it adds in-batch negatives so that each sample has n-1 negative samples."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFace\n", + "\n", + "# define Training Job Name \n", + "job_name = f'bge-base-exp1'\n", + "\n", + "# define hyperparameters, which are passed into the training job\n", + "training_arguments = {\n", + " \"model_id\": \"BAAI/bge-base-en-v1.5\", # model id from the hub\n", + " \"train_dataset_path\": \"/opt/ml/input/data/train/\", # path inside the container where the training data is stored\n", + " \"test_dataset_path\": \"/opt/ml/input/data/test/\", # path inside the container where the test data is stored\n", + " \"num_train_epochs\": 3, # number of training epochs\n", + " \"learning_rate\": 2e-5, # learning rate\n", + "}\n", + "\n", + "# create the Estimator\n", + "huggingface_estimator = HuggingFace(\n", + " entry_point = 'run_mnr.py', # train script\n", + " source_dir = 'scripts', # directory which includes all the files needed for training\n", + " instance_type = 'ml.g5.xlarge', # instances type used for the training job\n", + " instance_count = 1, # the number of instances used for training\n", + " max_run = 2*24*60*60, # maximum runtime in seconds (days * hours * minutes * seconds)\n", + " base_job_name = job_name, # the name of the training job\n", + " role = role, # Iam role used in training job to access AWS ressources, e.g. S3\n", + " transformers_version = '4.36.0', # the transformers version used in the training job\n", + " pytorch_version = '2.1.0', # the pytorch_version version used in the training job\n", + " py_version = 'py310', # the python version used in the training job\n", + " hyperparameters = training_arguments,\n", + " disable_output_compression = True, # not compress output to save training time and cost\n", + " environment = {\n", + " \"HUGGINGFACE_HUB_CACHE\": \"/tmp/.cache\", # set env variable to cache models in /tmp\n", + " }, \n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now start our training job, with the `.fit()` method passing our S3 path to the training script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define a data input dictonary with our uploaded s3 uris\n", + "data = {\n", + " 'train': train_dataset_s3_path,\n", + " 'test': test_dataset_s3_path,\n", + " }\n", + "\n", + "# starting the train job with our uploaded datasets as input\n", + "huggingface_estimator.fit(data, wait=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our example the training BGE Base with Flash Attention 2 (SDPA) for 3 epochs with a dataset of 6,3k train samples and 700 eval samples took 645 seconds (~10minutes) on a `ml.g5.xlarge` (1.2575 $/h) or ~$5." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Deploy & Test fine-tuned Embedding Model on Amazon SageMaker\n", + "\n", + "We are going to use the [Hugging Face Embedding Container](https://huggingface.co/blog/sagemaker-huggingface-embedding#what-is-the-hugging-face-embedding-container) a purpose-built Inference Container to easily deploy Embedding Models in a secure and managed environment. 
The DLC is powered by Text Embedding Inference (TEI) a blazing fast and memory efficient solution for deploying and serving Embedding Models.\n", + "\n", + "To retrieve the new Hugging Face Embedding Container in Amazon SageMaker, we can use the `get_huggingface_llm_image_uri` method provided by the sagemaker SDK. This method allows us to retrieve the URI for the desired Hugging Face Embedding Container. Important to note is that TEI has 2 different versions for cpu and gpu, so we create a helper function to retrieve the correct image uri based on the instance type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.huggingface import get_huggingface_llm_image_uri\n", + "\n", + "# retrieve the image uri based on instance type\n", + "def get_image_uri(instance_type):\n", + " key = \"huggingface-tei\" if instance_type.startswith(\"ml.g\") or instance_type.startswith(\"ml.p\") else \"huggingface-tei-cpu\"\n", + " return get_huggingface_llm_image_uri(key, version=\"1.4.0\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now create a `HuggingFaceModel` using the container uri and the S3 path to our model. We also need to set our TEI configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# sagemaker config\n", + "instance_type = \"ml.c6i.2xlarge\"\n", + "\n", + "# create HuggingFaceModel with the image uri\n", + "emb_model = HuggingFaceModel(\n", + " role=role,\n", + " image_uri=get_image_uri(instance_type),\n", + " model_data=huggingface_estimator.model_data,\n", + " env={'HF_MODEL_ID': \"/opt/ml/model\"} # Path to the model in the container\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the deploy method. We will deploy the model with the `ml.c6i.2xlarge` instance type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Deploy model to an endpoint\n", + "emb = emb_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_type,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SageMaker will now create our endpoint and deploy the model to it. This can take ~5 minutes. After our endpoint is deployed we can run inference on it. We will use the `predict` method from the predictor to run inference on our endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " \"inputs\": \"the mesmerizing performances of the leads keep the film grounded and keep the audience riveted .\",\n", + "}\n", + " \n", + "res = emb.predict(data=data)\n", + " \n", + " \n", + "# print some results\n", + "print(f\"length of embeddings: {len(res[0])}\")\n", + "print(f\"first 10 elements of embeddings: {res[0][:10]}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We trained our model with the Matryoshka Loss means that the semantic meaning is frontloaded. To use the different mathryshoka dimension we need to manually truncate our embeddings manually. Below is an example on how you would truncate the embeddings to 256 dimension, which is 1/3 of the original size. 
If we check our training logs we can see that the NDCG metric for 768 is `0.823` and for 256 `0.818` meaning we preserve > 99% accuracy. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " \"inputs\": \"the mesmerizing performances of the leads keep the film grounded and keep the audience riveted .\",\n", + "}\n", + " \n", + "res = emb.predict(data=data)\n", + " \n", + "# truncate embeddings to matryoshka dimensions\n", + "dim = 256\n", + "res = res[0][0:dim]\n", + " \n", + "# print some results\n", + "print(f\"length of embeddings: {len(res)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome! 🚀 Now that we can generate embeddings and integrate your endpoint into your RAG application. \n", + "\n", + "To clean up, we can delete the model and endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "emb.delete_model()\n", + "emb.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "2d58e898dde0263bc564c6968b04150abacfd33eed9b19aaa8e45c040360e146" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/sagemaker/reference.md b/docs/sagemaker/reference.md deleted file mode 100644 index 42c5e5146f..0000000000 --- a/docs/sagemaker/reference.md +++ /dev/null @@ -1,254 +0,0 @@ -# Reference - -## Deep Learning Container - -Below you can find a version table of currently available Hugging Face DLCs. The table doesn't include the full `image_uri` here are two examples on how to construct those if needed. - -**Manually construction the `image_uri`** - -`{dlc-aws-account-id}.dkr.ecr.{region}.amazonaws.com/huggingface-{framework}-{(training | inference)}:{framework-version}-transformers{transformers-version}-{device}-{python-version}-{device-tag}` - -- `dlc-aws-account-id`: The AWS account ID of the account that owns the ECR repository. You can find them in the [here](https://github.com/aws/sagemaker-python-sdk/blob/e0b9d38e1e3b48647a02af23c4be54980e53dc61/src/sagemaker/image_uri_config/huggingface.json#L21) -- `region`: The AWS region where you want to use it. -- `framework`: The framework you want to use, either `pytorch` or `tensorflow`. -- `(training | inference)`: The training or inference mode. -- `framework-version`: The version of the framework you want to use. -- `transformers-version`: The version of the transformers library you want to use. -- `device`: The device you want to use, either `cpu` or `gpu`. -- `python-version`: The version of the python of the DLC. -- `device-tag`: The device tag you want to use. The device tag can include os version and cuda version - -**Example 1: PyTorch Training:** -`763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04` -**Example 2: Tensorflow Inference:** -`763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-inference:2.4.1-transformers4.6.1-cpu-py37-ubuntu18.04` - -## Training DLC Overview - -The Training DLC overview includes all released and available Hugging Face Training DLCs. 
It includes PyTorch and TensorFlow flavored -versions for GPU. - -| 🤗 Transformers version | 🤗 Datasets version | PyTorch/TensorFlow version | type | device | Python Version | -| ----------------------- | ------------------- | -------------------------- | -------- | ------ | -------------- | -| 4.4.2 | 1.5.0 | PyTorch 1.6.0 | training | GPU | 3.6 | -| 4.4.2 | 1.5.0 | TensorFlow 2.4.1 | training | GPU | 3.7 | -| 4.5.0 | 1.5.0 | PyTorch 1.6.0 | training | GPU | 3.6 | -| 4.5.0 | 1.5.0 | TensorFlow 2.4.1 | training | GPU | 3.7 | -| 4.6.1 | 1.6.2 | PyTorch 1.6.0 | training | GPU | 3.6 | -| 4.6.1 | 1.6.2 | PyTorch 1.7.1 | training | GPU | 3.6 | -| 4.6.1 | 1.6.2 | TensorFlow 2.4.1 | training | GPU | 3.7 | -| 4.10.2 | 1.11.0 | PyTorch 1.8.1 | training | GPU | 3.6 | -| 4.10.2 | 1.11.0 | PyTorch 1.9.0 | training | GPU | 3.8 | -| 4.10.2 | 1.11.0 | TensorFlow 2.4.1 | training | GPU | 3.7 | -| 4.10.2 | 1.11.0 | TensorFlow 2.5.1 | training | GPU | 3.7 | -| 4.11.0 | 1.12.1 | PyTorch 1.9.0 | training | GPU | 3.8 | -| 4.11.0 | 1.12.1 | TensorFlow 2.5.1 | training | GPU | 3.7 | -| 4.12.3 | 1.15.1 | PyTorch 1.9.1 | training | GPU | 3.8 | -| 4.12.3 | 1.15.1 | TensorFlow 2.5.1 | training | GPU | 3.7 | -| 4.17.0 | 1.18.4 | PyTorch 1.10.2 | training | GPU | 3.8 | -| 4.17.0 | 1.18.4 | TensorFlow 2.6.3 | training | GPU | 3.8 | -| 4.26.0 | 2.9.0 | PyTorch 1.13.1 | training | GPU | 3.9 | - -## Inference DLC Overview - -The Inference DLC overview includes all released and available Hugging Face Inference DLCs. It includes PyTorch and TensorFlow flavored -versions for CPU, GPU & AWS Inferentia. - - -| 🤗 Transformers version | PyTorch/TensorFlow version | type | device | Python Version | -| ----------------------- | -------------------------- | --------- | ------ | -------------- | -| 4.6.1 | PyTorch 1.7.1 | inference | CPU | 3.6 | -| 4.6.1 | PyTorch 1.7.1 | inference | GPU | 3.6 | -| 4.6.1 | TensorFlow 2.4.1 | inference | CPU | 3.7 | -| 4.6.1 | TensorFlow 2.4.1 | inference | GPU | 3.7 | -| 4.10.2 | PyTorch 1.8.1 | inference | GPU | 3.6 | -| 4.10.2 | PyTorch 1.9.0 | inference | GPU | 3.8 | -| 4.10.2 | TensorFlow 2.4.1 | inference | GPU | 3.7 | -| 4.10.2 | TensorFlow 2.5.1 | inference | GPU | 3.7 | -| 4.10.2 | PyTorch 1.8.1 | inference | CPU | 3.6 | -| 4.10.2 | PyTorch 1.9.0 | inference | CPU | 3.8 | -| 4.10.2 | TensorFlow 2.4.1 | inference | CPU | 3.7 | -| 4.10.2 | TensorFlow 2.5.1 | inference | CPU | 3.7 | -| 4.11.0 | PyTorch 1.9.0 | inference | GPU | 3.8 | -| 4.11.0 | TensorFlow 2.5.1 | inference | GPU | 3.7 | -| 4.11.0 | PyTorch 1.9.0 | inference | CPU | 3.8 | -| 4.11.0 | TensorFlow 2.5.1 | inference | CPU | 3.7 | -| 4.12.3 | PyTorch 1.9.1 | inference | GPU | 3.8 | -| 4.12.3 | TensorFlow 2.5.1 | inference | GPU | 3.7 | -| 4.12.3 | PyTorch 1.9.1 | inference | CPU | 3.8 | -| 4.12.3 | TensorFlow 2.5.1 | inference | CPU | 3.7 | -| 4.12.3 | PyTorch 1.9.1 | inference | Inferentia | 3.7 | -| 4.17.0 | PyTorch 1.10.2 | inference | GPU | 3.8 | -| 4.17.0 | TensorFlow 2.6.3 | inference | GPU | 3.8 | -| 4.17.0 | PyTorch 1.10.2 | inference | CPU | 3.8 | -| 4.17.0 | TensorFlow 2.6.3 | inference | CPU | 3.8 | -| 4.26.0 | PyTorch 1.13.1 | inference | CPU | 3.9 | -| 4.26.0 | PyTorch 1.13.1 | inference | GPU | 3.9 | - - - -## Hugging Face Transformers Amazon SageMaker Examples - -Example Jupyter notebooks that demonstrate how to build, train, and deploy [Hugging Face Transformers](https://github.com/huggingface/transformers) using [Amazon SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/whatis.html) and the [Amazon SageMaker 
Python SDK](https://sagemaker.readthedocs.io/en/stable/). - - -| Notebook | Type | Description | -|-------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|----------------------------------------------------------------------------------------------------------------------------------------| -| [01 Getting started with PyTorch](https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb) | Training | Getting started end-to-end example on how to fine-tune a pre-trained Hugging Face Transformer for Text-Classification using PyTorch | -| [02 getting started with TensorFlow](https://github.com/huggingface/notebooks/blob/main/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb) | Training | Getting started end-to-end example on how to fine-tune a pre-trained Hugging Face Transformer for Text-Classification using TensorFlow | -| [03 Distributed Training: Data Parallelism](https://github.com/huggingface/notebooks/blob/main/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | Training | End-to-end example on how to use distributed training with data-parallelism strategy for fine-tuning a pre-trained Hugging Face Transformer for Question-Answering using Amazon SageMaker Data Parallelism | -| [04 Distributed Training: Model Parallelism](https://github.com/huggingface/notebooks/blob/main/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb) | Training | End-to-end example on how to use distributed training with model-parallelism strategy to pre-trained Hugging Face Transformer using Amazon SageMaker Model Parallelism | -| [05 How to use Spot Instances & Checkpointing](https://github.com/huggingface/notebooks/blob/main/sagemaker/05_spot_instances/sagemaker-notebook.ipynb) | Training | End-to-end example on how to use Spot Instances and Checkpointing to reduce training cost | -| [06 Experiment Tracking with SageMaker Metrics](https://github.com/huggingface/notebooks/blob/main/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb) | Training | End-to-end example on how to use SageMaker metrics to track your experiments and training jobs | -| [07 Distributed Training: Data Parallelism](https://github.com/huggingface/notebooks/blob/main/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | Training | End-to-end example on how to use Amazon SageMaker Data Parallelism with TensorFlow | -| [08 Distributed Training: Summarization with T5/BART](https://github.com/huggingface/notebooks/blob/main/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) | Training | End-to-end example on how to fine-tune BART/T5 for Summarization using Amazon SageMaker Data Parallelism | -| [09 Vision: Fine-tune ViT](https://github.com/huggingface/notebooks/blob/main/sagemaker/09_image_classification_vision_transformer/sagemaker-notebook.ipynb) | Training | End-to-end example on how to fine-tune Vision Transformer for Image-Classification | -| [10 Deploy HF Transformer from Amazon S3](https://github.com/huggingface/notebooks/blob/main/sagemaker/10_deploy_model_from_s3/deploy_transformer_model_from_s3.ipynb) | Inference | End-to-end example on how to deploy a model from Amazon S3 | -| [11 Deploy HF Transformer from Hugging Face 
Hub](https://github.com/huggingface/notebooks/blob/main/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb) | Inference | End-to-end example on how to deploy a model from the Hugging Face Hub | -| [12 Batch Processing with Amazon SageMaker Batch Transform](https://github.com/huggingface/notebooks/blob/main/sagemaker/12_batch_transform_inference/sagemaker-notebook.ipynb) | Inference | End-to-end example on how to do batch processing with Amazon SageMaker Batch Transform | -| [13 Autoscaling SageMaker Endpoints](https://github.com/huggingface/notebooks/blob/main/sagemaker/13_deploy_and_autoscaling_transformers/sagemaker-notebook.ipynb) | Inference | End-to-end example on how to do use autoscaling for a HF Endpoint | -| [14 Fine-tune and push to Hub](https://github.com/huggingface/notebooks/blob/main/sagemaker/14_train_and_push_to_hub/sagemaker-notebook.ipynb) | Training | End-to-end example on how to do use the Hugging Face Hub as MLOps backend for saving checkpoints during training | -| [15 Training Compiler](https://github.com/huggingface/notebooks/blob/main/sagemaker/15_training_compiler/sagemaker-notebook.ipynb) | Training | End-to-end example on how to do use Amazon SageMaker Training Compiler to speed up training time | -| [16 Asynchronous Inference](https://github.com/huggingface/notebooks/blob/main/sagemaker/16_async_inference_hf_hub/sagemaker-notebook.ipynb) | Inference | End-to-end example on how to do use Amazon SageMaker Asynchronous Inference endpoints with Hugging Face Transformers | -| [17 Custom inference.py script](https://github.com/huggingface/notebooks/blob/main/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb) | Inference | End-to-end example on how to create a custom inference.py for Sentence Transformers and sentence embeddings | -| [18 AWS Inferentia](https://github.com/huggingface/notebooks/blob/main/sagemaker/18_inferentia_inference/sagemaker-notebook.ipynb) | Inference | End-to-end example on how to AWS Inferentia to speed up inference time | - -## Inference Toolkit API - -The Inference Toolkit accepts inputs in the `inputs` key, and supports additional [`pipelines`](https://huggingface.co/docs/transformers/main_classes/pipelines) parameters in the `parameters` key. You can provide any of the supported `kwargs` from `pipelines` as `parameters`. - -Tasks supported by the Inference Toolkit API include: - -- **`text-classification`** -- **`sentiment-analysis`** -- **`token-classification`** -- **`feature-extraction`** -- **`fill-mask`** -- **`summarization`** -- **`translation_xx_to_yy`** -- **`text2text-generation`** -- **`text-generation`** -- **`audio-classificatin`** -- **`automatic-speech-recognition`** -- **`conversational`** -- **`image-classification`** -- **`image-segmentation`** -- **`object-detection`** -- **`table-question-answering`** -- **`zero-shot-classification`** -- **`zero-shot-image-classification`** - - -See the following request examples for some of the tasks: - -**`text-classification`** - -```json -{ - "inputs": "This sound track was beautiful! It paints the senery in your mind so well I would recomend it - even to people who hate vid. game music!" -} -``` - -**`sentiment-analysis`** - -```json -{ - "inputs": "Don't waste your time. We had two different people come to our house to give us estimates for -a deck (one of them the OWNER). Both times, we never heard from them. Not a call, not the estimate, nothing." 
-} -``` - -**`token-classification`** - -```json -{ - "inputs": "My name is Sylvain and I work at Hugging Face in Brooklyn." -} -``` - -**`question-answering`** - -```json -{ - "inputs": { - "question": "What is used for inference?", - "context": "My Name is Philipp and I live in Nuremberg. This model is used with sagemaker for inference." - } -} -``` - -**`zero-shot-classification`** - -```json -{ - "inputs": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!", - "parameters": { - "candidate_labels": ["refund", "legal", "faq"] - } -} -``` - -**`table-question-answering`** - -```json -{ - "inputs": { - "query": "How many stars does the transformers repository have?", - "table": { - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"] - } - } -} -``` - -**`parameterized-request`** - -```json -{ - "inputs": "Hugging Face, the winner of VentureBeat’s Innovation in Natural Language Process/Understanding Award for 2021, is looking to level the playing field. The team, launched by Clément Delangue and Julien Chaumond in 2016, was recognized for its work in democratizing NLP, the global market value for which is expected to hit $35.1 billion by 2026. This week, Google’s former head of Ethical AI Margaret Mitchell joined the team.", - "parameters": { - "repetition_penalty": 4.0, - "length_penalty": 1.5 - } -} -``` - -## Inference Toolkit environment variables - -The Inference Toolkit implements various additional environment variables to simplify deployment. A complete list of Hugging Face specific environment variables is shown below: - -**`HF_TASK`** - -`HF_TASK` defines the task for the 🤗 Transformers pipeline used . See [here](https://huggingface.co/docs/transformers/main_classes/pipelines) for a complete list of tasks. - -```bash -HF_TASK="question-answering" -``` - -**`HF_MODEL_ID`** - -`HF_MODEL_ID` defines the model ID which is automatically loaded from [hf.co/models](https://huggingface.co/models) when creating a SageMaker endpoint. All of the 🤗 Hub's 10,000+ models are available through this environment variable. - -```bash -HF_MODEL_ID="distilbert-base-uncased-finetuned-sst-2-english" -``` - -**`HF_MODEL_REVISION`** - -`HF_MODEL_REVISION` is an extension to `HF_MODEL_ID` and allows you to define or pin a model revision to make sure you always load the same model on your SageMaker endpoint. - -```bash -HF_MODEL_REVISION="03b4d196c19d0a73c7e0322684e97db1ec397613" -``` - -**`HF_API_TOKEN`** - -`HF_API_TOKEN` defines your Hugging Face authorization token. The `HF_API_TOKEN` is used as a HTTP bearer authorization for remote files like private models. You can find your token under [Settings](https://huggingface.co/settings/tokens) of your Hugging Face account. - -```bash -HF_API_TOKEN="api_XXXXXXXXXXXXXXXXXXXXXXXXXXXXX" -``` diff --git a/docs/sagemaker/scripts/auto-generate-examples.py b/docs/sagemaker/scripts/auto-generate-examples.py new file mode 100644 index 0000000000..13a939b954 --- /dev/null +++ b/docs/sagemaker/scripts/auto-generate-examples.py @@ -0,0 +1,117 @@ +import os +import re + +BRANCH_NAME = os.getenv("SAGEMAKER_BRANCH", "main") + + +def process_readme_files(): + print( + "Processing README.md files generated from the Jupyter Notebooks in docs/sagemaker/notebooks/sagemaker-sdk..." 
+    )
+    os.makedirs("source/examples", exist_ok=True)
+
+    # NOTE: at the moment only `sagemaker-sdk` but left here to easily include new files from other sources
+    for dirname in {"sagemaker-sdk"}:
+        for root, _, files in os.walk(f"notebooks/{dirname}"):
+            for file in files:
+                if file == "sagemaker-notebook.md":
+                    process_file(root, file, dirname)
+
+
+def process_file(root, file, dirname):
+    parsed_dirname = (
+        dirname if not dirname.__contains__("/") else dirname.replace("/", "-")
+    )
+
+    file_path = os.path.join(root, file)
+    subdir = root.replace(f"notebooks/{dirname}/", "")
+    base = os.path.basename(subdir)
+
+    # NOTE: temporarily disabled
+    if file_path == f"examples/{dirname}/README.md":
+        target = f"source/examples/{parsed_dirname}-index.mdx"
+    else:
+        target = f"source/examples/{parsed_dirname}-{base}.mdx"
+
+    print(f"Processing {file_path} to {target}")
+    with open(file_path, "r") as f:
+        content = f.read()
+
+    # For Jupyter Notebooks, remove the HTML comment markers i.e. `<!-- ... -->` but keep the metadata inside them
+    content = re.sub(r"<!--(.*?)-->", r"\1", content, flags=re.DOTALL)
+
+    # Replace image and link paths
+    content = re.sub(
+        r"\(\./(imgs|assets)/([^)]*\.png)\)",
+        rf"(https://raw.githubusercontent.com/huggingface/hub-docs/refs/heads/{BRANCH_NAME}/docs/sagemaker/"
+        + root
+        + r"/\1/\2)",
+        content,
+    )
+    content = re.sub(
+        r"\(\.\./([^)]+)\)",
+        rf"(https://github.com/huggingface/hub-docs/tree/{BRANCH_NAME}/docs/sagemaker/notebooks/"
+        + dirname
+        + r"/\1)",
+        content,
+    )
+    content = re.sub(
+        r"\(\.\/([^)]+)\)",
+        rf"(https://github.com/huggingface/hub-docs/tree/{BRANCH_NAME}/docs/sagemaker/"
+        + root
+        + r"/\1)",
+        content,
+    )
+
+    def replacement(match):
+        block_type = match.group(1)
+        content = match.group(2)
+
+        # Remove '> ' from the beginning of each line
+        lines = [line[2:] for line in content.split("\n") if line.strip()]
+
+        # Determine the Tip type
+        tip_type = " warning" if block_type == "WARNING" else ""
+
+        # Construct the new block as a doc-builder Tip component
+        new_block = f"<Tip{tip_type}>\n\n"
+        new_block += "\n".join(lines)
+        new_block += "\n\n</Tip>\n"
+
+        return new_block
+
+    # Regular expression to match the specified blocks
+    pattern = r"> \[!(NOTE|WARNING)\]\n((?:>.*(?:\n|$))+)"
+
+    # Perform the transformation
+    content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
+
+    # Remove any remaining '>' or '> ' at the beginning of lines
+    content = re.sub(r"^>[ ]?", "", content, flags=re.MULTILINE)
+
+    # Check for remaining relative paths
+    if re.search(r"\(\.\./|\(\./", content):
+        print("WARNING: Relative paths still exist in the processed file.")
+        print(
+            "The following lines contain relative paths, consider replacing those with GitHub URLs instead:"
+        )
+        for i, line in enumerate(content.split("\n"), 1):
+            if re.search(r"\(\.\./|\(\./", line):
+                print(f"{i}: {line}")
+    else:
+        print("No relative paths found in the processed file.")
+
+    # Calculate the example URL
+    example_url = f"https://github.com/huggingface/hub-docs/tree/{BRANCH_NAME}/{root}"
+    if file.__contains__("sagemaker-notebook"):
+        example_url += "/sagemaker-notebook.ipynb"
+
+    # Add the final note
+    content += f"\n\n---\n\n<Tip>\n\n📍 Find the complete example on GitHub [here]({example_url})!\n\n</Tip>\n"
+
+    with open(target, "w") as f:
+        f.write(content)
+
+
+if __name__ == "__main__":
+    process_readme_files()
diff --git a/docs/sagemaker/scripts/auto-update-toctree.py b/docs/sagemaker/scripts/auto-update-toctree.py
new file mode 100644
index 0000000000..f6197ea4cd
--- /dev/null
+++ b/docs/sagemaker/scripts/auto-update-toctree.py
@@ -0,0 +1,66 @@
+import glob
+import os
+import re +from pathlib import Path + + +def update_toctree_yaml(): + input_file = "source/_toctree.yml" + output_file = "source/_toctree.yml" + dirnames = ["sagemaker-sdk"] + + # Read the existing content + with open(input_file, "r") as f: + content = f.read() + + # Find the position between tutorials and reference sections + tutorials_end = content.find("- sections:\n - local: reference/inference-toolkit") + if tutorials_end == -1: + print("Error: Could not find the reference section in the file") + return + + # Generate the new content + new_content = [] + new_content.append("# GENERATED CONTENT DO NOT EDIT!") + new_content.append("- title: Examples") + new_content.append(" sections:") + + for dirname in dirnames: + # Get sorted files excluding index + files = sorted(glob.glob(f"source/examples/{dirname}-*.mdx")) + files = [f for f in files if not f.endswith(f"{dirname}-index.mdx")] + + file_entries = [] + for file_path in files: + with open(file_path, "r") as mdx_file: + first_line = mdx_file.readline().strip() + if first_line.startswith("# "): + title = first_line[2:].strip() + base_name = Path(file_path).stem + file_entries.append((base_name, title)) + else: + print(f"⚠️ Skipping {Path(file_path).name} - missing H1 title") + continue + + # Write directory section + new_content.append(" - title: SageMaker SDK") + new_content.append(" isExpanded: false") + + for idx, (base, title) in enumerate(file_entries): + if idx == 0: + new_content.append(" sections:") + new_content.append(f" - local: examples/{base}") + new_content.append(f' title: "{title}"') + + new_content.append("# END GENERATED CONTENT") + + # Insert the new content + updated_content = content[:tutorials_end] + "\n" + "\n".join(new_content) + "\n" + content[tutorials_end:] + + # Write the updated content back to the file + with open(output_file, "w") as f: + f.write(updated_content) + + +if __name__ == "__main__": + update_toctree_yaml() diff --git a/docs/sagemaker/source/.gitignore b/docs/sagemaker/source/.gitignore new file mode 100644 index 0000000000..d838da9865 --- /dev/null +++ b/docs/sagemaker/source/.gitignore @@ -0,0 +1 @@ +examples/ diff --git a/docs/sagemaker/source/_toctree.yml b/docs/sagemaker/source/_toctree.yml new file mode 100644 index 0000000000..3bfd8282e1 --- /dev/null +++ b/docs/sagemaker/source/_toctree.yml @@ -0,0 +1,66 @@ +- sections: + - local: index + title: Hugging Face on AWS + - local: resources + title: Other Resources + title: Get Started + isExpanded: true +- sections: + - local: dlcs/introduction + title: Introduction + - local: dlcs/features + title: Features & benefits + - local: dlcs/available + title: Available DLCs on AWS + title: Deep Learning Containers (DLCs) + isExpanded: true +- sections: + - local: tutorials/index + title: Introduction + - sections: + - local: tutorials/sagemaker-sdk/sagemaker-sdk-quickstart + title: SageMaker SDK Quickstart + - local: tutorials/sagemaker-sdk/training-sagemaker-sdk + title: Train models with SageMaker SDK + - local: tutorials/sagemaker-sdk/deploy-sagemaker-sdk + title: Deploy models with SageMaker SDK + title: SageMaker SDK + isExpanded: false + - sections: + - local: tutorials/jumpstart/jumpstart-quickstart + title: Jumpstart Quickstart + title: Jumpstart + isExpanded: false + - sections: + - local: tutorials/bedrock/bedrock-quickstart + title: Bedrock Quickstart + title: Bedrock + isExpanded: false + - sections: + - local: tutorials/compute-services/compute-services-quickstart + title: EC2, ECS and EKS Quickstart + title: EC2, ECS and EKS + 
isExpanded: false + title: Tutorials + isExpanded: false + +# GENERATED CONTENT DO NOT EDIT! +- title: Examples + sections: + - title: SageMaker SDK + isExpanded: false + sections: + - local: examples/sagemaker-sdk-deploy-embedding-models + title: "How to deploy Embedding Models to Amazon SageMaker using new Hugging Face Embedding DLC" + - local: examples/sagemaker-sdk-deploy-llama-3-3-70b-inferentia2 + title: "Deploy Llama 3.3 70B on AWS Inferentia2" + - local: examples/sagemaker-sdk-evaluate-llm-lighteval + title: "Evaluate LLMs with Hugging Face Lighteval on Amazon SageMaker" + - local: examples/sagemaker-sdk-fine-tune-embedding-models + title: "Fine-tune and deploy embedding models with Amazon SageMaker" +# END GENERATED CONTENT +- sections: + - local: reference/inference-toolkit + title: Inference Toolkit API + title: Reference + isExpanded: false diff --git a/docs/sagemaker/source/dlcs/available.md b/docs/sagemaker/source/dlcs/available.md new file mode 100644 index 0000000000..c4691e5bd7 --- /dev/null +++ b/docs/sagemaker/source/dlcs/available.md @@ -0,0 +1,61 @@ +# Available DLCs on AWS + +Below you can find a listing of all the Deep Learning Containers (DLCs) available on AWS. + +For each supported combination of use-case (training, inference), accelerator type (CPU, GPU, Neuron), and framework (PyTorch, TGI, TEI) containers are created. + +## FAQ + +**How to choose the right container for my use case?** + +![dlc-decision-tree](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sagemaker/dlc-decision-tree.png) + +*Note:* See [here]((https://huggingface.co/docs/sagemaker/main/en/reference/inference-toolkit)) for the list of supported task in the inference toolkit. + +*Note:* Browse through the Hub to see if you model is tagged ["text-generation-inference"](https://huggingface.co/models?other=text-generation-inference) or ["text-embeddings-inference"](https://huggingface.co/models?other=text-embeddings-inference) + +**How to find the URI of my container?** +The URI is built with an AWS account ID and an AWS region. Those two values need to be replaced depending on your use case. +Let's say you want to use the training DLC for GPUs in +- `dlc-aws-account-id`: The AWS account ID of the account that owns the ECR repository. You can find them in the [here](https://github.com/aws/sagemaker-python-sdk/blob/e0b9d38e1e3b48647a02af23c4be54980e53dc61/src/sagemaker/image_uri_config/huggingface.json#L21) +- `region`: The AWS region where you want to use it. + +## Training + +Pytorch Training DLC: For training, our DLCs are available for PyTorch via :hugging_face: Transformers. They include support for training on GPUs and AWS AI chips with libraries such as :hugging_face: TRL, Sentence Transformers, or :firecracker: Diffusers. + +| Container URI | Accelerator | +| -------------------------------------------------------------------------------------------------------------------------------- | ----------- | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.5.1-transformers4.49.0-gpu-py311-cu124-ubuntu22.04 | GPU | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training-neuronx:2.1.2-transformers4.48.1-neuronx-py310-sdk2.20.0-ubuntu20.04 | Neuron | + + +## Inference + +### Pytorch Inference DLC + +For inference, we have a general-purpose PyTorch inference DLC, for serving models trained with any of those frameworks mentioned before on CPU, GPU, and AWS AI chips. 
+ +| Container URI | Accelerator | +| -------------------------------------------------------------------------------------------------------------------------------- | ----------- | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:2.6.0-transformers4.49.0-cpu-py312-ubuntu22.04- | CPU | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:2.6.0-transformers4.49.0-gpu-py312-cu124-ubuntu22.04 | GPU | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference-neuronx:2.1.2-transformers4.43.2-neuronx-py310-sdk2.20.0-ubuntu20.04 | Neuron | + +### Text Generation Inference + +There is also the Text Generation Inference (TGI) DLC for high-performance text generation of LLMs on GPU and AWS AI chips. + +| Container URI | Accelerator | +| -------------------------------------------------------------------------------------------------------------------------------- | ----------- | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.6.0-tgi3.2.3-gpu-py311-cu124-ubuntu22.04 | GPU | +| 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.2-optimum0.0.28-neuronx-py310-ubuntu22.04 | Neuron | + +### Text Embedding Inference + +Finally, there is a Text Embeddings Inference (TEI) DLC for high-performance serving of embedding models on CPU and GPU. + +| Container URI | Accelerator | +| -------------------------------------------------------------------------------------------------------------------------------- | ----------- | +| 683313688378.dkr.ecr.us-east-1.amazonaws.com/tei-cpu:2.0.1-tei1.2.3-cpu-py310-ubuntu22.04 | CPU | +| 683313688378.dkr.ecr.us-east-1.amazonaws.com/tei:2.0.1-tei1.4.0-gpu-py310-cu122-ubuntu22.04 | GPU | \ No newline at end of file diff --git a/docs/sagemaker/source/dlcs/features.md b/docs/sagemaker/source/dlcs/features.md new file mode 100644 index 0000000000..27ff90716a --- /dev/null +++ b/docs/sagemaker/source/dlcs/features.md @@ -0,0 +1,29 @@ +# Features & benefits + +The Hugging Face DLCs provide ready-to-use, tested environments to train and deploy Hugging Face models. + +## One command is all you need + +With the new Hugging Face DLCs, train and deploy cutting-edge Transformers-based NLP models in a single line of code. The Hugging Face PyTorch DLCs for training come with all the libraries installed to run a single command e.g. via TRL CLI to fine-tune LLMs on any setting, either single-GPU, single-node multi-GPU, and more. + +## Accelerate machine learning from science to production + +In addition to Hugging Face DLCs, we created a first-class Hugging Face library for inference, huggingface-inference-toolkit, that comes with the Hugging Face PyTorch DLCs for inference, with full support on serving any PyTorch model on AWS. + +Deploy your trained models for inference with just one more line of code or select any of the ever growing publicly available models from the model Hub. + +## High-performance text generation and embedding + +Besides the PyTorch-oriented DLCs, Hugging Face also provides high-performance inference for both text generation and embedding models via the Hugging Face DLCs for both Text Generation Inference (TGI) and Text Embeddings Inference (TEI), respectively. + +The Hugging Face DLC for TGI enables you to deploy any of the +225,000 text generation inference supported models from the Hugging Face Hub, or any custom model as long as its architecture is supported within TGI. 
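+
+For instance, a minimal sketch of such a deployment with the SageMaker Python SDK might look like the following (the model ID, TGI DLC version, and instance type are illustrative assumptions, not recommendations):
+
+```python
+from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
+
+# Retrieve the URI of a Hugging Face TGI DLC (version is an example)
+image_uri = get_huggingface_llm_image_uri("huggingface", version="3.2.3")
+
+# Any TGI-supported model from the Hub can be referenced via HF_MODEL_ID
+model = HuggingFaceModel(
+    role="<your-sagemaker-execution-role>",  # placeholder IAM role ARN
+    image_uri=image_uri,
+    env={"HF_MODEL_ID": "HuggingFaceH4/zephyr-7b-beta"},  # example model
+)
+
+# Deploy to a GPU endpoint (instance type is an example)
+predictor = model.deploy(initial_instance_count=1, instance_type="ml.g5.2xlarge")
+print(predictor.predict({"inputs": "What are Hugging Face DLCs?"}))
+```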
+
+The Hugging Face DLC for TEI enables you to deploy any of the +12,000 supported embedding, re-ranking or sequence classification models from the Hugging Face Hub, or any custom model as long as its architecture is supported within TEI.
+
+Additionally, these DLCs come with full support for AWS, meaning that deploying models from Amazon Simple Storage Service (S3) is also straightforward and requires no configuration.
+
+## Built-in performance
+
+Hugging Face DLCs feature built-in performance optimizations for PyTorch to train models faster. The DLCs also give you the flexibility to choose a training infrastructure that best aligns with the price/performance ratio for your workload.
+
+Hugging Face Inference DLCs provide you with production-ready endpoints that scale quickly within your AWS environment, with built-in monitoring and a ton of enterprise features.
\ No newline at end of file
diff --git a/docs/sagemaker/source/dlcs/introduction.md b/docs/sagemaker/source/dlcs/introduction.md
new file mode 100644
index 0000000000..898445b4e2
--- /dev/null
+++ b/docs/sagemaker/source/dlcs/introduction.md
@@ -0,0 +1,12 @@
+# Introduction
+
+Hugging Face built Deep Learning Containers (DLCs) for Amazon Web Services customers to run any of their machine learning workloads in an optimized environment, with no configuration or maintenance on their part. These are Docker images pre-installed with deep learning frameworks and libraries such as 🤗 Transformers, 🤗 Datasets, and 🤗 Tokenizers. The DLCs allow you to directly serve and train any model, skipping the complicated process of building and optimizing your serving and training environments from scratch.
+
+The containers are publicly maintained, updated and released periodically by Hugging Face and the AWS team, and are available to all AWS customers within AWS's Elastic Container Registry. They can be used from any AWS service such as:
+* Amazon SageMaker AI: Amazon SageMaker AI is a fully managed machine learning (ML) platform for data scientists and developers to quickly and confidently build, train, and deploy ML models into a production-ready hosted environment.
+* Amazon Bedrock: Amazon Bedrock is a fully managed service that makes high-performing foundation models (FMs) from leading AI companies and Amazon available for your use through a unified API to build generative AI applications.
+* Amazon Elastic Kubernetes Service (EKS): Amazon EKS is the premier platform for running Kubernetes clusters in the AWS cloud.
+* Amazon Elastic Container Service (ECS): Amazon ECS is a fully managed container orchestration service that helps you easily deploy, manage, and scale containerized applications.
+* Amazon Elastic Compute Cloud (EC2): Amazon EC2 provides on-demand, scalable computing capacity in the Amazon Web Services (AWS) Cloud.
+
+Hugging Face DLCs are open source and licensed under Apache 2.0. Feel free to reach out on our [community forum](https://discuss.huggingface.co/c/sagemaker/17) if you have any questions.
\ No newline at end of file
diff --git a/docs/sagemaker/source/index.md b/docs/sagemaker/source/index.md
new file mode 100644
index 0000000000..3235646e71
--- /dev/null
+++ b/docs/sagemaker/source/index.md
@@ -0,0 +1,78 @@
+# Hugging Face on AWS
+
+![cover](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sagemaker/cover.png)
+
+Hugging Face partners with Amazon Web Services (AWS) to democratize artificial intelligence (AI), enabling developers to seamlessly build, train, and deploy state-of-the-art machine learning models using AWS's robust cloud infrastructure.
+
+This collaboration aims to offer developers access to an ever-growing catalog of pre-trained models and datasets from the Hugging Face Hub, usable with Hugging Face open-source libraries across a broad spectrum of AWS services and hardware platforms.
+
+We build new experiences for developers to seamlessly train and deploy Hugging Face models whether they use AWS AI platforms such as Amazon SageMaker AI and AWS Bedrock, or AWS compute services such as Elastic Container Service (ECS), Elastic Kubernetes Service (EKS), and virtual servers on Amazon Elastic Compute Cloud (EC2).
+
+We develop new tools to simplify the adoption of custom AI accelerators like AWS Inferentia and AWS Trainium, designed to enhance the performance and cost-efficiency of machine learning workloads.
+
+By combining Hugging Face's open-source models and libraries with AWS's scalable and secure cloud services, developers can more easily and affordably incorporate advanced AI capabilities into their applications.
+
+## Deploy models on AWS
+
+Deploying Hugging Face models on AWS is streamlined through various services, each suited to different deployment scenarios. Here's how you can deploy your models using AWS and Hugging Face offerings.
+
+You can deploy any Hugging Face model on AWS with:
+- [Amazon SageMaker SDK](#deploy-with-sagemaker-sdk)
+- [Amazon SageMaker JumpStart](#deploy-with-sagemaker-jumpstart)
+- [AWS Bedrock](#deploy-with-aws-bedrock)
+- [Hugging Face Inference Endpoints](#deploy-with-hugging-face-inference-endpoints)
+- [ECS, EKS, and EC2](#deploy-with-ecs-eks-and-ec2)
+
+### Deploy with SageMaker SDK
+
+Amazon SageMaker is a fully managed AWS service for building, training, and deploying machine learning models at scale, and the SageMaker SDK lets you work with it programmatically. The SDK provides an integration specifically designed for Hugging Face models, simplifying the deployment of managed endpoints. With this integration, you can quickly deploy pre-trained Hugging Face models or your own fine-tuned models directly to SageMaker-managed endpoints, significantly reducing setup complexity and time to production.
+
+[SageMaker SDK Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/sagemaker-sdk/sagemaker-sdk-quickstart)
+
+### Deploy with SageMaker JumpStart
+
+Amazon SageMaker JumpStart is a curated model catalog from which you can deploy a model with just a few clicks. We maintain a Hugging Face section in the catalog that lets you self-host the most popular open models in your VPC with performant default configurations, powered under the hood by [Hugging Face Deep Learning Containers (DLCs)](https://huggingface.co/docs/sagemaker/main/en/dlcs/introduction).
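+
+As a minimal sketch with the SageMaker Python SDK (the model ID below is an illustrative example; the quickstart linked below walks through deployment end to end):
+
+```python
+from sagemaker.jumpstart.model import JumpStartModel
+
+# Illustrative JumpStart model ID for a Hugging Face-provided LLM
+model = JumpStartModel(model_id="huggingface-llm-qwen2-5-14b-instruct")
+
+# Provisions a SageMaker endpoint using the model's recommended defaults
+predictor = model.deploy()
+```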
+
+[SageMaker JumpStart Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/jumpstart/jumpstart-quickstart)
+
+### Deploy with AWS Bedrock
+
+Amazon Bedrock enables developers to easily build and scale generative AI applications through a single API. With Bedrock Marketplace, you can now combine the ease of use of SageMaker JumpStart with the fully managed infrastructure of Amazon Bedrock, including compatibility with high-level APIs such as Agents, Knowledge Bases, Guardrails and Model Evaluations.
+
+[AWS Bedrock Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/bedrock/bedrock-quickstart)
+
+### Deploy with Hugging Face Inference Endpoints
+
+Hugging Face Inference Endpoints let you deploy models hosted directly by Hugging Face, fully managed and optimized for performance. They are ideal for quick deployment and scalable inference workloads.
+
+[Hugging Face Inference Endpoints Quickstart](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint)
+
+### Deploy with ECS, EKS, and EC2
+
+Hugging Face provides Inference Deep Learning Containers (DLCs) to AWS users: optimized environments preconfigured with Hugging Face libraries for inference, natively integrated with the SageMaker SDK and JumpStart. The HF DLCs can also be used across other AWS services such as ECS, EKS, and EC2.
+
+AWS Elastic Container Service (ECS), Elastic Kubernetes Service (EKS), and Elastic Compute Cloud (EC2) allow you to leverage the DLCs directly.
+
+[EC2, ECS and EKS Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/compute-services/compute-services-quickstart)
+
+## Train models on AWS
+
+Training Hugging Face models on AWS is streamlined through various services. Here's how you can fine-tune your models using AWS and Hugging Face offerings.
+
+You can fine-tune any Hugging Face model on AWS with:
+- [Amazon SageMaker SDK](#train-with-sagemaker-sdk)
+- [ECS, EKS, and EC2](#train-with-ecs-eks-and-ec2)
+
+### Train with SageMaker SDK
+
+Amazon SageMaker is a fully managed AWS service for building, training, and deploying machine learning models at scale, and the SageMaker SDK lets you work with it programmatically. The SDK provides an integration specifically designed for Hugging Face models, simplifying training job management. With this integration, you can quickly create your own fine-tuned models, significantly reducing setup complexity and time to production.
+
+[SageMaker SDK Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/sagemaker-sdk/sagemaker-sdk-quickstart)
+
+### Train with ECS, EKS, and EC2
+
+Hugging Face provides Training Deep Learning Containers (DLCs) to AWS users: optimized environments preconfigured with Hugging Face libraries for training, natively integrated with the SageMaker SDK. The HF DLCs can also be used across other AWS services such as ECS, EKS, and EC2.
+
+AWS Elastic Container Service (ECS), Elastic Kubernetes Service (EKS), and Elastic Compute Cloud (EC2) allow you to leverage the DLCs directly.
+
+[EC2, ECS and EKS Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/compute-services/compute-services-quickstart)
\ No newline at end of file
diff --git a/docs/sagemaker/source/reference/inference-toolkit.md b/docs/sagemaker/source/reference/inference-toolkit.md
new file mode 100644
index 0000000000..7a20bd1d0f
--- /dev/null
+++ b/docs/sagemaker/source/reference/inference-toolkit.md
@@ -0,0 +1,141 @@
+# Inference Toolkit API
+
+## Supported tasks
+
+The Inference Toolkit accepts inputs in the `inputs` key, and supports additional [`pipelines`](https://huggingface.co/docs/transformers/main_classes/pipelines) parameters in the `parameters` key. You can provide any of the supported `kwargs` from `pipelines` as `parameters`.
+
+Tasks supported by the Inference Toolkit API include:
+
+- **`text-classification`**
+- **`sentiment-analysis`**
+- **`token-classification`**
+- **`feature-extraction`**
+- **`fill-mask`**
+- **`summarization`**
+- **`translation_xx_to_yy`**
+- **`text2text-generation`**
+- **`text-generation`**
+- **`audio-classification`**
+- **`automatic-speech-recognition`**
+- **`conversational`**
+- **`image-classification`**
+- **`image-segmentation`**
+- **`object-detection`**
+- **`table-question-answering`**
+- **`question-answering`**
+- **`zero-shot-classification`**
+- **`zero-shot-image-classification`**
+
+See the following request examples for some of the tasks:
+
+**`text-classification`**
+
+```json
+{
+  "inputs": "This sound track was beautiful! It paints the senery in your mind so well I would recommend it even to people who hate vid. game music!"
+}
+```
+
+**`sentiment-analysis`**
+
+```json
+{
+  "inputs": "Don't waste your time. We had two different people come to our house to give us estimates for a deck (one of them the OWNER). Both times, we never heard from them. Not a call, not the estimate, nothing."
+}
+```
+
+**`token-classification`**
+
+```json
+{
+  "inputs": "My name is Sylvain and I work at Hugging Face in Brooklyn."
+}
+```
+
+**`question-answering`**
+
+```json
+{
+  "inputs": {
+    "question": "What is used for inference?",
+    "context": "My Name is Philipp and I live in Nuremberg. This model is used with sagemaker for inference."
+  }
+}
+```
+
+**`zero-shot-classification`**
+
+```json
+{
+  "inputs": "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!",
+  "parameters": {
+    "candidate_labels": ["refund", "legal", "faq"]
+  }
+}
+```
+
+**`table-question-answering`**
+
+```json
+{
+  "inputs": {
+    "query": "How many stars does the transformers repository have?",
+    "table": {
+      "Repository": ["Transformers", "Datasets", "Tokenizers"],
+      "Stars": ["36542", "4512", "3934"],
+      "Contributors": ["651", "77", "34"],
+      "Programming language": ["Python", "Python", "Rust, Python and NodeJS"]
+    }
+  }
+}
+```
+
+**`parameterized-request`**
+
+```json
+{
+  "inputs": "Hugging Face, the winner of VentureBeat’s Innovation in Natural Language Process/Understanding Award for 2021, is looking to level the playing field. The team, launched by Clément Delangue and Julien Chaumond in 2016, was recognized for its work in democratizing NLP, the global market value for which is expected to hit $35.1 billion by 2026. This week, Google’s former head of Ethical AI Margaret Mitchell joined the team.",
+  "parameters": {
+    "repetition_penalty": 4.0,
+    "length_penalty": 1.5
+  }
+}
+```
+
+## Environment variables
+
+The Inference Toolkit implements various additional environment variables to simplify deployment. A complete list of Hugging Face-specific environment variables is shown below:
+
+**`HF_TASK`**
+
+`HF_TASK` defines the task for the 🤗 Transformers pipeline used for inference. See [here](https://huggingface.co/docs/transformers/main_classes/pipelines) for a complete list of tasks.
+
+```bash
+HF_TASK="question-answering"
+```
+
+**`HF_MODEL_ID`**
+
+`HF_MODEL_ID` defines the model ID, which is automatically loaded from [hf.co/models](https://huggingface.co/models) when creating a SageMaker endpoint. All of the 🤗 Hub's 10,000+ models are available through this environment variable.
+
+```bash
+HF_MODEL_ID="distilbert-base-uncased-finetuned-sst-2-english"
+```
+
+**`HF_MODEL_REVISION`**
+
+`HF_MODEL_REVISION` is an extension to `HF_MODEL_ID` and allows you to define or pin a model revision to make sure you always load the same model on your SageMaker endpoint.
+
+```bash
+HF_MODEL_REVISION="03b4d196c19d0a73c7e0322684e97db1ec397613"
+```
+
+**`HF_API_TOKEN`**
+
+`HF_API_TOKEN` defines your Hugging Face authorization token. The `HF_API_TOKEN` is used as an HTTP bearer authorization for remote files like private models. You can find your token in the [Settings](https://huggingface.co/settings/tokens) of your Hugging Face account.
+
+```bash
+HF_API_TOKEN="api_XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+```
diff --git a/docs/sagemaker/source/resources.md b/docs/sagemaker/source/resources.md
new file mode 100644
index 0000000000..eaa252d48a
--- /dev/null
+++ b/docs/sagemaker/source/resources.md
@@ -0,0 +1,58 @@
+# Resources
+
+Take a look at our published blog posts, videos, tutorials and examples, along with relevant external documentation, for additional help and more context about Hugging Face on AWS.
+
+Feel free to reach out on our [community forum](https://discuss.huggingface.co/c/sagemaker/17) if you have any questions.
+ +## Tutorials + +- [All tutorials](https://huggingface.co/docs/sagemaker/main/en/tutorials/introduction) + +## Examples + +- [All examples](https://huggingface.co/docs/sagemaker/main/en/examples/introduction) + +## Hugging Face Blogs + +- [Deploy Hugging Face models easily with Amazon SageMaker](https://huggingface.co/blog/deploy-hugging-face-models-easily-with-amazon-sagemaker) +- [Hugging Face and AWS partner to make AI more accessible](https://huggingface.co/blog/aws-partnership) +- [Introducing the Hugging Face LLM Inference Container for Amazon SageMaker](https://huggingface.co/blog/sagemaker-huggingface-llm) +- [Hugging Face Text Generation Inference available for AWS Inferentia2](https://huggingface.co/blog/text-generation-inference-on-inferentia2) +- [Subscribe to Enterprise Hub with your AWS Account](https://huggingface.co/blog/enterprise-hub-aws-marketplace) +- [Deploy models on AWS Inferentia2 from Hugging Face](https://huggingface.co/blog/inferentia-inference-endpoints) +- [Introducing the Hugging Face Embedding Container for Amazon SageMaker](https://huggingface.co/blog/sagemaker-huggingface-embedding) +- [Use Hugging Face models with Amazon Bedrock](https://huggingface.co/blog/bedrock-marketplace) + +## AWS Blogs + +- [AWS: Embracing natural language processing with Hugging Face](https://aws.amazon.com/de/blogs/opensource/embracing-natural-language-processing-with-hugging-face/) +- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing models](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/) +- [AWS and Hugging Face collaborate to make generative AI more accessible and cost efficient](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-make-generative-ai-more-accessible-and-cost-efficient/) +- [Use Amazon Bedrock tooling with Amazon SageMaker JumpStart models](https://aws.amazon.com/blogs/machine-learning/use-amazon-bedrock-tooling-with-amazon-sagemaker-jumpstart-models/) +- [Deploy RAG applications on Amazon SageMaker JumpStart using FAISS](https://aws.amazon.com/blogs/machine-learning/deploy-rag-applications-on-amazon-sagemaker-jumpstart-using-faiss/) +- [Fine-tune and host SDXL models cost-effectively with AWS Inferentia2](https://aws.amazon.com/blogs/machine-learning/fine-tune-and-host-sdxl-models-cost-effectively-with-aws-inferentia2/) +- [Achieve ~2x speed-up in LLM inference with Medusa-1 on Amazon SageMaker AI](https://aws.amazon.com/blogs/machine-learning/achieve-2x-speed-up-in-llm-inference-with-medusa-1-on-amazon-sagemaker-ai/) +- [Optimize hosting DeepSeek-R1 distilled models with Hugging Face TGI on Amazon SageMaker AI](https://aws.amazon.com/blogs/machine-learning/optimize-hosting-deepseek-r1-distilled-models-with-hugging-face-tgi-on-amazon-sagemaker-ai/) + +## Videos + +- [Walkthrough: End-to-End Text Classification](https://youtu.be/ok3hetb42gU) +- [Working with Hugging Face models on Amazon SageMaker](https://youtu.be/leyrCgLAGjMn) +- [Deploy a Hugging Face Transformers Model from S3 to Amazon SageMaker](https://youtu.be/pfBGgSGnYLs) +- [Deploy a Hugging Face Transformers Model from the Model Hub to Amazon SageMaker](https://youtu.be/l9QZuazbzWM) +- [Training with Hugging Face on Amazon SageMaker](https://www.youtube.com/watch?v=BqQ14SZ5tos) +- [Hosting with Hugging Face on Amazon SageMaker](https://www.youtube.com/watch?v=oVIvXfeunv8) +- [Introduction to Hugging Face on Amazon 
SageMaker](https://www.youtube.com/watch?v=80ix-IyNnQI)
+
+## External Documentation
+
+- [Hugging Face on AWS](https://aws.amazon.com/ai/hugging-face/)
+- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
+- [Python SDK SageMaker documentation for Hugging Face](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)
+- [Deep Learning Containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers)
+- [LLM Hosting Container](https://github.com/awslabs/llm-hosting-container)
+
+## Workshops
+
+- [Enterprise-Scale NLP with Hugging Face & Amazon SageMaker](https://github.com/philschmid/huggingface-sagemaker-workshop-series/tree/main)
diff --git a/docs/sagemaker/source/tutorials/bedrock/bedrock-quickstart.md b/docs/sagemaker/source/tutorials/bedrock/bedrock-quickstart.md
new file mode 100644
index 0000000000..d1b7f7068d
--- /dev/null
+++ b/docs/sagemaker/source/tutorials/bedrock/bedrock-quickstart.md
@@ -0,0 +1,77 @@
+# Quickstart — Using Hugging Face Models with Amazon Bedrock Marketplace
+
+## Why use Bedrock Marketplace for Hugging Face models?
+
+Amazon Bedrock now exposes 83 Hugging Face open-weight models—including Gemma, Llama 3, Mistral, and more—through a single catalog. You invoke them with the same Bedrock APIs you already use for Titan, Anthropic, Cohere, etc. Under the hood, Bedrock Marketplace model endpoints are managed by Amazon SageMaker AI. With Bedrock Marketplace, you can now combine the ease of use of SageMaker JumpStart with the fully managed infrastructure of Amazon Bedrock, including compatibility with high-level APIs such as Agents, Knowledge Bases, Guardrails and Model Evaluations.
+
+## 1. Prerequisites
+
+| Requirement | Notes |
+|---|---|
+| AWS account in a Bedrock Region | Marketplace is regional; switch the console to one of the 14 supported Regions first. |
+| Permissions | For a quick trial, attach AmazonBedrockFullAccess and AmazonSageMakerFullAccess. |
+| Service quotas | The SageMaker endpoint uses GPU instances (for example ml.g5). Verify you have quota or request it. |
+| JumpStart-only | If you choose path B, create a SageMaker Studio domain and user profile first (Console ▸ SageMaker ▸ Domains). Open Studio before continuing. |
+
+When registering your SageMaker JumpStart endpoints in Amazon Bedrock, you only pay for the SageMaker compute resources, and regular Amazon Bedrock API prices apply.
+
+## 2. Endpoint deployment
+
+There are two equivalent paths to use a Hugging Face model with Amazon Bedrock Marketplace.
+
+Path A is from the Bedrock *Model Catalog*:
+1. Console → Amazon Bedrock → Foundation Models → Model catalog
+2. Filter Provider → “Hugging Face”, then pick your model (e.g., Gemma 2 27B Instruct)
+3. If you see Subscribe, review pricing & terms, click Subscribe, then continue
+4. Click Deploy → name the endpoint → keep the recommended instance → accept the EULA → Deploy
+5. Wait for Foundation Models → Marketplace deployments to show status In service (takes a few minutes)
+6. Click the deployment name and copy the SageMaker endpoint ARN — you’ll need it for API calls
+
+Path B is from SageMaker JumpStart, for models that show “Use with Bedrock”:
+1. In SageMaker Studio, open JumpStart
+2. Filter Bedrock Ready models → select the model card (e.g., Gemma 2 9B Instruct)
+3. Click Deploy, accept the EULA, keep defaults, Deploy
+4. Studio → Deployments → Endpoints → wait for status In service
+5. Click the endpoint, choose Use with Bedrock
+6. In the Bedrock console, review and Register → a new entry appears under Marketplace deployments
+7. Open that entry and copy the SageMaker endpoint ARN for code samples
+
+## 3. Test interactively
+
+To test the model interactively in the console, select the model under Marketplace deployments, open it in the playground, and send a prompt in Chat/Text mode to verify the model's response.
+
+Alternatively, you can access your endpoint programmatically.
+
+```python
+import boto3
+
+bedrock = boto3.client("bedrock-runtime")
+
+# Paste the endpoint ARN you copied above
+endpoint_arn = "arn:aws:sagemaker:::endpoint/"
+
+inference_cfg = {"maxTokens": 256, "temperature": 0.1, "topP": 0.95}
+extra = {"parameters": {"repetition_penalty": 1.05}}
+
+response = bedrock.converse(
+    modelId=endpoint_arn,  # <- SageMaker endpoint ARN
+    messages=[{
+        "role": "user",
+        "content": [{"text": "Give me three taglines for a serverless AI startup"}]
+    }],
+    inferenceConfig=inference_cfg,
+    additionalModelRequestFields=extra,
+)
+
+print(response["output"]["message"]["content"][0]["text"])
+```
+
+*Heads‑up*: the same `modelId=endpoint_arn` works with **InvokeModel**, **Knowledge Bases (RetrieveAndGenerate)**, **Agents**, and **Guardrails**—no code changes.
+
+## 4. Clean‑up (stop charges)
+
+| Resource | How to delete |
+|----------|---------------|
+| SageMaker endpoint | Console → Marketplace deployments → select → Delete (also de‑registers it) • *or* `boto3.client("sagemaker").delete_endpoint(...)` |
+| Optional extras | Delete Knowledge Base, Guardrail, or S3 vectors if you created them. |
+
+For more information, refer to the [Bedrock documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html).
diff --git a/docs/sagemaker/source/tutorials/compute-services/compute-services-quickstart.md b/docs/sagemaker/source/tutorials/compute-services/compute-services-quickstart.md
new file mode 100644
index 0000000000..d293e39abb
--- /dev/null
+++ b/docs/sagemaker/source/tutorials/compute-services/compute-services-quickstart.md
@@ -0,0 +1,3 @@
+# EC2, ECS and EKS Quickstart
+
+This page is under construction; bear with us!
\ No newline at end of file
diff --git a/docs/sagemaker/source/tutorials/index.md b/docs/sagemaker/source/tutorials/index.md
new file mode 100644
index 0000000000..8a67efb015
--- /dev/null
+++ b/docs/sagemaker/source/tutorials/index.md
@@ -0,0 +1,21 @@
+# How to
+
+Take a look at our tutorials about using Hugging Face models on AWS.
+
+## SageMaker SDK
+
+- [SageMaker SDK Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/sagemaker-sdk/sagemaker-sdk-quickstart)
+- [Train models with SageMaker SDK](https://huggingface.co/docs/sagemaker/main/en/tutorials/sagemaker-sdk/training-sagemaker-sdk)
+- [Deploy models with SageMaker SDK](https://huggingface.co/docs/sagemaker/main/en/tutorials/sagemaker-sdk/deploy-sagemaker-sdk)
+
+## JumpStart
+
+- [SageMaker JumpStart Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/jumpstart/jumpstart-quickstart)
+
+## Bedrock
+
+- [AWS Bedrock Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/bedrock/bedrock-quickstart)
+
+## EC2, ECS and EKS
+
+- [EC2, ECS and EKS Quickstart](https://huggingface.co/docs/sagemaker/main/en/tutorials/compute-services/compute-services-quickstart)
diff --git a/docs/sagemaker/source/tutorials/jumpstart/jumpstart-quickstart.md b/docs/sagemaker/source/tutorials/jumpstart/jumpstart-quickstart.md
new file mode 100644
index 0000000000..aa55fd290d
--- /dev/null
+++ b/docs/sagemaker/source/tutorials/jumpstart/jumpstart-quickstart.md
@@ -0,0 +1,86 @@
+# Quickstart - Deploy Hugging Face Models with SageMaker JumpStart
+
+## Why use SageMaker JumpStart for Hugging Face models?
+
+Amazon SageMaker JumpStart lets you deploy the most popular open Hugging Face models with one click—inside your own AWS account. JumpStart offers a curated [selection](https://aws.amazon.com/sagemaker-ai/jumpstart/getting-started/?sagemaker-jumpstart-cards.sort-by=item.additionalFields.model-name&sagemaker-jumpstart-cards.sort-order=asc&awsf.sagemaker-jumpstart-filter-product-type=*all&awsf.sagemaker-jumpstart-filter-text=*all&awsf.sagemaker-jumpstart-filter-vision=*all&awsf.sagemaker-jumpstart-filter-tabular=*all&awsf.sagemaker-jumpstart-filter-audio-tasks=*all&awsf.sagemaker-jumpstart-filter-multimodal=*all&awsf.sagemaker-jumpstart-filter-RL=*all&awsm.page-sagemaker-jumpstart-cards=1&sagemaker-jumpstart-cards.q=qwen&sagemaker-jumpstart-cards.q_operator=AND) of model checkpoints for various tasks, including text generation, embeddings, vision, audio, and more. Most models are deployed using the official [Hugging Face Deep Learning Containers](https://huggingface.co/docs/sagemaker/main/en/dlcs/introduction) with a sensible default instance type, so you can move from idea to production in minutes.
+
+In this quickstart guide, we will deploy [Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct).
+
+## 1. Prerequisites
+
+| Requirement | Notes |
+|---|---|
+| AWS account with SageMaker enabled | An AWS account that will contain all your AWS resources. |
+| An IAM role to access SageMaker AI | Learn more about how IAM works with SageMaker AI in this [guide](https://docs.aws.amazon.com/sagemaker/latest/dg/security-iam.html). |
+| SageMaker Studio domain and user profile | We recommend using SageMaker Studio for straightforward deployment and inference. Follow this [guide](https://docs.aws.amazon.com/sagemaker/latest/dg/onboard-quick-start.html). |
+| Service quotas | Most LLMs need GPU instances (e.g. ml.g5). Verify you have quota for ml.g5.24xlarge or [request it](https://docs.aws.amazon.com/sagemaker/latest/dg/canvas-requesting-quota-increases.html). |
+
+## 2. Endpoint deployment
+
+Here's how to deploy a Hugging Face model to SageMaker by browsing the JumpStart catalog:
+1. Open SageMaker → JumpStart.
+2. Filter “Hugging Face” or search for your model (e.g. Qwen2.5-14B).
+3. Click Deploy → (optional) adjust instance size / count → Deploy.
+4. Wait until Endpoints shows In service.
+5. Copy the Endpoint name (or ARN) for later use.
+
+Alternatively, you can browse the Hugging Face Model Hub:
+1. Open the model page → Click Deploy → SageMaker → JumpStart tab, if the model is available.
+2. Copy the code snippet and use it from a SageMaker Notebook instance.
+
+```python
+# SageMaker JumpStart provides APIs as part of the SageMaker SDK that allow you to deploy
+# and fine-tune models in network isolation using scripts that SageMaker maintains.
+
+from sagemaker.jumpstart.model import JumpStartModel
+
+model = JumpStartModel(model_id="huggingface-llm-qwen2-5-14b-instruct")
+example_payloads = model.retrieve_all_examples()
+
+predictor = model.deploy()
+
+for payload in example_payloads:
+    response = predictor.predict(payload.body)
+    print("Input:\n", payload.body[payload.prompt_key])
+    print("Output:\n", response[0]["generated_text"], "\n\n===============\n")
+```
+
+The endpoint creation can take several minutes, depending on the size of the model.
+
+## 3. Test interactively
+
+If you deployed through the console, grab the endpoint name and reuse it in your code.
+
+```python
+from sagemaker.predictor import retrieve_default
+
+endpoint_name = "MY ENDPOINT NAME"
+predictor = retrieve_default(endpoint_name)
+payload = {
+    "messages": [
+        {
+            "role": "system",
+            "content": "You are a passionate data scientist."
+        },
+        {
+            "role": "user",
+            "content": "what is machine learning?"
+        }
+    ],
+    "max_tokens": 2048,
+    "temperature": 0.7,
+    "top_p": 0.9,
+    "stream": False
+}
+
+response = predictor.predict(payload)
+print(response)
+```
+
+The endpoint supports the OpenAI API specification.
+
+## 4. Clean‑up
+
+To avoid incurring unnecessary costs, when you're done, delete the SageMaker endpoint in the Deployments → Endpoints console or with the following code snippet:
+
+```python
+predictor.delete_model()
+predictor.delete_endpoint()
+```
\ No newline at end of file
diff --git a/docs/sagemaker/inference.md b/docs/sagemaker/source/tutorials/sagemaker-sdk/deploy-sagemaker-sdk.md
similarity index 99%
rename from docs/sagemaker/inference.md
rename to docs/sagemaker/source/tutorials/sagemaker-sdk/deploy-sagemaker-sdk.md
index ae440e3043..85670adfe8 100644
--- a/docs/sagemaker/inference.md
+++ b/docs/sagemaker/source/tutorials/sagemaker-sdk/deploy-sagemaker-sdk.md
@@ -95,7 +95,7 @@ predictor = hf_estimator.deploy(initial_instance_count=1, instance_type="ml.m5.x
 
 # example request: you always need to define "inputs"
 data = {
-   "inputs": "Camera - You are awarded a SiPix Digital Camera! call 09061221066 fromm landline. Delivery within 28 days."
+   "inputs": "Camera - You are awarded a SiPix Digital Camera! call 09061221066 from landline. Delivery within 28 days."
 }
 
 # request
@@ -133,7 +133,7 @@ predictor = huggingface_model.deploy(
 
 # example request: you always need to define "inputs"
 data = {
-   "inputs": "Camera - You are awarded a SiPix Digital Camera! call 09061221066 fromm landline. Delivery within 28 days."
+   "inputs": "Camera - You are awarded a SiPix Digital Camera! call 09061221066 from landline. Delivery within 28 days."
} # request diff --git a/docs/sagemaker/getting-started.md b/docs/sagemaker/source/tutorials/sagemaker-sdk/sagemaker-sdk-quickstart.md similarity index 82% rename from docs/sagemaker/getting-started.md rename to docs/sagemaker/source/tutorials/sagemaker-sdk/sagemaker-sdk-quickstart.md index 5e8034c0f8..67c052c432 100644 --- a/docs/sagemaker/getting-started.md +++ b/docs/sagemaker/source/tutorials/sagemaker-sdk/sagemaker-sdk-quickstart.md @@ -1,16 +1,14 @@ -# Train and deploy Hugging Face on Amazon SageMaker +# Train and deploy a Hugging Face model on Amazon SageMaker with the SDK -The get started guide will show you how to quickly use Hugging Face on Amazon SageMaker. Learn how to fine-tune and deploy a pretrained 🤗 Transformers model on SageMaker for a binary text classification task. - -💡 If you are new to Hugging Face, we recommend first reading the 🤗 Transformers [quick tour](https://huggingface.co/docs/transformers/quicktour). +The get started guide will show you how to quickly use Hugging Face on Amazon SageMaker with the SDK. Learn how to fine-tune and deploy a pretrained 🤗 Transformers model on SageMaker for a binary text classification task. -📓 Open the [agemaker-notebook.ipynb file](https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb) to follow along! +📓 Open the [sagemaker-notebook.ipynb file](https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb) to follow along! ## Installation and setup -Get started by installing the necessary Hugging Face libraries and SageMaker. You will also need to install [PyTorch](https://pytorch.org/get-started/locally/) and [TensorFlow](https://www.tensorflow.org/install/pip#tensorflow-2-packages-are-available) if you don't already have it installed. +Get started by installing the necessary Hugging Face libraries and SageMaker. You will also need to install [PyTorch](https://pytorch.org/get-started/locally/) if you don't already have it installed. If you run this example in SageMaker Studio, it is already installed in the notebook kernel! ```python pip install "sagemaker>=2.140.0" "transformers==4.26.1" "datasets[s3]==2.10.1" --upgrade @@ -108,10 +106,10 @@ huggingface_estimator = HuggingFace( source_dir="./scripts", # directory where fine-tuning script is stored instance_type="ml.p3.2xlarge", # instance type instance_count=1, # number of instances - role=role, # IAM role used in training job to acccess AWS resources (S3) - transformers_version="4.26", # Transformers version - pytorch_version="1.13", # PyTorch version - py_version="py39", # Python version + role=role, # IAM role used in training job to access AWS resources (S3) + transformers_version="4.36", # Transformers version + pytorch_version="2.1.0", # PyTorch version + py_version="py310", # Python version hyperparameters=hyperparameters # hyperparameters to use in training job ) ``` @@ -146,6 +144,4 @@ predictor.delete_endpoint() ## What's next? -Congratulations, you've just fine-tuned and deployed a pretrained 🤗 Transformers model on SageMaker! 🎉 - -For your next steps, keep reading our documentation for more details about training and deployment. There are many interesting features such as [distributed training](/docs/sagemaker/train#distributed-training) and [Spot instances](/docs/sagemaker/train#spot-instances). +Congratulations, you've just fine-tuned and deployed a pretrained 🤗 Transformers model on SageMaker for binary text classification! 
🎉 diff --git a/docs/sagemaker/train.md b/docs/sagemaker/source/tutorials/sagemaker-sdk/training-sagemaker-sdk.md similarity index 100% rename from docs/sagemaker/train.md rename to docs/sagemaker/source/tutorials/sagemaker-sdk/training-sagemaker-sdk.md