Commit 4164089

Merge pull request #120 from aws/inf2
Add Inferentia2 and Optimum Neuron Support
2 parents c2440c1 + 8c9fc10 commit 4164089

File tree

9 files changed: +95 −70 lines changed

README.md

Lines changed: 38 additions & 0 deletions
@@ -157,7 +157,14 @@ The custom module can override the following methods:
 * `output_fn(prediction, accept)`: overrides the default method for postprocessing; the return value `result` will be the response of your request (e.g. `JSON`). The inputs are `prediction`, the result of the `predict()` method, and `accept`, the accept type from the HTTP request, e.g. `application/json`.


+## 🏎️ Deploy Models on AWS Inferentia2
+
+The SageMaker Hugging Face Inference Toolkit provides support for deploying Hugging Face models on AWS Inferentia2. To deploy a model on Inferentia2 you have 3 options:
+* Provide `HF_MODEL_ID`, the model repo id on huggingface.co of a repository that contains the model compiled to the `.neuron` format, e.g. `optimum/bge-base-en-v1.5-neuronx`
+* Provide the `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH` environment variables to compile the model on the fly, e.g. `HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128`
+* Include a `neuron` dictionary in the [config.json](https://huggingface.co/optimum/tiny_random_bert_neuron/blob/main/config.json) file of the model archive, e.g. `"neuron": {"static_batch_size": 1, "static_sequence_length": 128}`
+
+The currently supported tasks can be found [here](https://huggingface.co/docs/optimum-neuron/en/package_reference/supported_models). If you plan to deploy an LLM, we recommend taking a look at [Neuronx TGI](https://huggingface.co/blog/text-generation-inference-on-inferentia2), which is purpose-built for LLMs.

 ---
 ## 🤝 Contributing
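For orientation (not part of this diff), a minimal deployment sketch for the first option above, using the SageMaker Python SDK; the IAM role, DLC image URI, and instance type are placeholders, not values taken from this PR:

```python
# Sketch only (not part of this PR): deploy a Hub repo that already contains compiled
# .neuron artifacts to an Inferentia2 endpoint. Role, image URI and instance type are
# placeholder values to replace with your own.
from sagemaker.huggingface import HuggingFaceModel

model = HuggingFaceModel(
    env={
        "HF_MODEL_ID": "optimum/bge-base-en-v1.5-neuronx",  # pre-compiled Neuron model repo
        "HF_TASK": "feature-extraction",
    },
    role="arn:aws:iam::111122223333:role/MySageMakerExecutionRole",  # placeholder
    image_uri="<huggingface-pytorch-inference-neuronx DLC image URI>",  # placeholder
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf2.xlarge",
)

print(predictor.predict({"inputs": "I like you."}))
```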
@@ -201,4 +208,35 @@ curl --request POST \
   --header 'Content-Type: application/json' \
   --data "{\"inputs\": \"Camera\"}" \
   --output image.png
+```
+
+
+## Run Inferentia2 Model Locally
+
+_Note: You need to run this on an Inferentia2 instance._
+
+1. Manually change `MMS_CONFIG_FILE`:
+```
+wget -O sagemaker-mms.properties https://github.com/raw/aws/deep-learning-containers/master/huggingface/build_artifacts/inference/config.properties
+```
+
+2. Adjust `handler_service.py` and comment out `if content_type in content_types.UTF8_TYPES:`, which is needed for SageMaker but cannot be used locally.
+
+3. Run the container:
+
+- transformers `text-classification` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+```
+HF_MODEL_ID="distilbert/distilbert-base-uncased-finetuned-sst-2-english" HF_TASK="text-classification" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
+```
+- sentence transformers `feature-extraction` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+```
+HF_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2" HF_TASK="feature-extraction" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
+```
+
+4. Send a request:
+```
+curl --request POST \
+  --url http://localhost:8080/invocations \
+  --header 'Content-Type: application/json' \
+  --data "{\"inputs\": \"I like you.\"}"
 ```
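The same local invocation can also be issued from Python; a small sketch (not part of this diff), assuming the `requests` package is installed:

```python
# Sketch only (not part of this PR): mirror of the curl call above against the
# locally running model server started in the previous steps.
import requests

response = requests.post(
    "http://localhost:8080/invocations",
    json={"inputs": "I like you."},
    timeout=60,
)
response.raise_for_status()
print(response.json())  # e.g. [{"label": "POSITIVE", "score": 0.99}] for text-classification
```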

makefile

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ check_dirs := src tests
 # run tests

 unit-test:
-	python -m pytest -n auto --dist loadfile -s -v ./tests/unit/
+	python -m pytest -v -s ./tests/unit/

 integ-test:
 	python -m pytest -n 2 -s -v ./tests/integ/

setup.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@


 extras["test"] = [
-    "pytest",
+    "pytest<8",
     "pytest-xdist",
    "parameterized",
    "psutil",

src/sagemaker_huggingface_inference_toolkit/mms_model_server.py

Lines changed: 10 additions & 6 deletions
@@ -33,11 +33,11 @@
 )

 from sagemaker_huggingface_inference_toolkit import handler_service
+from sagemaker_huggingface_inference_toolkit.optimum_utils import is_optimum_neuron_available
 from sagemaker_huggingface_inference_toolkit.transformers_utils import (
     HF_API_TOKEN,
     HF_MODEL_REVISION,
     _load_model_from_hub,
-    is_aws_neuron_available,
 )


@@ -73,11 +73,6 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):
     elif use_hf_hub:
         # Use different model store directory
         model_store = DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY
-        if is_aws_neuron_available():
-            raise ValueError(
-                "Hugging Face Hub deployments are currently not supported with AWS Neuron and Inferentia."
-                "You need to create a `inference.py` script to run your model using AWS Neuron"
-            )
         storage_dir = _load_model_from_hub(
             model_id=os.environ["HF_MODEL_ID"],
             model_dir=DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY,
@@ -90,6 +85,15 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):

     env = environment.Environment()

+    # Set the number of workers to available number if optimum neuron is available and not already set
+    if is_optimum_neuron_available() and os.environ.get("SAGEMAKER_MODEL_SERVER_WORKERS", None) is None:
+        from optimum.neuron.utils.cache_utils import get_num_neuron_cores
+
+        try:
+            env._model_server_workers = str(get_num_neuron_cores())
+        except Exception:
+            env._model_server_workers = "1"
+
     # Note: multi-model default config already sets default_service_handler
     handler_service_for_config = None if ENABLE_MULTI_MODEL else handler_service
     _create_model_server_config_file(env, handler_service_for_config)
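The worker-count fallback above pairs with the `NEURON_RT_NUM_CORES=1` setting added in `optimum_utils.py` below: one MMS worker is started per NeuronCore, and each worker pins a single core. A rough sketch of that pairing (illustration only, not part of this diff; `get_num_neuron_cores` is the same helper the hunk above imports from optimum-neuron):

```python
# Sketch only (not part of this PR): how the two settings combine at runtime. With
# NEURON_RT_NUM_CORES=1 every worker process claims exactly one NeuronCore, so
# starting one worker per core saturates the Inferentia2 device.
import os


def resolve_worker_count(default: int = 1) -> int:
    """Use the Neuron core count when optimum-neuron can report it, else fall back."""
    try:
        from optimum.neuron.utils.cache_utils import get_num_neuron_cores

        return get_num_neuron_cores()
    except Exception:
        return default


os.environ.setdefault("SAGEMAKER_MODEL_SERVER_WORKERS", str(resolve_worker_count()))
os.environ["NEURON_RT_NUM_CORES"] = "1"  # each worker binds a single core
```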

src/sagemaker_huggingface_inference_toolkit/optimum_utils.py

Lines changed: 26 additions & 16 deletions
@@ -38,21 +38,25 @@ def get_input_shapes(model_dir):
     # try to get input shapes from config file
     try:
         config = AutoConfig.from_pretrained(model_dir)
-        if hasattr(config, "neuron_batch_size") and hasattr(config, "neuron_sequence_length"):
-            input_shapes["batch_size"] = config.neuron_batch_size
-            input_shapes["sequence_length"] = config.neuron_sequence_length
-            input_shapes_available = True
-            logger.info(
-                f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
-            )
-        if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
-            logger.warning(
-                "HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
-            )
-        if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
-            logger.warning(
-                "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
-            )
+        if hasattr(config, "neuron"):
+            # check if static batch size and sequence length are available
+            if config.neuron.get("static_batch_size", None) and config.neuron.get("static_sequence_length", None):
+                input_shapes["batch_size"] = config.neuron["static_batch_size"]
+                input_shapes["sequence_length"] = config.neuron["static_sequence_length"]
+                input_shapes_available = True
+                logger.info(
+                    f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
+                )
+            else:
+                # Add warning if environment variables are set but will be ignored
+                if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
+                    logger.warning(
+                        "HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+                    )
+                if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
+                    logger.warning(
+                        "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+                    )
     except Exception:
         input_shapes_available = False

@@ -62,6 +66,11 @@ def get_input_shapes(model_dir):

     # extract input shapes from environment variables
     sequence_length = os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None)
+    if sequence_length is None:
+        raise ValueError(
+            "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is not set. Please set HF_OPTIMUM_SEQUENCE_LENGTH to a positive integer."
+        )
+
     if not int(sequence_length) > 0:
         raise ValueError(
             f"HF_OPTIMUM_SEQUENCE_LENGTH must be set to a positive integer. Current value is {sequence_length}"
@@ -73,10 +82,9 @@ def get_input_shapes(model_dir):
     return {"batch_size": int(batch_size), "sequence_length": int(sequence_length)}


-# TODO: not used yet, need to sync on how to determine if we are running on inf2 instance
 def get_optimum_neuron_pipeline(task, model_dir):
     """Method to get optimum neuron pipeline for a given task. Method checks if task is supported by optimum neuron and if required environment variables are set, in case model is not converted. If all checks pass, optimum neuron pipeline is returned. If checks fail, an error is raised."""
-    from optimum.neuron.pipelines import NEURONX_SUPPORTED_TASKS, pipeline
+    from optimum.neuron.pipelines.transformers.base import NEURONX_SUPPORTED_TASKS, pipeline
     from optimum.neuron.utils import NEURON_FILE_NAME

     # check task support
@@ -94,6 +102,8 @@ def get_optimum_neuron_pipeline(task, model_dir):

     # get static input shapes to run inference
     input_shapes = get_input_shapes(model_dir)
+    # set NEURON_RT_NUM_CORES to 1 to avoid conflicts with multiple HTTP workers
+    os.environ["NEURON_RT_NUM_CORES"] = "1"
     # get optimum neuron pipeline
     neuron_pipe = pipeline(task, model=model_dir, export=export, input_shapes=input_shapes)
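Not part of this diff: the rewritten `get_input_shapes` can be exercised without a compiled model. A minimal sketch, assuming `transformers` and this toolkit are installed; the `bert` `model_type` and the temporary directory are placeholders used only so `AutoConfig` can parse the file:

```python
# Sketch only (not part of this PR): show that a `neuron` entry in config.json is
# picked up by get_input_shapes(). The model_type and directory are placeholders.
import json
import tempfile

from sagemaker_huggingface_inference_toolkit.optimum_utils import get_input_shapes

with tempfile.TemporaryDirectory() as model_dir:
    with open(f"{model_dir}/config.json", "w") as f:
        json.dump(
            {
                "model_type": "bert",  # any model_type AutoConfig knows about
                "neuron": {"static_batch_size": 1, "static_sequence_length": 128},
            },
            f,
        )

    print(get_input_shapes(model_dir))  # expected: {'batch_size': 1, 'sequence_length': 128}
```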

src/sagemaker_huggingface_inference_toolkit/transformers_utils.py

Lines changed: 12 additions & 3 deletions
@@ -24,6 +24,10 @@
 from transformers.pipelines import Conversation, Pipeline

 from sagemaker_huggingface_inference_toolkit.diffusers_utils import get_diffusers_pipeline, is_diffusers_available
+from sagemaker_huggingface_inference_toolkit.optimum_utils import (
+    get_optimum_neuron_pipeline,
+    is_optimum_neuron_available,
+)


 if is_tf_available():
@@ -71,6 +75,7 @@ def strtobool(val):
     "savedmodel": "*tar.gz",
     "openvino": "*openvino*",
     "ckpt": "*ckpt",
+    "neuronx": "*neuron",
 }


@@ -202,7 +207,9 @@ def _load_model_from_hub(
     # check if safetensors weights are available
     if framework == "pytorch":
         files = HfApi().model_info(model_id).siblings
-        if any(f.rfilename.endswith("safetensors") for f in files):
+        if is_optimum_neuron_available() and any(f.rfilename.endswith("neuron") for f in files):
+            framework = "neuronx"
+        elif any(f.rfilename.endswith("safetensors") for f in files):
             framework = "safetensors"

     # create regex to only include the framework specific weights
@@ -282,8 +289,10 @@ def get_pipeline(task: str, device: int, model_dir: Path, **kwargs) -> Pipeline:
         kwargs["feature_extractor"] = model_dir
     else:
         kwargs["tokenizer"] = model_dir
-
-    if TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
+    # check if optimum neuron is available and tries to load it
+    if is_optimum_neuron_available():
+        hf_pipeline = get_optimum_neuron_pipeline(task=task, model_dir=model_dir)
+    elif TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
         tokenizer = AutoTokenizer.from_pretrained(os.environ["HF_MODEL_ID"])

         hf_pipeline = pipeline(
tests/unit/test_handler_service_without_context.py

Lines changed: 6 additions & 9 deletions
@@ -77,9 +77,7 @@ def test_handle(inference_handler):
     inference_handler.initialize(CONTEXT)
     json_data = json.dumps(INPUT)
     prediction = inference_handler.handle([{"body": json_data.encode()}], CONTEXT)
-    loaded_response = json.loads(prediction[0])
-    assert "entity" in loaded_response[0]
-    assert "score" in loaded_response[0]
+    assert "output" in prediction[0]


 @require_torch
@@ -90,13 +88,15 @@ def test_load(inference_handler):
         model_dir=tmpdirname,
     )
     # test with automatic infer
+    if "HF_TASK" in os.environ:
+        del os.environ["HF_TASK"]
     hf_pipeline_without_task = inference_handler.load(storage_folder)
     assert hf_pipeline_without_task.task == "token-classification"

     # test with automatic infer
-    os.environ["HF_TASK"] = TASK
+    os.environ["HF_TASK"] = "text-classification"
     hf_pipeline_with_task = inference_handler.load(storage_folder)
-    assert hf_pipeline_with_task.task == TASK
+    assert hf_pipeline_with_task.task == "text-classification"


 def test_preprocess(inference_handler):
@@ -139,10 +139,7 @@ def test_validate_and_initialize_user_module(inference_handler):
     prediction = inference_handler.handle([{"body": b""}], CONTEXT)
     assert "output" in prediction[0]

-    assert inference_handler.load({}) == "model"
-    assert inference_handler.preprocess({}, "") == "data"
-    assert inference_handler.predict({}, "model") == "output"
-    assert inference_handler.postprocess("output", "") == "output"
+    assert inference_handler.load({}) == "Loading inference_tranform_fn.py"


 def test_validate_and_initialize_user_module_transform_fn():

tests/unit/test_mms_model_server.py

Lines changed: 0 additions & 33 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.import os
 import os

-import pytest
 from sagemaker_inference.environment import model_dir

 from mock import patch
@@ -186,35 +185,3 @@ def test_start_mms_with_model_from_hub(
     subprocess_popen.assert_called_once_with(multi_model_server_cmd)
     sigterm.assert_called_once_with(retrieve.return_value)
     os.remove(mms_model_server.DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY)
-
-
-@patch("sagemaker_huggingface_inference_toolkit.transformers_utils._aws_neuron_available", return_value=True)
-@patch("subprocess.call")
-@patch("subprocess.Popen")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._retry_retrieve_mms_server_process")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._load_model_from_hub")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._add_sigterm_handler")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._install_requirements")
-@patch("os.makedirs", return_value=True)
-@patch("os.remove", return_value=True)
-@patch("os.path.exists", return_value=True)
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._create_model_server_config_file")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._adapt_to_mms_format")
-def test_start_mms_neuron_and_model_from_hub(
-    adapt,
-    create_config,
-    exists,
-    remove,
-    dir,
-    install_requirements,
-    sigterm,
-    load_model_from_hub,
-    retrieve,
-    subprocess_popen,
-    subprocess_call,
-    _aws_neuron_available,
-):
-    with pytest.raises(ValueError):
-        os.environ["HF_MODEL_ID"] = "lysandre/tiny-bert-random"
-
-        mms_model_server.start_model_server()

tests/unit/test_optimum_utils.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def test_get_input_shapes_from_file():
     )
     input_shapes = get_input_shapes(model_dir=storage_folder)
     assert input_shapes["batch_size"] == 1
-    assert input_shapes["sequence_length"] == 16
+    assert input_shapes["sequence_length"] == 32


 @require_torch
