Commit 4164089

Merge pull request #120 from aws/inf2
Add Inferentia2 and Optimum Neuron Support
2 parents c2440c1 + 8c9fc10 commit 4164089

File tree

9 files changed: +95 −70 lines changed

README.md

Lines changed: 38 additions & 0 deletions
@@ -157,7 +157,14 @@ The custom module can override the following methods:
 * `output_fn(prediction, accept)`: overrides the default method for postprocessing; the return value `result` will be the response of your request (e.g. `JSON`). The inputs are `prediction`, the result of the `predict()` method, and `accept`, the accept type from the HTTP request, e.g. `application/json`.


+## 🏎️ Deploy Models on AWS Inferentia2
+
+The SageMaker Hugging Face Inference Toolkit provides support for deploying Hugging Face models on AWS Inferentia2. To deploy a model on Inferentia2 you have 3 options:
+* Provide `HF_MODEL_ID`, the model repo id on huggingface.co of a repository that contains the model compiled to the `.neuron` format, e.g. `optimum/bge-base-en-v1.5-neuronx`
+* Provide the `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH` environment variables to compile the model on the fly, e.g. `HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128`
+* Include a `neuron` dictionary in the [config.json](https://huggingface.co/optimum/tiny_random_bert_neuron/blob/main/config.json) file of the model archive, e.g. `"neuron": {"static_batch_size": 1, "static_sequence_length": 128}`
+
+The currently supported tasks can be found [here](https://huggingface.co/docs/optimum-neuron/en/package_reference/supported_models). If you plan to deploy an LLM, we recommend taking a look at [Neuronx TGI](https://huggingface.co/blog/text-generation-inference-on-inferentia2), which is purpose-built for LLMs.

 ---
 ## 🤝 Contributing
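For orientation (not part of this diff), a minimal deployment sketch for the first option above, using the SageMaker Python SDK; the IAM role, DLC image URI, and instance type are placeholders, not values taken from this PR:

```python
# Sketch only (not part of this PR): deploy a Hub repo that already contains compiled
# .neuron artifacts to an Inferentia2 endpoint. Role, image URI and instance type are
# placeholder values to replace with your own.
from sagemaker.huggingface import HuggingFaceModel

model = HuggingFaceModel(
    env={
        "HF_MODEL_ID": "optimum/bge-base-en-v1.5-neuronx",  # pre-compiled Neuron model repo
        "HF_TASK": "feature-extraction",
    },
    role="arn:aws:iam::111122223333:role/MySageMakerExecutionRole",  # placeholder
    image_uri="<huggingface-pytorch-inference-neuronx DLC image URI>",  # placeholder
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf2.xlarge",
)

print(predictor.predict({"inputs": "I like you."}))
```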
@@ -201,4 +208,35 @@ curl --request POST \
   --header 'Content-Type: application/json' \
   --data "{\"inputs\": \"Camera\"}" \
   --output image.png
+```
+
+
+## Run Inferentia2 Model Locally
+
+_Note: You need to run this on an Inferentia2 instance._
+
+1. Manually change `MMS_CONFIG_FILE`:
+```
+wget -O sagemaker-mms.properties https://github.com/raw/aws/deep-learning-containers/master/huggingface/build_artifacts/inference/config.properties
+```
+
+2. Adjust `handler_service.py` and comment out `if content_type in content_types.UTF8_TYPES:`, which is needed for SageMaker but cannot be used locally.
+
+3. Run the container:
+
+- transformers `text-classification` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+```
+HF_MODEL_ID="distilbert/distilbert-base-uncased-finetuned-sst-2-english" HF_TASK="text-classification" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
+```
+- sentence transformers `feature-extraction` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+```
+HF_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2" HF_TASK="feature-extraction" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
+```
+
+4. Send a request:
+```
+curl --request POST \
+  --url http://localhost:8080/invocations \
+  --header 'Content-Type: application/json' \
+  --data "{\"inputs\": \"I like you.\"}"
 ```
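The same local invocation can also be issued from Python; a small sketch (not part of this diff), assuming the `requests` package is installed:

```python
# Sketch only (not part of this PR): mirror of the curl call above against the
# locally running model server started in the previous steps.
import requests

response = requests.post(
    "http://localhost:8080/invocations",
    json={"inputs": "I like you."},
    timeout=60,
)
response.raise_for_status()
print(response.json())  # e.g. [{"label": "POSITIVE", "score": 0.99}] for text-classification
```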

makefile

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ check_dirs := src tests
 # run tests

 unit-test:
-	python -m pytest -n auto --dist loadfile -s -v ./tests/unit/
+	python -m pytest -v -s ./tests/unit/

 integ-test:
 	python -m pytest -n 2 -s -v ./tests/integ/

setup.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@


 extras["test"] = [
-    "pytest",
+    "pytest<8",
     "pytest-xdist",
    "parameterized",
    "psutil",

src/sagemaker_huggingface_inference_toolkit/mms_model_server.py

Lines changed: 10 additions & 6 deletions
@@ -33,11 +33,11 @@
 )

 from sagemaker_huggingface_inference_toolkit import handler_service
+from sagemaker_huggingface_inference_toolkit.optimum_utils import is_optimum_neuron_available
 from sagemaker_huggingface_inference_toolkit.transformers_utils import (
     HF_API_TOKEN,
     HF_MODEL_REVISION,
     _load_model_from_hub,
-    is_aws_neuron_available,
 )


@@ -73,11 +73,6 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):
     elif use_hf_hub:
         # Use different model store directory
         model_store = DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY
-        if is_aws_neuron_available():
-            raise ValueError(
-                "Hugging Face Hub deployments are currently not supported with AWS Neuron and Inferentia."
-                "You need to create a `inference.py` script to run your model using AWS Neuron"
-            )
         storage_dir = _load_model_from_hub(
             model_id=os.environ["HF_MODEL_ID"],
             model_dir=DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY,
@@ -90,6 +85,15 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):

     env = environment.Environment()

+    # Set the number of workers to available number if optimum neuron is available and not already set
+    if is_optimum_neuron_available() and os.environ.get("SAGEMAKER_MODEL_SERVER_WORKERS", None) is None:
+        from optimum.neuron.utils.cache_utils import get_num_neuron_cores
+
+        try:
+            env._model_server_workers = str(get_num_neuron_cores())
+        except Exception:
+            env._model_server_workers = "1"
+
     # Note: multi-model default config already sets default_service_handler
     handler_service_for_config = None if ENABLE_MULTI_MODEL else handler_service
     _create_model_server_config_file(env, handler_service_for_config)
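The worker-count fallback above pairs with the `NEURON_RT_NUM_CORES=1` setting added in `optimum_utils.py` below: one MMS worker is started per NeuronCore, and each worker pins a single core. A rough sketch of that pairing (illustration only, not part of this diff; `get_num_neuron_cores` is the same helper the hunk above imports from optimum-neuron):

```python
# Sketch only (not part of this PR): how the two settings combine at runtime. With
# NEURON_RT_NUM_CORES=1 every worker process claims exactly one NeuronCore, so
# starting one worker per core saturates the Inferentia2 device.
import os


def resolve_worker_count(default: int = 1) -> int:
    """Use the Neuron core count when optimum-neuron can report it, else fall back."""
    try:
        from optimum.neuron.utils.cache_utils import get_num_neuron_cores

        return get_num_neuron_cores()
    except Exception:
        return default


os.environ.setdefault("SAGEMAKER_MODEL_SERVER_WORKERS", str(resolve_worker_count()))
os.environ["NEURON_RT_NUM_CORES"] = "1"  # each worker binds a single core
```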

src/sagemaker_huggingface_inference_toolkit/optimum_utils.py

Lines changed: 26 additions & 16 deletions
@@ -38,21 +38,25 @@ def get_input_shapes(model_dir):
     # try to get input shapes from config file
     try:
         config = AutoConfig.from_pretrained(model_dir)
-        if hasattr(config, "neuron_batch_size") and hasattr(config, "neuron_sequence_length"):
-            input_shapes["batch_size"] = config.neuron_batch_size
-            input_shapes["sequence_length"] = config.neuron_sequence_length
-            input_shapes_available = True
-            logger.info(
-                f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
-            )
-        if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
-            logger.warning(
-                "HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
-            )
-        if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
-            logger.warning(
-                "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
-            )
+        if hasattr(config, "neuron"):
+            # check if static batch size and sequence length are available
+            if config.neuron.get("static_batch_size", None) and config.neuron.get("static_sequence_length", None):
+                input_shapes["batch_size"] = config.neuron["static_batch_size"]
+                input_shapes["sequence_length"] = config.neuron["static_sequence_length"]
+                input_shapes_available = True
+                logger.info(
+                    f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
+                )
+            else:
+                # Add warning if environment variables are set but will be ignored
+                if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
+                    logger.warning(
+                        "HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+                    )
+                if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
+                    logger.warning(
+                        "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
+                    )
     except Exception:
         input_shapes_available = False

@@ -62,6 +66,11 @@ def get_input_shapes(model_dir):

     # extract input shapes from environment variables
     sequence_length = os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None)
+    if sequence_length is None:
+        raise ValueError(
+            "HF_OPTIMUM_SEQUENCE_LENGTH environment variable is not set. Please set HF_OPTIMUM_SEQUENCE_LENGTH to a positive integer."
+        )
+
     if not int(sequence_length) > 0:
         raise ValueError(
             f"HF_OPTIMUM_SEQUENCE_LENGTH must be set to a positive integer. Current value is {sequence_length}"
@@ -73,10 +82,9 @@ def get_input_shapes(model_dir):
     return {"batch_size": int(batch_size), "sequence_length": int(sequence_length)}


-# TODO: not used yet, need to sync on how to determine if we are running on inf2 instance
 def get_optimum_neuron_pipeline(task, model_dir):
     """Method to get optimum neuron pipeline for a given task. Method checks if task is supported by optimum neuron and if required environment variables are set, in case model is not converted. If all checks pass, optimum neuron pipeline is returned. If checks fail, an error is raised."""
-    from optimum.neuron.pipelines import NEURONX_SUPPORTED_TASKS, pipeline
+    from optimum.neuron.pipelines.transformers.base import NEURONX_SUPPORTED_TASKS, pipeline
     from optimum.neuron.utils import NEURON_FILE_NAME

     # check task support
@@ -94,6 +102,8 @@ def get_optimum_neuron_pipeline(task, model_dir):

     # get static input shapes to run inference
     input_shapes = get_input_shapes(model_dir)
+    # set NEURON_RT_NUM_CORES to 1 to avoid conflicts with multiple HTTP workers
+    os.environ["NEURON_RT_NUM_CORES"] = "1"
     # get optimum neuron pipeline
     neuron_pipe = pipeline(task, model=model_dir, export=export, input_shapes=input_shapes)
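Not part of this diff: the rewritten `get_input_shapes` can be exercised without a compiled model. A minimal sketch, assuming `transformers` and this toolkit are installed; the `bert` `model_type` and the temporary directory are placeholders used only so `AutoConfig` can parse the file:

```python
# Sketch only (not part of this PR): show that a `neuron` entry in config.json is
# picked up by get_input_shapes(). The model_type and directory are placeholders.
import json
import tempfile

from sagemaker_huggingface_inference_toolkit.optimum_utils import get_input_shapes

with tempfile.TemporaryDirectory() as model_dir:
    with open(f"{model_dir}/config.json", "w") as f:
        json.dump(
            {
                "model_type": "bert",  # any model_type AutoConfig knows about
                "neuron": {"static_batch_size": 1, "static_sequence_length": 128},
            },
            f,
        )

    print(get_input_shapes(model_dir))  # expected: {'batch_size': 1, 'sequence_length': 128}
```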

src/sagemaker_huggingface_inference_toolkit/transformers_utils.py

Lines changed: 12 additions & 3 deletions
@@ -24,6 +24,10 @@
 from transformers.pipelines import Conversation, Pipeline

 from sagemaker_huggingface_inference_toolkit.diffusers_utils import get_diffusers_pipeline, is_diffusers_available
+from sagemaker_huggingface_inference_toolkit.optimum_utils import (
+    get_optimum_neuron_pipeline,
+    is_optimum_neuron_available,
+)


 if is_tf_available():
@@ -71,6 +75,7 @@ def strtobool(val):
     "savedmodel": "*tar.gz",
     "openvino": "*openvino*",
     "ckpt": "*ckpt",
+    "neuronx": "*neuron",
 }


@@ -202,7 +207,9 @@ def _load_model_from_hub(
     # check if safetensors weights are available
     if framework == "pytorch":
         files = HfApi().model_info(model_id).siblings
-        if any(f.rfilename.endswith("safetensors") for f in files):
+        if is_optimum_neuron_available() and any(f.rfilename.endswith("neuron") for f in files):
+            framework = "neuronx"
+        elif any(f.rfilename.endswith("safetensors") for f in files):
             framework = "safetensors"

     # create regex to only include the framework specific weights
@@ -282,8 +289,10 @@ def get_pipeline(task: str, device: int, model_dir: Path, **kwargs) -> Pipeline:
         kwargs["feature_extractor"] = model_dir
     else:
         kwargs["tokenizer"] = model_dir
-
-    if TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
+    # check if optimum neuron is available and tries to load it
+    if is_optimum_neuron_available():
+        hf_pipeline = get_optimum_neuron_pipeline(task=task, model_dir=model_dir)
+    elif TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
         tokenizer = AutoTokenizer.from_pretrained(os.environ["HF_MODEL_ID"])

         hf_pipeline = pipeline(
tests/unit/test_handler_service_without_context.py

Lines changed: 6 additions & 9 deletions
@@ -77,9 +77,7 @@ def test_handle(inference_handler):
     inference_handler.initialize(CONTEXT)
     json_data = json.dumps(INPUT)
     prediction = inference_handler.handle([{"body": json_data.encode()}], CONTEXT)
-    loaded_response = json.loads(prediction[0])
-    assert "entity" in loaded_response[0]
-    assert "score" in loaded_response[0]
+    assert "output" in prediction[0]


 @require_torch
@@ -90,13 +88,15 @@ def test_load(inference_handler):
         model_dir=tmpdirname,
     )
     # test with automatic infer
+    if "HF_TASK" in os.environ:
+        del os.environ["HF_TASK"]
     hf_pipeline_without_task = inference_handler.load(storage_folder)
     assert hf_pipeline_without_task.task == "token-classification"

     # test with automatic infer
-    os.environ["HF_TASK"] = TASK
+    os.environ["HF_TASK"] = "text-classification"
     hf_pipeline_with_task = inference_handler.load(storage_folder)
-    assert hf_pipeline_with_task.task == TASK
+    assert hf_pipeline_with_task.task == "text-classification"


 def test_preprocess(inference_handler):
@@ -139,10 +139,7 @@ def test_validate_and_initialize_user_module(inference_handler):
     prediction = inference_handler.handle([{"body": b""}], CONTEXT)
     assert "output" in prediction[0]

-    assert inference_handler.load({}) == "model"
-    assert inference_handler.preprocess({}, "") == "data"
-    assert inference_handler.predict({}, "model") == "output"
-    assert inference_handler.postprocess("output", "") == "output"
+    assert inference_handler.load({}) == "Loading inference_tranform_fn.py"


 def test_validate_and_initialize_user_module_transform_fn():

tests/unit/test_mms_model_server.py

Lines changed: 0 additions & 33 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.import os
 import os

-import pytest
 from sagemaker_inference.environment import model_dir

 from mock import patch
@@ -186,35 +185,3 @@ def test_start_mms_with_model_from_hub(
     subprocess_popen.assert_called_once_with(multi_model_server_cmd)
     sigterm.assert_called_once_with(retrieve.return_value)
     os.remove(mms_model_server.DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY)
-
-
-@patch("sagemaker_huggingface_inference_toolkit.transformers_utils._aws_neuron_available", return_value=True)
-@patch("subprocess.call")
-@patch("subprocess.Popen")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._retry_retrieve_mms_server_process")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._load_model_from_hub")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._add_sigterm_handler")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._install_requirements")
-@patch("os.makedirs", return_value=True)
-@patch("os.remove", return_value=True)
-@patch("os.path.exists", return_value=True)
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._create_model_server_config_file")
-@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._adapt_to_mms_format")
-def test_start_mms_neuron_and_model_from_hub(
-    adapt,
-    create_config,
-    exists,
-    remove,
-    dir,
-    install_requirements,
-    sigterm,
-    load_model_from_hub,
-    retrieve,
-    subprocess_popen,
-    subprocess_call,
-    _aws_neuron_available,
-):
-    with pytest.raises(ValueError):
-        os.environ["HF_MODEL_ID"] = "lysandre/tiny-bert-random"
-
-        mms_model_server.start_model_server()

tests/unit/test_optimum_utils.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def test_get_input_shapes_from_file():
     )
     input_shapes = get_input_shapes(model_dir=storage_folder)
     assert input_shapes["batch_size"] == 1
-    assert input_shapes["sequence_length"] == 16
+    assert input_shapes["sequence_length"] == 32


 @require_torch
