Skip to content

Sharded checkpoint support #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions src/sagemaker_huggingface_inference_toolkit/transformers_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import json
import logging
import os
import re
from pathlib import Path
from typing import Optional

Expand All @@ -41,8 +42,15 @@ def is_aws_neuron_available():
logger = logging.getLogger(__name__)

PYTORCH_WEIGHTS_NAME = "pytorch_model.bin"
PYTORCH_WEIGHTS_SHARDED_NAME = "pytorch_model-\d{5}-of-\d{5}.bin"
TF2_WEIGHTS_NAME = "tf_model.h5"
FRAMEWORK_MAPPING = {"pytorch": PYTORCH_WEIGHTS_NAME, "tensorflow": TF2_WEIGHTS_NAME}
TF2_WEIGHTS_SHARDED_NAME = "tf_model-\d{5}-of-\d{5}.h5"
SHARDED_INDEX_EXTENSION = ".index.json"
FRAMEWORK_MAPPING = {
"pytorch": [PYTORCH_WEIGHTS_NAME, PYTORCH_WEIGHTS_NAME + SHARDED_INDEX_EXTENSION],
"tensorflow": [TF2_WEIGHTS_NAME, TF2_WEIGHTS_NAME + SHARDED_INDEX_EXTENSION],
}
FRAMEWORK_MAPPING_SHARDED = {"pytorch": PYTORCH_WEIGHTS_SHARDED_NAME, "tensorflow": TF2_WEIGHTS_SHARDED_NAME}

FILE_LIST_NAMES = [
"config.json",
Expand Down Expand Up @@ -188,9 +196,7 @@ def _load_model_from_hub(

# filters files to download
download_file_list = [
file.rfilename
for file in model_info.siblings
if file.rfilename in FILE_LIST_NAMES + [FRAMEWORK_MAPPING[framework]]
file.rfilename for file in model_info.siblings if _should_download_file(file.rfilename, framework)
]

# download files to storage_folder and removes cache
Expand All @@ -205,6 +211,18 @@ def _load_model_from_hub(
return storage_folder


def _should_download_file(filename: str, framework: str) -> bool:
"""
Check if a file is in the global allowlist of files to download, is allowed based on the framework, or is a sharded file.
"""
if filename in FILE_LIST_NAMES + FRAMEWORK_MAPPING[framework]:
return True
elif re.match(FRAMEWORK_MAPPING_SHARDED[framework], filename):
return True
else:
return False


def infer_task_from_model_architecture(model_config_path: str, architecture_index=0) -> str:
"""
Infer task from `config.json` of trained model. It is not guaranteed to the detect, e.g. some models implement multiple architectures or
Expand Down
14 changes: 14 additions & 0 deletions tests/unit/test_transformers_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
_get_framework,
_is_gpu_available,
_load_model_from_hub,
_should_download_file,
get_pipeline,
infer_task_from_hub,
infer_task_from_model_architecture,
Expand Down Expand Up @@ -158,3 +159,16 @@ def test_wrapped_pipeline():
res = conv_pipe(data)
assert "conversation" in res
assert "generated_text" in res


def test_allow_sharded_files():
assert _should_download_file("pytorch_model-00001-of-00002.bin", "pytorch") is True
assert _should_download_file("pytorch_model-00002-of-00002.bin", "pytorch") is True
assert _should_download_file("pytorch_model-00002-of-00002.bin", "tensorflow") is False
assert _should_download_file("pytorch_model-abc-of-def.bin", "pytorch") is False
assert _should_download_file("tf_model-00001-of-00002.h5", "tensorflow") is True
assert _should_download_file("tf_model-00002-of-00002.h5", "tensorflow") is True
assert _should_download_file("tf_model.h5.index.json", "tensorflow") is True
assert _should_download_file("pytorch_model.bin.index.json", "pytorch") is True
assert _should_download_file("tf_model.h5.index.json", "pytorch") is False
assert _should_download_file("pytorch_model.bin.index.json", "tensorflow") is False