Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions swift/megatron/argument/megatron_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import json
import megatron.core
import torch
from packaging import version
Expand Down Expand Up @@ -167,6 +168,9 @@ class MegatronArguments(ExtraMegatronArguments):
num_workers: int = 4
no_create_attention_mask_in_dataloader: bool = True

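# Extra Megatron arguments given as a JSON object string (or a dict when loaded from a config file); each key/value pair is forwarded to Megatron as a `--key value` command-line argument, with underscores in the key replaced by dashes.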
extra_megatron_kwargs: Optional[str] = None

def _set_default(self):
if self.num_query_groups is None:
self.num_query_groups = 1
Expand Down Expand Up @@ -231,6 +235,17 @@ def __post_init__(self):

self.tensorboard_dir = to_abspath(self.tensorboard_dir)

try:
if self.extra_megatron_kwargs is None:
self.extra_megatron_kwargs = {}
elif isinstance(self.extra_megatron_kwargs, str):
self.extra_megatron_kwargs = json.loads(self.extra_megatron_kwargs)
elif isinstance(self.extra_megatron_kwargs, dict):
# Already a dict (e.g. when the arguments were loaded from a config file), so no JSON parsing is needed.
self.extra_megatron_kwargs = self.extra_megatron_kwargs
except json.JSONDecodeError:
raise ValueError('extra_megatron_kwargs should be a valid json string')

def _args_to_argv(self) -> Tuple[List[Any], Dict[str, Any]]:
new_args = []
args_dict = asdict(self)
Expand All @@ -241,6 +256,15 @@ def _args_to_argv(self) -> Tuple[List[Any], Dict[str, Any]]:
if k not in MegatronArguments.__annotations__:
extra_args[k] = value
continue
if k == 'extra_megatron_kwargs':
if isinstance(value, str):
value = json.loads(value)
if not isinstance(value, dict):
raise ValueError(f'extra_megatron_kwargs should be a dict, but got {type(value)}')
for sub_key, sub_value in value.items():
new_args.append(f"--{sub_key.replace('_', '-')}")
new_args.append(str(sub_value))
continue
if value is None or value is False:
continue
new_args.append(f"--{k.replace('_', '-')}")
Expand Down
3 changes: 3 additions & 0 deletions swift/megatron/model/register.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from argparse import ArgumentParser
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

Expand All @@ -20,6 +21,8 @@ class MegatronModelMeta:
convert_mcore2hf: Callable[[nn.Module, nn.Module], None]
convert_hf2mcore: Callable[[nn.Module, nn.Module], None]

extra_args_provider: Optional[Callable[[ArgumentParser], ArgumentParser]] = None


def register_megatron_model(megatron_model_meta: MegatronModelMeta, *, exist_ok: bool = False):
megatron_model_type = megatron_model_meta.megatron_model_type
Expand Down
2 changes: 2 additions & 0 deletions swift/megatron/train/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,13 @@ def run(self):
logger.info(f'The logging file will be saved in: {logging_path}')
try:
with patch_megatron_data_collator(data_collator):
extra_args_provider = args.megatron_model_meta.extra_args_provider
pretrain(
datasets_provider,
args.megatron_model_meta.model_provider,
ModelType.encoder_or_decoder,
forward_step,
extra_args_provider=extra_args_provider,
args_defaults=args.extra_args)
finally:
# Visualization
Expand Down
6 changes: 4 additions & 2 deletions swift/megatron/utils/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def convert_hf2mcore(args: ExportArguments) -> None:
megatron_args = MegatronArguments(**kwargs, **convert_kwargs, save=args.output_dir, torch_dtype=args.torch_dtype)
patch_megatron_tokenizer(processor)
extra_args = megatron_args.parse_to_megatron()
initialize_megatron(args_defaults=extra_args)
extra_args_provider = megatron_model_meta.extra_args_provider
initialize_megatron(extra_args_provider=extra_args_provider, args_defaults=extra_args)

mg_model = megatron_model_meta.model_provider()
logger.info('Megatron model created successfully.')
Expand All @@ -101,7 +102,8 @@ def convert_mcore2hf(args: ExportArguments) -> None:
megatron_args = MegatronArguments(**kwargs, **convert_kwargs, load=args.mcore_model, torch_dtype=args.torch_dtype)
patch_megatron_tokenizer(processor)
extra_args = megatron_args.parse_to_megatron()
initialize_megatron(args_defaults=extra_args)
extra_args_provider = megatron_model_meta.extra_args_provider
initialize_megatron(extra_args_provider=extra_args_provider, args_defaults=extra_args)

mg_model = megatron_model_meta.model_provider()
load_checkpoint([mg_model], None, None, strict=True)
Expand Down
6 changes: 4 additions & 2 deletions tests/megatron/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ def get_mg_model_tokenizer(model_id):
megatron_model_meta = get_megatron_model_meta(processor.model_meta.model_type)
model_info = processor.model_info
kwargs = megatron_model_meta.convert_hf_config(model_info.config)
megatron_args = MegatronArguments(**kwargs, seq_length=1, use_cpu_initialization=True, no_initialization=True)
megatron_args = MegatronArguments(
**kwargs, seq_length=1, use_cpu_initialization=True, no_initialization=True, torch_dtype=torch.float32)
extra_args_provider = megatron_model_meta.extra_args_provider
patch_megatron_tokenizer(processor)
extra_args = megatron_args.parse_to_megatron()
initialize_megatron(args_defaults=extra_args)
initialize_megatron(args_defaults=extra_args, extra_args_provider=extra_args_provider)
mg_model = megatron_model_meta.model_provider()
megatron_model_meta.convert_hf2mcore(hf_model, mg_model)
return hf_model, mg_model, processor
Expand Down