Skip to content

Commit eea5998

Browse files
committed
rename to timeout
1 parent 3fbb58c commit eea5998

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

src/torchrunx/agent.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class WorkerArgs:
3434
local_world_size: int
3535
world_size: int
3636
log_file: os.PathLike
37-
pg_timeout: int
37+
timeout: int
3838

3939
def to_bytes(self) -> bytes:
4040
return cloudpickle.dumps(self)
@@ -87,7 +87,7 @@ def entrypoint(serialized_worker_args: bytes):
8787
world_size=worker_args.world_size,
8888
rank=worker_args.rank,
8989
store=store,
90-
timeout=datetime.timedelta(seconds=worker_args.pg_timeout),
90+
timeout=datetime.timedelta(seconds=worker_args.timeout),
9191
)
9292

9393
os.environ["RANK"] = str(worker_args.rank)
@@ -136,7 +136,7 @@ def main(launcher_agent_group: LauncherAgentGroup):
136136
local_world_size=num_workers,
137137
world_size=worker_world_size,
138138
log_file=worker_log_files[i],
139-
pg_timeout=launcher_payload.pg_timeout,
139+
timeout=launcher_payload.timeout,
140140
).to_bytes(),
141141
)
142142
for i in range(num_workers)

src/torchrunx/launcher.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class Launcher:
9696
]
9797
)
9898
env_file: str | os.PathLike | None = None
99-
pg_timeout: int = 600
99+
timeout: int = 600
100100

101101
def run(
102102
self,
@@ -210,7 +210,7 @@ def run(
210210
worker_global_ranks=worker_global_ranks,
211211
worker_log_files=worker_log_files,
212212
backend=self.backend,
213-
pg_timeout=self.pg_timeout,
213+
timeout=self.timeout,
214214
)
215215

216216
agent_payloads: list[AgentPayload] = launcher_agent_group.sync_payloads(payload=payload)[1:] # pyright: ignore[reportAssignmentType]
@@ -272,7 +272,7 @@ def launch(
272272
"NCCL*",
273273
],
274274
env_file: str | os.PathLike | None = None,
275-
pg_timeout: int = 600,
275+
timeout: int = 600,
276276
) -> dict[int, Any]:
277277
"""
278278
Launch a distributed PyTorch function on the specified nodes.
@@ -295,8 +295,8 @@ def launch(
295295
:type env_vars: list[str], optional
296296
:param env_file: An additional environment file that will be sourced prior to executing ``func``, defaults to None
297297
:type env_file: str | os.PathLike | None, optional
298-
:param pg_timeout: Worker process group timeout, defaults to 600
299-
:type pg_timeout: int, optional
298+
:param timeout: Worker process group timeout in seconds, defaults to 600
299+
:type timeout: int, optional
300300
:raises RuntimeError: May fail due to misconfiguration, or errors thrown by ``func``
301301
:return: A dictionary mapping worker ranks to their output
302302
:rtype: dict[int, Any]
@@ -309,5 +309,5 @@ def launch(
309309
log_dir=log_dir,
310310
env_vars=env_vars,
311311
env_file=env_file,
312-
pg_timeout=pg_timeout,
312+
timeout=timeout,
313313
).run(func=func, func_kwargs=func_kwargs)

src/torchrunx/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class LauncherPayload:
2929
worker_global_ranks: list[list[int]]
3030
worker_log_files: list[list[Path]]
3131
backend: Literal["mpi", "gloo", "nccl", "ucc", None]
32-
pg_timeout: int
32+
timeout: int
3333

3434

3535
@dataclass

0 commit comments

Comments
 (0)