Skip to content

Commit 549d352

Browse files
MichaelCliffordKPostOffice
authored and committed
added job tests (#1)
* WIP job tests * added unit tests for Jobs * add more specificity to tests
1 parent 0533aba commit 549d352

File tree

1 file changed

+195
-0
lines changed

1 file changed

+195
-0
lines changed

tests/unit_test.py

Lines changed: 195 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
import sys
1717
import filecmp
1818
import os
19+
import re
1920

2021
parent = Path(__file__).resolve().parents[1]
2122
sys.path.append(str(parent) + "/src")
@@ -46,10 +47,20 @@
4647
RayClusterStatus,
4748
CodeFlareClusterStatus,
4849
)
50+
from codeflare_sdk.job.jobs import (
51+
JobDefinition,
52+
Job,
53+
DDPJobDefinition,
54+
DDPJob,
55+
torchx_runner,
56+
)
4957
import openshift
5058
from openshift import OpenShiftPythonException
5159
from openshift.selector import Selector
5260
import ray
61+
from torchx.specs import AppDryRunInfo, AppDef
62+
from torchx.runner import get_runner, Runner
63+
from torchx.schedulers.ray_scheduler import RayJob
5364
import pytest
5465

5566

@@ -1522,6 +1533,7 @@ def test_cluster_status(mocker):
15221533
mocker.patch(
15231534
"codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=fake_ray
15241535
)
1536+
15251537
status, ready = cf.status()
15261538
assert status == CodeFlareClusterStatus.STARTING
15271539
assert ready == False
@@ -1576,3 +1588,186 @@ def test_cmd_line_generation():
15761588
)
15771589
os.remove("unit-test-cluster.yaml")
15781590
os.remove("unit-cmd-cluster.yaml")
1591+
1592+
1593+
def test_jobdefinition_coverage():
    """
    Call the base ``JobDefinition`` methods once each, for coverage only.

    A bare ``JobDefinition`` is instantiated and its ``_dry_run`` and
    ``submit`` methods are invoked with a cluster built from
    ``test_config_creation()``. No assertions are made; the test passes as
    long as neither call raises.
    """
    base_definition = JobDefinition()
    test_cluster = Cluster(test_config_creation())
    base_definition._dry_run(test_cluster)
    base_definition.submit(test_cluster)
1598+
1599+
1600+
def test_job_coverage():
    """
    Call the base ``Job`` methods once each, for coverage only.

    A bare ``Job`` is instantiated and ``status()`` and ``logs()`` are
    invoked. No assertions are made; the test passes as long as neither
    call raises.
    """
    base_job = Job()
    base_job.status()
    base_job.logs()
1604+
1605+
1606+
def test_DDPJobDefinition_creation():
    """
    Build a ``DDPJobDefinition`` with every argument given explicitly and
    check that each value is stored on the attribute of the same name.

    Returns:
        The constructed ``DDPJobDefinition``. Other tests in this file call
        this function directly to obtain a ready-made job definition, so the
        return value must be kept.
    """
    ddp = DDPJobDefinition(
        script="test.py",
        m=None,
        script_args=["test"],
        name="test",
        cpu=1,
        gpu=0,
        memMB=1024,
        h=None,
        j="2x1",
        env={"test": "test"},
        max_retries=0,
        mounts=[],
        rdzv_port=29500,
        scheduler_args={"requirements": "test"},
    )
    assert ddp.script == "test.py"
    # None is a singleton: compare by identity, not equality.
    assert ddp.m is None
    assert ddp.script_args == ["test"]
    assert ddp.name == "test"
    assert ddp.cpu == 1
    assert ddp.gpu == 0
    assert ddp.memMB == 1024
    assert ddp.h is None
    assert ddp.j == "2x1"
    assert ddp.env == {"test": "test"}
    assert ddp.max_retries == 0
    assert ddp.mounts == []
    assert ddp.rdzv_port == 29500
    assert ddp.scheduler_args == {"requirements": "test"}
    return ddp
1638+
1639+
1640+
def test_DDPJobDefinition_dry_run():
    """
    Test that the dry run method returns the correct type: AppDryRunInfo,
    that the attributes of the returned object are of the correct type,
    and that the values from cluster and job definition are correctly passed.
    """
    ddp = test_DDPJobDefinition_creation()
    cluster = Cluster(test_config_creation())
    ddp_job = ddp._dry_run(cluster)

    # Type checks on the dry-run result and its components.
    assert isinstance(ddp_job, AppDryRunInfo)
    assert ddp_job._fmt is not None
    assert isinstance(ddp_job.request, RayJob)
    assert isinstance(ddp_job._app, AppDef)
    assert isinstance(ddp_job._cfg, dict)
    assert isinstance(ddp_job._scheduler, str)

    # Values on the scheduler request.
    assert ddp_job.request.app_id.startswith("test")
    assert ddp_job.request.working_dir.startswith("/tmp/torchx_workspace")
    assert ddp_job.request.cluster_name == "unit-test-cluster"
    assert ddp_job.request.requirements == "test"

    # Resources requested in the job definition (cpu=1, gpu=0, memMB=1024).
    resource = ddp_job._app.roles[0].resource
    assert resource.cpu == 1
    assert resource.gpu == 0
    assert resource.memMB == 1024

    # Scheduler configuration.
    assert ddp_job._cfg["cluster_name"] == "unit-test-cluster"
    assert ddp_job._cfg["requirements"] == "test"

    assert ddp_job._scheduler == "ray"
1669+
1670+
1671+
def test_DDPJobDefinition_dry_run_no_resource_args():
    """
    Check that a dry run falls back to the cluster's configuration when the
    job definition leaves out cpu, gpu, memMB and j.

    The first role's resources are compared with ``cluster.config`` (memory
    is compared against ``max_memory * 1024``), and the node/process layout
    parsed from the role's second argument is compared with
    ``"<max_worker>x<gpu>"``.
    """
    cluster = Cluster(test_config_creation())
    definition_kwargs = {
        "script": "test.py",
        "m": None,
        "script_args": ["test"],
        "name": "test",
        "h": None,
        "env": {"test": "test"},
        "max_retries": 0,
        "mounts": [],
        "rdzv_port": 29500,
        "scheduler_args": {"requirements": "test"},
    }
    ddp = DDPJobDefinition(**definition_kwargs)
    dry_run_info = ddp._dry_run(cluster)

    cfg = cluster.config
    assert dry_run_info._app.roles[0].resource.cpu == cfg.max_cpus
    assert dry_run_info._app.roles[0].resource.gpu == cfg.gpu
    assert dry_run_info._app.roles[0].resource.memMB == cfg.max_memory * 1024

    expected_layout = f"{cluster.config.max_worker}x{cluster.config.gpu}"
    assert parse_j(dry_run_info._app.roles[0].args[1]) == expected_layout
1698+
1699+
1700+
def test_DDPJobDefinition_submit(mocker):
    """
    Tests that the submit method returns the correct type: DDPJob
    And that the attributes of the returned object are of the correct type

    ``torchx_runner.schedule`` is patched so that nothing is really
    scheduled; its fake return value is expected to end up in
    ``_app_handle``.
    """
    ddp_def = test_DDPJobDefinition_creation()
    cluster = Cluster(test_config_creation())
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-dashboard-url",
    )  # a fake app_handle
    ddp_job = ddp_def.submit(cluster)
    assert isinstance(ddp_job, DDPJob)
    assert isinstance(ddp_job.job_definition, DDPJobDefinition)
    assert isinstance(ddp_job.cluster, Cluster)
    assert isinstance(ddp_job._app_handle, str)
    assert ddp_job._app_handle == "fake-dashboard-url"
1717+
1718+
1719+
def test_DDPJob_creation(mocker):
    """
    Construct a ``DDPJob`` directly with ``torchx_runner.schedule`` patched,
    then check the job's attributes and the argument passed to ``schedule``.

    Returns:
        The constructed ``DDPJob``. The status and logs tests in this file
        call this function directly to obtain a job, so the return value
        must be kept.
    """
    ddp_def = test_DDPJobDefinition_creation()
    cluster = Cluster(test_config_creation())
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.schedule",
        return_value="fake-dashboard-url",
    )  # a fake app_handle
    ddp_job = DDPJob(ddp_def, cluster)
    assert isinstance(ddp_job, DDPJob)
    assert isinstance(ddp_job.job_definition, DDPJobDefinition)
    assert isinstance(ddp_job.cluster, Cluster)
    assert isinstance(ddp_job._app_handle, str)
    assert ddp_job._app_handle == "fake-dashboard-url"

    # Inspect the first recorded call to the patched schedule(); only the
    # positional arguments are of interest here.
    _, args, _ = torchx_runner.schedule.mock_calls[0]
    assert isinstance(args[0], AppDryRunInfo)
    job_info = args[0]
    assert isinstance(job_info.request, RayJob)
    assert isinstance(job_info._app, AppDef)
    assert isinstance(job_info._cfg, dict)
    assert isinstance(job_info._scheduler, str)
    return ddp_job
1740+
1741+
1742+
def test_DDPJob_status(mocker):
    """
    Check that ``DDPJob.status()`` returns what ``torchx_runner.status``
    returns, and that the runner is called with the job's app handle as its
    first positional argument.
    """
    job = test_DDPJob_creation(mocker)
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.status",
        return_value="fake-status",
    )
    assert job.status() == "fake-status"

    # First recorded call on the patched runner method: (name, args, kwargs).
    _name, positional, _keywords = torchx_runner.status.mock_calls[0]
    assert positional[0] == "fake-dashboard-url"
1750+
1751+
1752+
def test_DDPJob_logs(mocker):
    """
    Check that ``DDPJob.logs()`` returns what ``torchx_runner.log_lines``
    returns, and that the runner is called with the job's app handle as its
    first positional argument.
    """
    job = test_DDPJob_creation(mocker)
    mocker.patch(
        "codeflare_sdk.job.jobs.torchx_runner.log_lines",
        return_value="fake-logs",
    )
    assert job.logs() == "fake-logs"

    # First recorded call on the patched runner method: (name, args, kwargs).
    _name, positional, _keywords = torchx_runner.log_lines.mock_calls[0]
    assert positional[0] == "fake-dashboard-url"
1760+
1761+
1762+
def parse_j(cmd):
    """
    Extract the node/process layout from a command string.

    Searches ``cmd`` for ``--nnodes <N> --nproc_per_node <M>`` (the two flags
    adjacent and in that order, separated by whitespace).

    Args:
        cmd: The command string to search.

    Returns:
        The string ``"<N>x<M>"`` built from the two numbers, or ``None`` if
        the flag pair is not present in ``cmd``.
    """
    # Capture the two numbers directly rather than re-splitting the matched
    # text and picking tokens by position.
    match = re.search(r"--nnodes\s+(\d+)\s+--nproc_per_node\s+(\d+)", cmd)
    if match is None:
        return None
    max_worker, gpu = match.groups()
    return f"{max_worker}x{gpu}"

0 commit comments

Comments
 (0)