Commit 022bc76

MichaelCliffordKPostOffice authored and committed

added job tests (#1)

* WIP job tests
* added unit tests for Jobs
* add more specificity to tests
1 parent 60ef21d commit 022bc76

File tree

1 file changed: +195 -0 lines changed

tests/unit_test.py

Lines changed: 195 additions & 0 deletions
@@ -16,6 +16,7 @@
 import sys
 import filecmp
 import os
+import re

 parent = Path(__file__).resolve().parents[1]
 sys.path.append(str(parent) + "/src")
@@ -46,10 +47,20 @@
     RayClusterStatus,
     CodeFlareClusterStatus,
 )
+from codeflare_sdk.job.jobs import (
+    JobDefinition,
+    Job,
+    DDPJobDefinition,
+    DDPJob,
+    torchx_runner,
+)
 import openshift
 from openshift import OpenShiftPythonException
 from openshift.selector import Selector
 import ray
+from torchx.specs import AppDryRunInfo, AppDef
+from torchx.runner import get_runner, Runner
+from torchx.schedulers.ray_scheduler import RayJob
 import pytest


@@ -1535,6 +1546,7 @@ def test_cluster_status(mocker):
     mocker.patch(
         "codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=fake_ray
     )
+
     status, ready = cf.status()
     assert status == CodeFlareClusterStatus.STARTING
     assert ready == False
@@ -1594,3 +1606,186 @@ def test_cmd_line_generation():
 def test_cleanup():
     os.remove("test.yaml")
     os.remove("raytest2.yaml")
+
+
+def test_jobdefinition_coverage():
+    abstract = JobDefinition()
+    cluster = Cluster(test_config_creation())
+    abstract._dry_run(cluster)
+    abstract.submit(cluster)
+
+
+def test_job_coverage():
+    abstract = Job()
+    abstract.status()
+    abstract.logs()
+
+
+def test_DDPJobDefinition_creation():
+    ddp = DDPJobDefinition(
+        script="test.py",
+        m=None,
+        script_args=["test"],
+        name="test",
+        cpu=1,
+        gpu=0,
+        memMB=1024,
+        h=None,
+        j="2x1",
+        env={"test": "test"},
+        max_retries=0,
+        mounts=[],
+        rdzv_port=29500,
+        scheduler_args={"requirements": "test"},
+    )
+    assert ddp.script == "test.py"
+    assert ddp.m == None
+    assert ddp.script_args == ["test"]
+    assert ddp.name == "test"
+    assert ddp.cpu == 1
+    assert ddp.gpu == 0
+    assert ddp.memMB == 1024
+    assert ddp.h == None
+    assert ddp.j == "2x1"
+    assert ddp.env == {"test": "test"}
+    assert ddp.max_retries == 0
+    assert ddp.mounts == []
+    assert ddp.rdzv_port == 29500
+    assert ddp.scheduler_args == {"requirements": "test"}
+    return ddp
+
+
+def test_DDPJobDefinition_dry_run():
+    """
+    Test that the dry run method returns the correct type: AppDryRunInfo,
+    that the attributes of the returned object are of the correct type,
+    and that the values from cluster and job definition are correctly passed.
+    """
+    ddp = test_DDPJobDefinition_creation()
+    cluster = Cluster(test_config_creation())
+    ddp_job = ddp._dry_run(cluster)
+    assert type(ddp_job) == AppDryRunInfo
+    assert ddp_job._fmt is not None
+    assert type(ddp_job.request) == RayJob
+    assert type(ddp_job._app) == AppDef
+    assert type(ddp_job._cfg) == type(dict())
+    assert type(ddp_job._scheduler) == type(str())
+
+    assert ddp_job.request.app_id.startswith("test")
+    assert ddp_job.request.working_dir.startswith("/tmp/torchx_workspace")
+    assert ddp_job.request.cluster_name == "unit-test-cluster"
+    assert ddp_job.request.requirements == "test"
+
+    assert ddp_job._app.roles[0].resource.cpu == 1
+    assert ddp_job._app.roles[0].resource.gpu == 0
+    assert ddp_job._app.roles[0].resource.memMB == 1024
+
+    assert ddp_job._cfg["cluster_name"] == "unit-test-cluster"
+    assert ddp_job._cfg["requirements"] == "test"
+
+    assert ddp_job._scheduler == "ray"
+
+
+def test_DDPJobDefinition_dry_run_no_resource_args():
+    """
+    Test that the dry run correctly gets resources from the cluster object
+    when the job definition does not specify resources.
+    """
+    cluster = Cluster(test_config_creation())
+    ddp = DDPJobDefinition(
+        script="test.py",
+        m=None,
+        script_args=["test"],
+        name="test",
+        h=None,
+        env={"test": "test"},
+        max_retries=0,
+        mounts=[],
+        rdzv_port=29500,
+        scheduler_args={"requirements": "test"},
+    )
+    ddp_job = ddp._dry_run(cluster)
+
+    assert ddp_job._app.roles[0].resource.cpu == cluster.config.max_cpus
+    assert ddp_job._app.roles[0].resource.gpu == cluster.config.gpu
+    assert ddp_job._app.roles[0].resource.memMB == cluster.config.max_memory * 1024
+    assert (
+        parse_j(ddp_job._app.roles[0].args[1])
+        == f"{cluster.config.max_worker}x{cluster.config.gpu}"
+    )
+
+
+def test_DDPJobDefinition_submit(mocker):
+    """
+    Tests that the submit method returns the correct type: DDPJob
+    And that the attributes of the returned object are of the correct type
+    """
+    ddp_def = test_DDPJobDefinition_creation()
+    cluster = Cluster(test_config_creation())
+    mocker.patch(
+        "codeflare_sdk.job.jobs.torchx_runner.schedule",
+        return_value="fake-dashboard-url",
+    )  # a fake app_handle
+    ddp_job = ddp_def.submit(cluster)
+    assert type(ddp_job) == DDPJob
+    assert type(ddp_job.job_definition) == DDPJobDefinition
+    assert type(ddp_job.cluster) == Cluster
+    assert type(ddp_job._app_handle) == str
+    assert ddp_job._app_handle == "fake-dashboard-url"
+
+
+def test_DDPJob_creation(mocker):
+    ddp_def = test_DDPJobDefinition_creation()
+    cluster = Cluster(test_config_creation())
+    mocker.patch(
+        "codeflare_sdk.job.jobs.torchx_runner.schedule",
+        return_value="fake-dashboard-url",
+    )  # a fake app_handle
+    ddp_job = DDPJob(ddp_def, cluster)
+    assert type(ddp_job) == DDPJob
+    assert type(ddp_job.job_definition) == DDPJobDefinition
+    assert type(ddp_job.cluster) == Cluster
+    assert type(ddp_job._app_handle) == str
+    assert ddp_job._app_handle == "fake-dashboard-url"
+    _, args, kwargs = torchx_runner.schedule.mock_calls[0]
+    assert type(args[0]) == AppDryRunInfo
+    job_info = args[0]
+    assert type(job_info.request) == RayJob
+    assert type(job_info._app) == AppDef
+    assert type(job_info._cfg) == type(dict())
+    assert type(job_info._scheduler) == type(str())
+    return ddp_job
+
+
+def test_DDPJob_status(mocker):
+    ddp_job = test_DDPJob_creation(mocker)
+    mocker.patch(
+        "codeflare_sdk.job.jobs.torchx_runner.status", return_value="fake-status"
+    )
+    assert ddp_job.status() == "fake-status"
+    _, args, kwargs = torchx_runner.status.mock_calls[0]
+    assert args[0] == "fake-dashboard-url"
+
+
+def test_DDPJob_logs(mocker):
+    ddp_job = test_DDPJob_creation(mocker)
+    mocker.patch(
+        "codeflare_sdk.job.jobs.torchx_runner.log_lines", return_value="fake-logs"
+    )
+    assert ddp_job.logs() == "fake-logs"
+    _, args, kwargs = torchx_runner.log_lines.mock_calls[0]
+    assert args[0] == "fake-dashboard-url"
+
+
+def parse_j(cmd):
+
+    pattern = r"--nnodes\s+\d+\s+--nproc_per_node\s+\d+"
+    match = re.search(pattern, cmd)
+    if match:
+        substring = match.group(0)
+    else:
+        return None
+    args = substring.split()
+    max_worker = args[1]
+    gpu = args[3]
+    return f"{max_worker}x{gpu}"
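
For reference, the `parse_j` helper added above recovers the torchrun-style worker layout (`--nnodes` / `--nproc_per_node`) and formats it like the `j` parameter of `DDPJobDefinition`. Below is a minimal standalone sketch of that behavior: the helper is reproduced from the test file (with added comments), and the sample argument string is hypothetical, standing in for the real one the tests read from `ddp_job._app.roles[0].args[1]`.

import re


def parse_j(cmd):
    # Extract "--nnodes N --nproc_per_node M" from a torchrun-style argument string.
    pattern = r"--nnodes\s+\d+\s+--nproc_per_node\s+\d+"
    match = re.search(pattern, cmd)
    if match:
        substring = match.group(0)
    else:
        return None
    args = substring.split()  # ["--nnodes", "N", "--nproc_per_node", "M"]
    max_worker = args[1]
    gpu = args[3]
    return f"{max_worker}x{gpu}"


# Hypothetical argument string for illustration only.
print(parse_j("--rdzv_backend static --nnodes 2 --nproc_per_node 1"))  # prints "2x1"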
