diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb index 6512c9be1..a5c8e8725 100644 --- a/demo-notebooks/batch-job/batch_mnist.ipynb +++ b/demo-notebooks/batch-job/batch_mnist.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "614daa0c", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ "auth = TokenAuthentication(\n", " token = \"XXXX\",\n", " server = \"XXXX\",\n", - " skip_tls=True\n", + " skip_tls=True,\n", ")" ] }, @@ -37,10 +37,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "0f4bc870-091f-4e11-9642-cba145710159", - "metadata": {}, - "outputs": [], + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Written to: mnisttest.yaml\n" + ] + } + ], "source": [ "# Create our cluster and submit appwrapper\n", "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"], auth=auth))" @@ -56,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", "metadata": {}, "outputs": [], @@ -75,9 +85,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "id": "3c1b4311-2e61-44c9-8225-87c2db11363d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -116,7 +128,7 @@ "(False, )" ] }, - "execution_count": 17, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -127,52 +139,52 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "id": 
"7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
                  ๐Ÿš€ List of CodeFlare clusters ๐Ÿš€                  \n",
-       "                                                                    \n",
-       " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n",
-       " โ”‚   Owner                                                        โ”‚ \n",
-       " โ”‚   mnisttest                                        Active โœ…   โ”‚ \n",
-       " โ”‚                                                                โ”‚ \n",
-       " โ”‚   URI: ray://mnisttest-head-svc.default.svc:10001              โ”‚ \n",
-       " โ”‚                                                                โ”‚ \n",
-       " โ”‚   Dashboard๐Ÿ”—                                                  โ”‚ \n",
-       " โ”‚                                                                โ”‚ \n",
-       " โ”‚                      Cluster Resources                         โ”‚ \n",
-       " โ”‚   โ•ญโ”€ Workers โ”€โ”€โ•ฎ  โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ     โ”‚ \n",
-       " โ”‚   โ”‚  Min  Max  โ”‚  โ”‚  Memory      CPU         GPU         โ”‚     โ”‚ \n",
-       " โ”‚   โ”‚            โ”‚  โ”‚                                      โ”‚     โ”‚ \n",
-       " โ”‚   โ”‚  2    2    โ”‚  โ”‚  16G~16G     8           4           โ”‚     โ”‚ \n",
-       " โ”‚   โ”‚            โ”‚  โ”‚                                      โ”‚     โ”‚ \n",
-       " โ”‚   โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ  โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ     โ”‚ \n",
-       " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n",
+       "
                   ๐Ÿš€ List of CodeFlare clusters ๐Ÿš€                   \n",
+       "                                                                      \n",
+       " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n",
+       " โ”‚   Owner                                                          โ”‚ \n",
+       " โ”‚   mnisttest                                        InActive โŒ   โ”‚ \n",
+       " โ”‚                                                                  โ”‚ \n",
+       " โ”‚   URI: ray://mnisttest-head-svc.default.svc:10001                โ”‚ \n",
+       " โ”‚                                                                  โ”‚ \n",
+       " โ”‚   Dashboard๐Ÿ”—                                                    โ”‚ \n",
+       " โ”‚                                                                  โ”‚ \n",
+       " โ”‚                      Cluster Resources                           โ”‚ \n",
+       " โ”‚   โ•ญโ”€ Workers โ”€โ”€โ•ฎ  โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ       โ”‚ \n",
+       " โ”‚   โ”‚  Min  Max  โ”‚  โ”‚  Memory      CPU         GPU         โ”‚       โ”‚ \n",
+       " โ”‚   โ”‚            โ”‚  โ”‚                                      โ”‚       โ”‚ \n",
+       " โ”‚   โ”‚  2    2    โ”‚  โ”‚  2G~2G       1           0           โ”‚       โ”‚ \n",
+       " โ”‚   โ”‚            โ”‚  โ”‚                                      โ”‚       โ”‚ \n",
+       " โ”‚   โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ  โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ       โ”‚ \n",
+       " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n",
        "
\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ List of CodeFlare clusters ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", - "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", - " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", - " โ”‚ \u001b[1;37;42mOwner\u001b[0m โ”‚ \n", - " โ”‚ \u001b[1;4mmnisttest\u001b[0m Active โœ… โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b]8;id=309861;ray-dashboard-mnisttest-default.apps.prepfullinstall.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", - " โ”‚ โ”‚ \n", - " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", - " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", - " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m16G~16G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m8 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m4 \u001b[0m\u001b[35m \u001b[0m โ”‚ 
โ”‚ \n", - " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", - " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", - " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" + "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ List of CodeFlare clusters ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", + " โ”‚ \u001b[1;37;42mOwner\u001b[0m โ”‚ \n", + " โ”‚ \u001b[1;4mmnisttest\u001b[0m InActive โŒ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b]8;id=188136;ray-dashboard-mnisttest-default.apps.kpostoffice.dev.datahub.redhat.com\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", + " โ”‚ โ•ญโ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", + " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU 
\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m2G~2G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m1 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", + " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" ] }, "metadata": {}, @@ -181,10 +193,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 18, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -203,51 +215,38 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "3cc6183a-8f6e-4347-af91-d088ed422544", + "execution_count": 5, + "id": "89388795", "metadata": {}, + "outputs": [], + "source": [ + "from codeflare_sdk.jobs.jobs import TorchXJobDefinition\n", + "from codeflare_sdk.jobs.config import JobConfiguration" + ] + }, + { + "cell_type": 
"code", + "execution_count": 6, + "id": "3cc6183a-8f6e-4347-af91-d088ed422544", + "metadata": { + "scrolled": true + }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "environemnt before exec ddp from torchx {'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Checking for changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs`...\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Built new image `/tmp/torchx_workspace3c_d437b` based on original image `ghcr.io/pytorch/torchx:0.3.0dev0` and changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs` for role[0]=mnist.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 WARNING \u001b[0m The Ray scheduler does not support port mapping.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Uploading package gcs://_ray_pkg_ce2c3e935774455d.zip.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Creating a file package for local directory '/tmp/torchx_workspace3c_d437b'.\n", - "ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Launched app: ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m AppStatus:\n", - " msg: PENDING\n", - " num_restarts: -1\n", - " roles:\n", - " - replicas:\n", - " - hostname: \n", - " id: 0\n", - " role: ray\n", - " state: !!python/object/apply:torchx.specs.api.AppState\n", - " - 2\n", - " structured_error_msg: \n", - " role: ray\n", - " state: PENDING (2)\n", - " structured_error_msg: \n", - " ui_url: 
null\n", - "\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Job URL: None\n", - "\u001b[0m" + "The Ray scheduler does not support port mapping.\n" ] } ], "source": [ - "! torchx run -s ray -cfg dashboard_address=mnisttest-head-svc.default.svc:8265,requirements=requirements.txt dist.ddp -j 2x4 --gpu 4 --script mnist.py" + "job = TorchXJobDefinition(JobConfiguration(script=\"mnist.py\", requirements=\"requirements.txt\")).submit(cluster)" ] }, { "cell_type": "markdown", - "id": "ff065051", + "id": "38925ffa", "metadata": {}, "source": [ "Now we can go ahead and look at the status and logs of our batch job." @@ -255,1726 +254,85 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", - "{'mnist-jlm13hx5g53mk': JobInfo(status='SUCCEEDED', entrypoint='python3 ray_driver.py', message='Job finished successfully.', error_type=None, start_time=1667574271415, end_time=1667574616127, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_ce2c3e935774455d.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'e4ce38d001dbbe09cd21c497fedd03d692b2be3e'})}\n", - "\u001b[0m" - ] - } - ], - "source": [ - "cluster.list_jobs()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", - "metadata": {}, + "execution_count": 6, + "id": "b7bf9867", + "metadata": { + "scrolled": false + }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", - "\n", - "\u001b[32m-----------------------------------\u001b[39m\n", 
- "\u001b[32mJob 'mnist-jlm13hx5g53mk' succeeded\u001b[39m\n", - "\u001b[32m-----------------------------------\u001b[39m\n", - "\n", - "\u001b[0m" - ] + "data": { + "text/plain": [ + "AppStatus:\n", + " msg: !!python/object/apply:ray.dashboard.modules.job.common.JobStatus\n", + " - RUNNING\n", + " num_restarts: -1\n", + " roles:\n", + " - replicas:\n", + " - hostname: \n", + " id: 0\n", + " role: ray\n", + " state: !!python/object/apply:torchx.specs.api.AppState\n", + " - 3\n", + " structured_error_msg: \n", + " role: ray\n", + " state: RUNNING (3)\n", + " structured_error_msg: \n", + " ui_url: null" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "cluster.job_status(\"mnist-jlm13hx5g53mk\")" + "job.status()" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "264c1809-de72-4acf-b0f6-e67d345640f6", + "execution_count": 8, + "id": "24cd6fa8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", - "acrtors: [RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4), RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4)]\n", - "Waiting for placement group to start.\n", - "here and rank is 0 and 10.131.66.16 49782\n", - "finally setting actor remote address and port 10.131.66.16 49782\n", - "here 
and rank is 1 and 10.131.66.16 49782\n", - "setting actor remote address and port 10.131.66.16 49782\n", - "finally setting actor remote address and port 10.131.66.16 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port before: 10.131.66.16 42903\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port: 10.131.66.16 42903\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m set_address_and_port: 10.131.66.16 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port before: 10.131.66.16 53621\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port: 10.131.66.16 53621\n", - "running ray.wait on [ObjectRef(32b0eec39cfa87ac523554acce28b667f9bc98bb0200000001000000), ObjectRef(80b655a2d9b04d4074fb8e3cef07ab2b3516f40e0200000001000000)]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m cmd: ['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py\"]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m worker env: {'NV_LIBCUBLAS_DEV_VERSION': '11.3.1.68-1', 'NV_CUDA_COMPAT_PACKAGE': 'cuda-compat-11-2', 'RAY_IP': 'mnisttest-head-svc', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_GCS': '6379', 'NV_CUDNN_PACKAGE_DEV': 'libcudnn8-dev=8.1.1.33-1+cuda11.2', 'LC_ALL': 'C.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'NV_LIBNCCL_DEV_PACKAGE': 'libnccl-dev=2.8.4-1+cuda11.2', 'REDIS_PASSWORD': '', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PORT': '8265', 'RAY_USAGE_STATS_ENABLED': '0', 'LANG': 'C.UTF-8', 'TZ': 'America/Los_Angeles', 'NV_LIBNPP_DEV_PACKAGE': 'libnpp-dev-11-2=11.2.1.68-1', 'HOSTNAME': 'mnisttest-worker-small-group-mnisttest-wzz2l', 
'MNISTTEST_HEAD_SVC_PORT_8265_TCP': 'tcp://172.30.163.155:8265', 'OLDPWD': '/home/ray/workspace', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_ADDR': '172.30.163.155', 'RAY_CLIENT_MODE': '0', 'RAY_JOB_ID': '02000000', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_ADDR': '172.30.163.155', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_ADDR': '172.30.163.155', 'NV_LIBNPP_VERSION': '11.2.1.68-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PROTO': 'tcp', 'NVIDIA_VISIBLE_DEVICES': 'GPU-d3e8af45-f80b-98a8-dcd8-d3b428c4a4c2,GPU-15e57e64-c38b-9923-8f4a-6c098fdbc062,GPU-d14042c5-219c-5419-9511-ac62c72f90d1,GPU-b0d6ba11-ccb2-c4fb-89ad-01c50e6d393c', 'VIRTUAL_ENV': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv', 'NV_LIBCUSPARSE_VERSION': '11.3.1.68-1', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_DASHBOARD': '8265', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_CLIENT': '10001', 'KUBERNETES_PORT_443_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PORT': '6379', 'KUBERNETES_PORT_443_TCP_ADDR': '172.30.0.1', 'NV_LIBCUBLAS_DEV_PACKAGE': 'libcublas-dev-11-2=11.3.1.68-1', 'NCCL_VERSION': '2.8.4-1', 'KUBERNETES_PORT': 'tcp://172.30.0.1:443', 'PWD': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d', 'NVARCH': 'x86_64', 'NV_LIBCUSPARSE_DEV_VERSION': '11.3.1.68-1', 'HOME': '/home/ray', 'RAY_RAYLET_PID': '19', 'NV_ML_REPO_URL': 'https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64', 'NV_LIBNCCL_PACKAGE_VERSION': '2.8.4-1', 'SPT_NOENV': '1', 'KUBERNETES_SERVICE_PORT_HTTPS': '443', 'NV_LIBNCCL_PACKAGE': 'libnccl2=2.8.4-1+cuda11.2', 'NV_LIBNCCL_DEV_PACKAGE_NAME': 'libnccl-dev', 'KUBERNETES_PORT_443_TCP_PORT': '443', 'NV_CUDA_LIB_VERSION': '11.2.0-1', 'NV_ML_REPO_ENABLED': '1', 'NV_LIBNPP_PACKAGE': 'libnpp-11-2=11.2.1.68-1', 'NV_LIBNCCL_PACKAGE_NAME': 'libnccl2', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'NV_NVTX_VERSION': '11.2.67-1', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP': 
'tcp://172.30.163.155:10001', 'NV_LIBCUBLAS_VERSION': '11.3.1.68-1', 'RAY_ADDRESS': 'mnisttest-head-svc:6379', 'NV_LIBCUBLAS_PACKAGE': 'libcublas-11-2=11.3.1.68-1', 'KUBERNETES_PORT_443_TCP': 'tcp://172.30.0.1:443', 'NV_CUDNN_VERSION': '8.1.1.33', 'RAY_PORT': '6379', 'NV_CUDA_CUDART_DEV_VERSION': '11.2.72-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP': 'tcp://172.30.163.155:6379', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PORT': '10001', 'TERM': 'xterm', 'MNISTTEST_HEAD_SVC_SERVICE_PORT': '6379', 'NV_NVML_DEV_VERSION': '11.2.67-1', 'CUDA_VERSION': '11.2.0', 'NV_LIBCUBLAS_PACKAGE_NAME': 'libcublas-11-2', 'NSS_SDB_USE_CACHE': 'no', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'MY_POD_IP': '10.131.66.16', 'SHLVL': '1', 'PYTHONPATH': ':/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d:/home/ray/workspace::/home/ray/workspace:', 'NV_LIBCUBLAS_DEV_PACKAGE_NAME': 'libcublas-dev-11-2', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450', 'NV_LIBNPP_DEV_VERSION': '11.2.1.68-1', 'KUBERNETES_SERVICE_PORT': '443', 'NV_CUDA_CUDART_VERSION': '11.2.72-1', 'NV_CUDNN_PACKAGE_NAME': 'libcudnn8', 'PATH': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv/bin:/home/ray/anaconda3/bin:/home/ray/anaconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'NV_LIBNCCL_DEV_PACKAGE_VERSION': '2.8.4-1', 'MNISTTEST_HEAD_SVC_PORT': 'tcp://172.30.163.155:6379', 'PS1': '(virtualenv) ', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_SERVICE_HOST': '172.30.163.155', 'KUBERNETES_SERVICE_HOST': '172.30.0.1', 'NV_CUDNN_PACKAGE': 'libcudnn8=8.1.1.33-1+cuda11.2', 'OMP_NUM_THREADS': '1', 'PYTHONBREAKPOINT': 'ray.util.rpdb.set_trace', 'CUDA_VISIBLE_DEVICES': '0,1,2,3', 'LOGLEVEL': 'DEBUG', 
'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_RANK0_HOST': '10.131.66.16'}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m set_address_and_port: 10.131.66.16 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m cmd: ['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py\"]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m worker env: {'NV_LIBCUBLAS_DEV_VERSION': '11.3.1.68-1', 'NV_CUDA_COMPAT_PACKAGE': 'cuda-compat-11-2', 'RAY_IP': 'mnisttest-head-svc', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_GCS': '6379', 'NV_CUDNN_PACKAGE_DEV': 'libcudnn8-dev=8.1.1.33-1+cuda11.2', 'LC_ALL': 'C.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'NV_LIBNCCL_DEV_PACKAGE': 'libnccl-dev=2.8.4-1+cuda11.2', 'REDIS_PASSWORD': '', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PORT': '8265', 'RAY_USAGE_STATS_ENABLED': '0', 'LANG': 'C.UTF-8', 'TZ': 'America/Los_Angeles', 'NV_LIBNPP_DEV_PACKAGE': 'libnpp-dev-11-2=11.2.1.68-1', 'HOSTNAME': 'mnisttest-worker-small-group-mnisttest-hfm8l', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP': 'tcp://172.30.163.155:8265', 'OLDPWD': '/home/ray/workspace', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_ADDR': '172.30.163.155', 'RAY_CLIENT_MODE': '0', 'RAY_JOB_ID': '02000000', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_ADDR': '172.30.163.155', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_ADDR': '172.30.163.155', 'NV_LIBNPP_VERSION': '11.2.1.68-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PROTO': 'tcp', 'NVIDIA_VISIBLE_DEVICES': 'GPU-48fae530-6bda-e366-3423-864fe847ff3b,GPU-5d8d79bb-5c38-4ef7-0ea8-c91297cbc59f,GPU-8c8b3c0b-ccf8-c06c-f253-0bb90285c4cb,GPU-a8a4e808-841d-c212-2686-a2bd227279b3', 'VIRTUAL_ENV': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv', 'NV_LIBCUSPARSE_VERSION': 
'11.3.1.68-1', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_DASHBOARD': '8265', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_CLIENT': '10001', 'KUBERNETES_PORT_443_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PORT': '6379', 'KUBERNETES_PORT_443_TCP_ADDR': '172.30.0.1', 'NV_LIBCUBLAS_DEV_PACKAGE': 'libcublas-dev-11-2=11.3.1.68-1', 'NCCL_VERSION': '2.8.4-1', 'KUBERNETES_PORT': 'tcp://172.30.0.1:443', 'PWD': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d', 'NVARCH': 'x86_64', 'NV_LIBCUSPARSE_DEV_VERSION': '11.3.1.68-1', 'HOME': '/home/ray', 'RAY_RAYLET_PID': '19', 'NV_ML_REPO_URL': 'https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64', 'NV_LIBNCCL_PACKAGE_VERSION': '2.8.4-1', 'SPT_NOENV': '1', 'KUBERNETES_SERVICE_PORT_HTTPS': '443', 'NV_LIBNCCL_PACKAGE': 'libnccl2=2.8.4-1+cuda11.2', 'NV_LIBNCCL_DEV_PACKAGE_NAME': 'libnccl-dev', 'KUBERNETES_PORT_443_TCP_PORT': '443', 'NV_CUDA_LIB_VERSION': '11.2.0-1', 'NV_ML_REPO_ENABLED': '1', 'NV_LIBNPP_PACKAGE': 'libnpp-11-2=11.2.1.68-1', 'NV_LIBNCCL_PACKAGE_NAME': 'libnccl2', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'NV_NVTX_VERSION': '11.2.67-1', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP': 'tcp://172.30.163.155:10001', 'NV_LIBCUBLAS_VERSION': '11.3.1.68-1', 'RAY_ADDRESS': 'mnisttest-head-svc:6379', 'NV_LIBCUBLAS_PACKAGE': 'libcublas-11-2=11.3.1.68-1', 'KUBERNETES_PORT_443_TCP': 'tcp://172.30.0.1:443', 'NV_CUDNN_VERSION': '8.1.1.33', 'RAY_PORT': '6379', 'NV_CUDA_CUDART_DEV_VERSION': '11.2.72-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP': 'tcp://172.30.163.155:6379', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PORT': '10001', 'TERM': 'xterm', 'MNISTTEST_HEAD_SVC_SERVICE_PORT': '6379', 'NV_NVML_DEV_VERSION': '11.2.67-1', 'CUDA_VERSION': '11.2.0', 'NV_LIBCUBLAS_PACKAGE_NAME': 'libcublas-11-2', 'NSS_SDB_USE_CACHE': 'no', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'MY_POD_IP': '10.128.68.15', 'SHLVL': '1', 'PYTHONPATH': 
':/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d:/home/ray/workspace::/home/ray/workspace:', 'NV_LIBCUBLAS_DEV_PACKAGE_NAME': 'libcublas-dev-11-2', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450', 'NV_LIBNPP_DEV_VERSION': '11.2.1.68-1', 'KUBERNETES_SERVICE_PORT': '443', 'NV_CUDA_CUDART_VERSION': '11.2.72-1', 'NV_CUDNN_PACKAGE_NAME': 'libcudnn8', 'PATH': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv/bin:/home/ray/anaconda3/bin:/home/ray/anaconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'NV_LIBNCCL_DEV_PACKAGE_VERSION': '2.8.4-1', 'MNISTTEST_HEAD_SVC_PORT': 'tcp://172.30.163.155:6379', 'PS1': '(virtualenv) ', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_SERVICE_HOST': '172.30.163.155', 'KUBERNETES_SERVICE_HOST': '172.30.0.1', 'NV_CUDNN_PACKAGE': 'libcudnn8=8.1.1.33-1+cuda11.2', 'OMP_NUM_THREADS': '1', 'PYTHONBREAKPOINT': 'ray.util.rpdb.set_trace', 'CUDA_VISIBLE_DEVICES': '0,1,2,3', 'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_RANK0_HOST': '10.131.66.16'}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m entrypoint : mnist.py\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m min_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m max_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m nproc_per_node : 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m run_id : mnist-jlm13hx5g53mk\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m 
rdzv_backend : static\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_endpoint : 10.131.66.16:49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_configs : {'rank': 1, 'timeout': 900}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m max_restarts : 0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m monitor_interval : 5\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m log_dir : None\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m metrics_cfg : {}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m entrypoint : mnist.py\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m min_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m max_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m nproc_per_node : 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m run_id : mnist-jlm13hx5g53mk\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_backend : static\n", - 
"\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_endpoint : 10.131.66.16:49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_configs : {'rank': 0, 'timeout': 900}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m max_restarts : 0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m monitor_interval : 5\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m log_dir : None\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m metrics_cfg : {}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. 
Result:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m restart_count=0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m master_addr=10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m master_port=49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m group_rank=1\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m group_world_size=2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m local_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m role_ranks=[4, 5, 6, 7]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m global_ranks=[4, 5, 6, 7]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m role_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m global_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/0/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/1/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/2/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m 
INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/3/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. Result:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m restart_count=0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m master_addr=10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m master_port=49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m group_rank=0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m group_world_size=2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m local_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m role_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m global_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m role_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m global_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/0/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/1/error.json\n", - 
"\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/2/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/3/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading MNIST dataset...\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m 
[2]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to 
./MNIST/raw/t10k-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Validation sanity check: 0it [00:00, ?it/s][0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Validation sanity check: 0%| | 0/2 [00:00\n", + " main()\n", + " File \"ray_driver.py\", line 308, in main\n", + " driver.run()\n", + " File \"ray_driver.py\", line 293, in run\n", + " terminal = self._step()\n", + " File \"ray_driver.py\", line 245, in _step\n", + " result = ray.get(object_ref)\n", + " File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\n", + " return func(*args, **kwargs)\n", + " File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2289, in get\n", + " raise value.as_instanceof_cause()\n", + "ray.exceptions.RayTaskError(RuntimeError): \u001b[36mray::CommandActor.exec_module()\u001b[39m (pid=998, ip=10.129.2.23, repr=)\n", + " File \"ray_driver.py\", line 76, in exec_module\n", + " raise RuntimeError(\n", + "RuntimeError: Either MASTER_ADDR or MASTER_PORT are not 
set. This is most likely bug in torchxOpen issue at https://github.com/pytorch/torchx\n", + "\n" ] } ], "source": [ - "cluster.job_logs(\"mnist-jlm13hx5g53mk\")" + "print(job.logs())" ] }, { @@ -1987,18 +345,26 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 20, "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", "metadata": {}, "outputs": [], "source": [ "cluster.down()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34f83ec2", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.7 64-bit", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2012,11 +378,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.16" }, "vscode": { "interpreter": { - "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + "hash": "96e22af1ea11c5255349108fcb121f4f4096b7c4d0b90bfc9064be550878f2a3" } } }, diff --git a/src/codeflare_sdk/jobs/config.py b/src/codeflare_sdk/jobs/config.py new file mode 100644 index 000000000..447f0e9c4 --- /dev/null +++ b/src/codeflare_sdk/jobs/config.py @@ -0,0 +1,38 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The config sub-module contains the definition of the JobConfiguration dataclass, +which is used to specify requirements and other details when creating a +Job object. 
+""" + +from dataclasses import dataclass, field +from typing import Optional, Dict + + +@dataclass +class JobConfiguration: + """ + This dataclass is used to specify resource requirements and other details, and + is passed in as an argument when creating a Job object. + """ + + name: Optional[str] = None + script: Optional[str] = None + m: Optional[str] = None + h: Optional[str] = None # custom resource types + env: Optional[Dict[str, str]] = None + working_dir: Optional[str] = None + requirements: Optional[str] = None diff --git a/src/codeflare_sdk/jobs/jobs.py b/src/codeflare_sdk/jobs/jobs.py new file mode 100644 index 000000000..a21572f24 --- /dev/null +++ b/src/codeflare_sdk/jobs/jobs.py @@ -0,0 +1,162 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The jobs sub-module contains the definitions for the Job objects, which represent +the methods by which a user can submit a job to a cluster, check the jobs status and +access the jobs logs. 
import abc
import typing
from pathlib import Path
from typing import List, Optional

from ray.job_submission import JobSubmissionClient
from torchx.components.dist import ddp
from torchx.runner import get_runner, Runner
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo, AppStatus

from .config import JobConfiguration

if typing.TYPE_CHECKING:
    from ..cluster.cluster import Cluster

# Every TorchXRayJob created in this process registers itself here.
all_jobs: List["Job"] = []
# Single TorchX runner shared by all job definitions and jobs in this module.
torchx_runner: Runner = get_runner()

_HTTP_SCHEME = "http://"
_DASHBOARD_PORT = 8265


def _strip_prefix(text: str, prefix: str) -> str:
    """
    Return `text` without a leading `prefix`; return it unchanged otherwise.

    `str.lstrip(chars)` treats its argument as a *set of characters*, so
    `"http://test-host".lstrip("http://")` yields `"est-host"`. This helper
    removes the prefix only when it matches as a whole string. It is written
    by hand (rather than with `str.removeprefix`) so it also runs on
    Python 3.8.
    """
    if prefix and text.startswith(prefix):
        return text[len(prefix) :]
    return text


def _dashboard_address(cluster: "Cluster") -> str:
    """
    Return the cluster's dashboard URI with a leading `http://` removed.

    Any other scheme (or a non-URL string) is passed through untouched.
    """
    uri = cluster.cluster_dashboard_uri(cluster.config.namespace)
    return _strip_prefix(f"{uri}", _HTTP_SCHEME)


class JobDefinition(metaclass=abc.ABCMeta):
    """
    A job definition to be submitted to a generic backend cluster.
    """

    def _dry_run(self, cluster: "Cluster"):
        """
        Create job definition, but do not submit.

        The primary purpose of this function is to facilitate unit testing.
        The base implementation does nothing; subclasses return a
        backend-specific description of the job.
        """
        pass

    def submit(self, cluster: "Cluster"):
        """
        Method for creating a job on a specific cluster.

        The base implementation does nothing; subclasses return a Job.
        """
        pass


class Job(metaclass=abc.ABCMeta):
    """
    A job that has been submitted to a cluster.

    Subclasses provide a `status` method to query the current state of the
    job and a `logs` method to retrieve its output.
    """

    def status(self):
        """
        Method for retrieving the job's current status.
        """
        pass

    def logs(self):
        """
        Method for retrieving the job's logs.
        """
        pass


class TorchXJobDefinition(JobDefinition):
    """
    Definition of a TorchX `dist.ddp` job, built from a JobConfiguration.
    """

    def __init__(self, config: JobConfiguration):
        """
        Create the TorchXJob object by passing in a JobConfiguration
        (defined in the config sub-module).
        """
        self.config = config

    def _dry_run(self, cluster: "Cluster", *script_args) -> AppDryRunInfo:
        """
        Create job definition, but do not submit.

        The primary purpose of this function is to facilitate unit testing.

        Args:
            cluster: Cluster whose configuration (workers, cpus, gpus, memory,
                name, namespace) sizes and addresses the job.
            *script_args: Positional arguments forwarded to the training
                script / module.

        Returns:
            The dry-run info produced by the shared TorchX runner for the
            "ray" scheduler.
        """
        # "<nodes>x<processes per node>": one process per GPU, and at least
        # one process per node when the cluster has no GPUs.
        j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}"
        return torchx_runner.dryrun(
            app=ddp(
                *script_args,
                script=self.config.script,
                m=self.config.m,
                name=self.config.name,
                # Custom resource type from the configuration. It defaults to
                # None, which matches the previously hard-coded value.
                h=self.config.h,
                cpu=cluster.config.max_cpus,
                gpu=cluster.config.gpu,
                memMB=1024 * cluster.config.max_memory,  # cluster memory is in GB
                j=j,
                env=self.config.env,
            ),
            scheduler="ray",  # can be determined by type of cluster if more are introduced
            cfg={
                "cluster_name": cluster.config.name,
                "dashboard_address": _dashboard_address(cluster),
                "working_dir": self.config.working_dir,
                "requirements": self.config.requirements,
            },
            workspace=f"file://{Path.cwd()}",
        )

    def submit(self, cluster: "Cluster") -> "TorchXRayJob":
        """
        Submit the job definition to a specific cluster, resulting in a Job object.
        """
        return TorchXRayJob(self, cluster)


class TorchXRayJob(Job):
    """
    Active submission of a dist.ddp job to a Ray cluster which can be used to get logs and status.
    """

    def __init__(
        self, job_definition: TorchXJobDefinition, cluster: "Cluster", *script_args
    ):
        """
        Creates job which maximizes resource usage on the passed cluster.

        The job is scheduled immediately through the shared TorchX runner and
        the new instance is appended to the module-level `all_jobs` list.
        """
        self.job_definition: TorchXJobDefinition = job_definition
        self.cluster: "Cluster" = cluster
        self._job_id: Optional[str] = None  # resolved lazily by `job_id`
        self._app_handle = torchx_runner.schedule(
            job_definition._dry_run(cluster, *script_args)
        )
        all_jobs.append(self)

    @property
    def job_id(self) -> str:
        """
        Job identifier taken from the TorchX app handle, without the
        dashboard address that precedes it.

        The value is computed on first access and cached afterwards.
        """
        if self._job_id is None:
            address = _dashboard_address(self.cluster)
            _, _, app_id = parse_app_handle(self._app_handle)
            # NOTE(review): the app id is expected to look like
            # "<dashboard address>[:<port>]-<job id>"; both spellings are
            # accepted here. Confirm against the TorchX ray scheduler in use.
            for prefix in (f"{address}:{_DASHBOARD_PORT}-", f"{address}-"):
                if app_id.startswith(prefix):
                    app_id = app_id[len(prefix) :]
                    break
            self._job_id = app_id
        return self._job_id

    def status(self) -> Optional[AppStatus]:
        """
        Get running job status, as reported by the TorchX runner.
        """
        return torchx_runner.status(self._app_handle)

    def logs(self) -> str:
        """
        Get job logs, joined into a single string.
        """
        return "".join(torchx_runner.log_lines(self._app_handle, None))