85
85
86
86
logger : logging .Logger = logging .getLogger (__name__ )
87
87
88
+ # Kubernetes reserves a small amount of resources per host for the system. For
89
+ # TorchX we always assume the entire host is being requested so we adjust the
90
+ # requested numbers to account for the node reserved resources.
91
+ #
92
+ # https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/
93
+ RESERVED_MILLICPU = 100
94
+ RESERVED_MEMMB = 1024
95
+
88
96
RETRY_POLICIES : Mapping [str , Iterable [Mapping [str , str ]]] = {
89
97
RetryPolicy .REPLICA : [],
90
98
RetryPolicy .APPLICATION : [
152
160
153
161
ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
154
162
163
+ LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
164
+
155
165
156
166
def sanitize_for_serialization (obj : object ) -> object :
157
167
from kubernetes import client
@@ -176,21 +186,35 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
176
186
V1EmptyDirVolumeSource ,
177
187
)
178
188
189
+ # limits puts an upper cap on the resources a pod may consume.
190
+ # requests is how much the scheduler allocates. We assume that the jobs will
191
+ # be allocating the whole machine, so requests is slightly lower than the
192
+ # requested resources to account for the Kubernetes node reserved resources.
193
+ limits = {}
179
194
requests = {}
180
195
181
196
resource = role .resource
182
- if resource .cpu >= 0 :
183
- requests ["cpu" ] = f"{ int (resource .cpu * 1000 )} m"
184
- if resource .memMB >= 0 :
185
- requests ["memory" ] = f"{ int (resource .memMB )} M"
186
- if resource .gpu >= 0 :
187
- requests ["nvidia.com/gpu" ] = str (resource .gpu )
197
+ if resource .cpu > 0 :
198
+ mcpu = int (resource .cpu * 1000 )
199
+ limits ["cpu" ] = f"{ mcpu } m"
200
+ request_mcpu = max (mcpu - RESERVED_MILLICPU , 0 )
201
+ requests ["cpu" ] = f"{ request_mcpu } m"
202
+ if resource .memMB > 0 :
203
+ limits ["memory" ] = f"{ int (resource .memMB )} M"
204
+ request_memMB = max (int (resource .memMB ) - RESERVED_MEMMB , 0 )
205
+ requests ["memory" ] = f"{ request_memMB } M"
206
+ if resource .gpu > 0 :
207
+ requests ["nvidia.com/gpu" ] = limits ["nvidia.com/gpu" ] = str (resource .gpu )
188
208
189
209
resources = V1ResourceRequirements (
190
- limits = requests ,
210
+ limits = limits ,
191
211
requests = requests ,
192
212
)
193
213
214
+ node_selector : Dict [str , str ] = {}
215
+ if LABEL_INSTANCE_TYPE in resource .capabilities :
216
+ node_selector [LABEL_INSTANCE_TYPE ] = resource .capabilities [LABEL_INSTANCE_TYPE ]
217
+
194
218
# To support PyTorch dataloaders we need to set /dev/shm to larger than the
195
219
# 64M default so we mount an unlimited sized tmpfs directory on it.
196
220
SHM_VOL = "dshm"
@@ -264,6 +288,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
264
288
restart_policy = "Never" ,
265
289
service_account_name = service_account ,
266
290
volumes = volumes ,
291
+ node_selector = node_selector ,
267
292
),
268
293
metadata = V1ObjectMeta (
269
294
annotations = {
@@ -416,6 +441,29 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
416
441
417
442
External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
418
443
444
+ **Resources / Allocation**
445
+
446
+ To select a specific machine type you can add a capability to your resources
447
+ with ``node.kubernetes.io/instance-type`` which will constrain the launched
448
+ jobs to nodes of that instance type.
449
+
450
+ >>> from torchx import specs
451
+ >>> specs.Resource(
452
+ ... cpu=4,
453
+ ... memMB=16000,
454
+ ... gpu=2,
455
+ ... capabilities={
456
+ ... "node.kubernetes.io/instance-type": "<cloud instance type>",
457
+ ... },
458
+ ... )
459
+ Resource(...)
460
+
461
+ Kubernetes may reserve some CPU and memory for the host. TorchX assumes you're
462
+ scheduling on whole hosts and thus will automatically reduce the resource
463
+ request by a small amount to account for the node reserved CPU and memory.
464
+ If you run into scheduling issues you may need to reduce the requested CPU
465
+ and memory from the host values.
466
+
419
467
**Compatibility**
420
468
421
469
.. compatibility::
0 commit comments