Skip to content

Commit 313d43b

Browse files
committed
Decouple queue capacity with ability to run OPPORTUNISTIC container
* Adds the queueing policies `BY_RESOURCES` and `BY_QUEUE_LEN` at the NM.
* If `BY_RESOURCES` is specified, the NM queues an OPPORTUNISTIC container as long as it has enough resources to run all pending + running containers; otherwise, it rejects the container.
* If `BY_QUEUE_LEN` is specified, the NM accepts only as many OPPORTUNISTIC containers as its configured queue capacity allows.
* Restructures `TestContainerSchedulerQueueing` to accommodate the different queueing policies at the NM.
1 parent b85c66a commit 313d43b

File tree

10 files changed

+673
-189
lines changed

10 files changed

+673
-189
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1241,7 +1241,23 @@ public static boolean isAclEnabled(Configuration conf) {
12411241
/** Prefix for all node manager configs.*/
12421242
public static final String NM_PREFIX = "yarn.nodemanager.";
12431243

1244-
/** Max Queue length of <code>OPPORTUNISTIC</code> containers on the NM. */
1244+
/**
1245+
* At the NM, the policy to determine whether to queue an
1246+
* <code>OPPORTUNISTIC</code> container or not.
1247+
* If set to <code>BY_QUEUE_LEN</code>, uses the queue capacity, as set by
1248+
* {@link YarnConfiguration#NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH},
1249+
* to limit how many containers to accept/queue.
1250+
* If set to <code>BY_RESOURCES</code>, limits the number of containers
1251+
* accepted based on the resource capacity of the node.
1252+
*/
1253+
public static final String NM_OPPORTUNISTIC_CONTAINERS_QUEUE_POLICY =
1254+
NM_PREFIX + "opportunistic-containers-queue-policy";
1255+
public static final String DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_QUEUE_POLICY =
1256+
"BY_QUEUE_LEN";
1257+
1258+
/** Max Queue length of <code>OPPORTUNISTIC</code> containers on the NM.
1259+
* If set to 0, NM does not accept any <code>OPPORTUNISTIC</code> containers.
1260+
* If set to {@literal > 0}, enforces the queue capacity. */
12451261
public static final String NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH =
12461262
NM_PREFIX + "opportunistic-containers-max-queue-length";
12471263
public static final int DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH =

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,8 +1265,27 @@
12651265
</property>
12661266

12671267
<property>
1268-
<description>Max number of OPPORTUNISTIC containers to queue at the
1269-
nodemanager.</description>
1268+
<description>
1269+
At the NM, the policy to determine whether to queue an
1270+
OPPORTUNISTIC container or not.
1271+
If set to BY_QUEUE_LEN, uses the queue capacity, as set by
1272+
yarn.nodemanager.opportunistic-containers-max-queue-length
1273+
to limit how many containers to accept/queue.
1274+
If set to BY_RESOURCES, limits the number of containers
1275+
accepted based on the resource capacity of the node.
1276+
</description>
1277+
<name>yarn.nodemanager.opportunistic-containers-queue-policy</name>
1278+
<value>BY_QUEUE_LEN</value>
1279+
</property>
1280+
1281+
<property>
1282+
<description>
1283+
Max number of OPPORTUNISTIC containers to queue at the
1284+
nodemanager (NM). If the value is 0, NMs do not allow any
1285+
OPPORTUNISTIC containers.
1286+
If the value is positive, the NM caps the number of OPPORTUNISTIC
1287+
containers that can be queued at the NM.
1288+
</description>
12701289
<name>yarn.nodemanager.opportunistic-containers-max-queue-length</name>
12711290
<value>0</value>
12721291
</property>

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/AllocationBasedResourceUtilizationTracker.java

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
package org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler;
2020

21+
import org.apache.hadoop.yarn.api.records.Resource;
2122
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
2223
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
2324
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
@@ -34,6 +35,9 @@ public class AllocationBasedResourceUtilizationTracker implements
3435
private static final Logger LOG =
3536
LoggerFactory.getLogger(AllocationBasedResourceUtilizationTracker.class);
3637

38+
private static final long LEFT_SHIFT_MB_IN_BYTES = 20;
39+
private static final int RIGHT_SHIFT_BYTES_IN_MB = 20;
40+
3741
private ResourceUtilization containersAllocation;
3842
private ContainerScheduler scheduler;
3943

@@ -80,10 +84,24 @@ public void subtractContainerResource(Container container) {
8084
*/
8185
@Override
8286
public boolean hasResourcesAvailable(Container container) {
83-
long pMemBytes = container.getResource().getMemorySize() * 1024 * 1024L;
84-
return hasResourcesAvailable(pMemBytes,
85-
(long) (getContainersMonitor().getVmemRatio()* pMemBytes),
86-
container.getResource().getVirtualCores());
87+
return hasResourcesAvailable(container.getResource());
88+
}
89+
90+
private static long convertMBToBytes(final long memMB) {
91+
return memMB << LEFT_SHIFT_MB_IN_BYTES;
92+
}
93+
94+
private static long convertBytesToMB(final long bytes) {
95+
return bytes >> RIGHT_SHIFT_BYTES_IN_MB;
96+
}
97+
98+
@Override
99+
public boolean hasResourcesAvailable(Resource resource) {
100+
long pMemBytes = convertMBToBytes(resource.getMemorySize());
101+
final long vmemBytes = (long)
102+
(getContainersMonitor().getVmemRatio() * pMemBytes);
103+
return hasResourcesAvailable(
104+
pMemBytes, vmemBytes, resource.getVirtualCores());
87105
}
88106

89107
private boolean hasResourcesAvailable(long pMemBytes, long vMemBytes,
@@ -92,29 +110,32 @@ private boolean hasResourcesAvailable(long pMemBytes, long vMemBytes,
92110
if (LOG.isDebugEnabled()) {
93111
LOG.debug("pMemCheck [current={} + asked={} > allowed={}]",
94112
this.containersAllocation.getPhysicalMemory(),
95-
(pMemBytes >> 20),
96-
(getContainersMonitor().getPmemAllocatedForContainers() >> 20));
113+
convertBytesToMB(pMemBytes),
114+
convertBytesToMB(
115+
getContainersMonitor().getPmemAllocatedForContainers()));
97116
}
98117
if (this.containersAllocation.getPhysicalMemory() +
99-
(int) (pMemBytes >> 20) >
100-
(int) (getContainersMonitor()
101-
.getPmemAllocatedForContainers() >> 20)) {
118+
(int) convertBytesToMB(pMemBytes) >
119+
(int) convertBytesToMB(getContainersMonitor()
120+
.getPmemAllocatedForContainers())) {
102121
return false;
103122
}
104123

105124
if (LOG.isDebugEnabled()) {
106125
LOG.debug("before vMemCheck" +
107126
"[isEnabled={}, current={} + asked={} > allowed={}]",
108127
getContainersMonitor().isVmemCheckEnabled(),
109-
this.containersAllocation.getVirtualMemory(), (vMemBytes >> 20),
110-
(getContainersMonitor().getVmemAllocatedForContainers() >> 20));
128+
this.containersAllocation.getVirtualMemory(),
129+
convertBytesToMB(vMemBytes),
130+
convertBytesToMB(
131+
getContainersMonitor().getVmemAllocatedForContainers()));
111132
}
112133
// Check virtual memory.
113134
if (getContainersMonitor().isVmemCheckEnabled() &&
114135
this.containersAllocation.getVirtualMemory() +
115-
(int) (vMemBytes >> 20) >
116-
(int) (getContainersMonitor()
117-
.getVmemAllocatedForContainers() >> 20)) {
136+
(int) convertBytesToMB(vMemBytes) >
137+
(int) convertBytesToMB(getContainersMonitor()
138+
.getVmemAllocatedForContainers())) {
118139
return false;
119140
}
120141

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java

Lines changed: 94 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
2525
import org.apache.hadoop.yarn.api.records.ContainerId;
2626
import org.apache.hadoop.yarn.api.records.ExecutionType;
27+
import org.apache.hadoop.yarn.api.records.Resource;
2728
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
2829
import org.apache.hadoop.yarn.conf.YarnConfiguration;
2930
import org.apache.hadoop.yarn.event.AsyncDispatcher;
@@ -46,6 +47,7 @@
4647
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService
4748
.RecoveredContainerState;
4849
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerStatus;
50+
import org.apache.hadoop.yarn.util.resource.Resources;
4951
import org.slf4j.Logger;
5052
import org.slf4j.LoggerFactory;
5153

@@ -74,6 +76,7 @@ public class ContainerScheduler extends AbstractService implements
7476
private final Context context;
7577
// Capacity of the queue for opportunistic Containers.
7678
private final int maxOppQueueLength;
79+
private final boolean forceStartGuaranteedContainers;
7780

7881
// Queue of Guaranteed Containers waiting for resources to run
7982
private final LinkedHashMap<ContainerId, Container>
@@ -106,9 +109,39 @@ public class ContainerScheduler extends AbstractService implements
106109

107110
private final AsyncDispatcher dispatcher;
108111
private final NodeManagerMetrics metrics;
112+
private final OpportunisticContainersQueuePolicy oppContainersQueuePolicy;
109113

110114
private Boolean usePauseEventForPreemption = false;
111115

116+
private static int getMaxOppQueueLengthFromConf(final Context context) {
117+
if (context == null || context.getConf() == null) {
118+
return YarnConfiguration
119+
.DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH;
120+
}
121+
122+
return context.getConf().getInt(
123+
YarnConfiguration.NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH,
124+
YarnConfiguration.DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH
125+
);
126+
}
127+
128+
private static OpportunisticContainersQueuePolicy
129+
getOppContainersQueuePolicyFromConf(final Context context) {
130+
final String queuePolicy;
131+
if (context == null || context.getConf() == null) {
132+
queuePolicy = YarnConfiguration
133+
.DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_QUEUE_POLICY;
134+
} else {
135+
queuePolicy = context.getConf().get(
136+
YarnConfiguration.NM_OPPORTUNISTIC_CONTAINERS_QUEUE_POLICY,
137+
YarnConfiguration
138+
.DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_QUEUE_POLICY
139+
);
140+
}
141+
142+
return OpportunisticContainersQueuePolicy.valueOf(queuePolicy);
143+
}
144+
112145
@VisibleForTesting
113146
ResourceHandlerChain resourceHandlerChain = null;
114147

@@ -120,10 +153,9 @@ public class ContainerScheduler extends AbstractService implements
120153
*/
121154
public ContainerScheduler(Context context, AsyncDispatcher dispatcher,
122155
NodeManagerMetrics metrics) {
123-
this(context, dispatcher, metrics, context.getConf().getInt(
124-
YarnConfiguration.NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH,
125-
YarnConfiguration.
126-
DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH));
156+
this(context, dispatcher, metrics,
157+
getOppContainersQueuePolicyFromConf(context),
158+
getMaxOppQueueLengthFromConf(context));
127159
}
128160

129161

@@ -149,13 +181,35 @@ public void serviceInit(Configuration conf) throws Exception {
149181
@VisibleForTesting
public ContainerScheduler(Context context, AsyncDispatcher dispatcher,
    NodeManagerMetrics metrics, int qLength) {
  // The queue length is supplied by the caller; the queueing policy is
  // still resolved from the configuration carried by the context.
  this(
      context,
      dispatcher,
      metrics,
      getOppContainersQueuePolicyFromConf(context),
      qLength);
}
187+
188+
@VisibleForTesting
189+
public ContainerScheduler(Context context, AsyncDispatcher dispatcher,
190+
NodeManagerMetrics metrics,
191+
OpportunisticContainersQueuePolicy oppContainersQueuePolicy,
192+
int qLength) {
152193
super(ContainerScheduler.class.getName());
153194
this.context = context;
154195
this.dispatcher = dispatcher;
155196
this.metrics = metrics;
156-
this.maxOppQueueLength = (qLength <= 0) ? 0 : qLength;
157197
this.utilizationTracker =
158198
new AllocationBasedResourceUtilizationTracker(this);
199+
this.oppContainersQueuePolicy = oppContainersQueuePolicy;
200+
switch (oppContainersQueuePolicy) {
201+
case BY_RESOURCES:
202+
this.maxOppQueueLength = 0;
203+
this.forceStartGuaranteedContainers = false;
204+
LOG.info("Setting max opportunistic queue length to 0,"
205+
+ " as {} is incompatible with queue length",
206+
oppContainersQueuePolicy);
207+
break;
208+
case BY_QUEUE_LEN:
209+
default:
210+
this.maxOppQueueLength = qLength;
211+
this.forceStartGuaranteedContainers = (maxOppQueueLength <= 0);
212+
}
159213
this.opportunisticContainersStatus =
160214
OpportunisticContainersStatus.newInstance();
161215
}
@@ -187,7 +241,7 @@ public void handle(ContainerSchedulerEvent event) {
187241
shedQueuedOpportunisticContainers();
188242
break;
189243
case RECOVERY_COMPLETED:
190-
startPendingContainers(maxOppQueueLength <= 0);
244+
startPendingContainers(forceStartGuaranteedContainers);
191245
metrics.setQueuedContainers(queuedOpportunisticContainers.size(),
192246
queuedGuaranteedContainers.size());
193247
break;
@@ -243,7 +297,7 @@ private void onUpdateContainer(UpdateContainerSchedulerEvent updateEvent) {
243297
LOG.warn(String.format("Could not update resources on " +
244298
"continer update of %s", containerId), ex);
245299
}
246-
startPendingContainers(maxOppQueueLength <= 0);
300+
startPendingContainers(forceStartGuaranteedContainers);
247301
metrics.setQueuedContainers(queuedOpportunisticContainers.size(),
248302
queuedGuaranteedContainers.size());
249303
}
@@ -371,7 +425,6 @@ private void onResourcesReclaimed(Container container) {
371425
ExecutionType.OPPORTUNISTIC) {
372426
this.metrics.completeOpportunisticContainer(container.getResource());
373427
}
374-
boolean forceStartGuaranteedContainers = (maxOppQueueLength <= 0);
375428
startPendingContainers(forceStartGuaranteedContainers);
376429
}
377430
this.metrics.setQueuedContainers(queuedOpportunisticContainers.size(),
@@ -380,13 +433,13 @@ private void onResourcesReclaimed(Container container) {
380433

381434
/**
382435
* Start pending containers in the queue.
383-
* @param forceStartGuaranteedContaieners When this is true, start guaranteed
436+
* @param forceStartGContainers When this is true, start guaranteed
384437
* container without looking at available resource
385438
*/
386-
private void startPendingContainers(boolean forceStartGuaranteedContaieners) {
439+
private void startPendingContainers(boolean forceStartGContainers) {
387440
// Start guaranteed containers that are paused, if resources available.
388441
boolean resourcesAvailable = startContainers(
389-
queuedGuaranteedContainers.values(), forceStartGuaranteedContaieners);
442+
queuedGuaranteedContainers.values(), forceStartGContainers);
390443
// Start opportunistic containers, if resources available.
391444
if (resourcesAvailable) {
392445
startContainers(queuedOpportunisticContainers.values(), false);
@@ -429,6 +482,21 @@ private boolean resourceAvailableToStartContainer(Container container) {
429482
return this.utilizationTracker.hasResourcesAvailable(container);
430483
}
431484

485+
private boolean resourceAvailableToQueueOppContainer(
486+
Container newOppContainer) {
487+
final Resource cumulativeResource = Resource.newInstance(Resources.none());
488+
for (final Container container : queuedGuaranteedContainers.values()) {
489+
Resources.addTo(cumulativeResource, container.getResource());
490+
}
491+
492+
for (final Container container : queuedOpportunisticContainers.values()) {
493+
Resources.addTo(cumulativeResource, container.getResource());
494+
}
495+
496+
Resources.addTo(cumulativeResource, newOppContainer.getResource());
497+
return this.utilizationTracker.hasResourcesAvailable(cumulativeResource);
498+
}
499+
432500
private boolean enqueueContainer(Container container) {
433501
boolean isGuaranteedContainer = container.getContainerTokenIdentifier().
434502
getExecutionType() == ExecutionType.GUARANTEED;
@@ -438,7 +506,21 @@ private boolean enqueueContainer(Container container) {
438506
queuedGuaranteedContainers.put(container.getContainerId(), container);
439507
isQueued = true;
440508
} else {
441-
if (queuedOpportunisticContainers.size() < maxOppQueueLength) {
509+
switch (oppContainersQueuePolicy) {
510+
case BY_RESOURCES:
511+
isQueued = resourceAvailableToQueueOppContainer(container);
512+
break;
513+
case BY_QUEUE_LEN:
514+
default:
515+
if (maxOppQueueLength <= 0) {
516+
isQueued = false;
517+
} else {
518+
isQueued =
519+
queuedOpportunisticContainers.size() < maxOppQueueLength;
520+
}
521+
}
522+
523+
if (isQueued) {
442524
LOG.info("Opportunistic container {} will be queued at the NM.",
443525
container.getContainerId());
444526
queuedOpportunisticContainers.put(
@@ -451,7 +533,6 @@ private boolean enqueueContainer(Container container) {
451533
container.sendKillEvent(
452534
ContainerExitStatus.KILLED_BY_CONTAINER_SCHEDULER,
453535
"Opportunistic container queue is full.");
454-
isQueued = false;
455536
}
456537
}
457538

@@ -484,7 +565,6 @@ protected void scheduleContainer(Container container) {
484565
// forceStartGuaranteedContainers is set only under BY_QUEUE_LEN with a
485565
// max-queue length <= 0 (i.e. opportunistic containers are not allowed);
486567
// guaranteed containers then start without looking at available resources.
487-
boolean forceStartGuaranteedContainers = (maxOppQueueLength <= 0);
488568
startPendingContainers(forceStartGuaranteedContainers);
489569

490570
// if the guaranteed container is queued, we need to preempt opportunistic

0 commit comments

Comments
 (0)