Skip to content

Commit f000942

Browse files
authored
YARN-11709. NodeManager should be shut down or blacklisted when it cannot run program /var/lib/yarn-ce/bin/container-executor (#6960)
1 parent 5f93edf commit f000942

File tree

2 files changed

+36
-5
lines changed

2 files changed

+36
-5
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,8 +451,10 @@ public void startLocalizer(LocalizerStartContext ctx)
451451

452452
} catch (PrivilegedOperationException e) {
453453
int exitCode = e.getExitCode();
454-
LOG.warn("Exit code from container {} startLocalizer is : {}",
455-
locId, exitCode, e);
454+
LOG.error("Unrecoverable issue occurred. Marking the node as unhealthy to prevent "
455+
+ "further containers to get scheduled on the node and cause application failures. " +
456+
"Exit code from the container " + locId + "startLocalizer is : " + exitCode, e);
457+
nmContext.getNodeStatusUpdater().reportException(e);
456458

457459
throw new IOException("Application " + appId + " initialization failed" +
458460
" (exitCode=" + exitCode + ") with output: " + e.getOutput(), e);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import static org.mockito.ArgumentMatchers.any;
2727
import static org.mockito.ArgumentMatchers.anyBoolean;
2828
import static org.mockito.Mockito.doAnswer;
29+
import static org.mockito.Mockito.doNothing;
2930
import static org.mockito.Mockito.doThrow;
3031
import static org.mockito.Mockito.mock;
3132
import static org.mockito.Mockito.spy;
@@ -37,6 +38,7 @@
3738
import java.io.FileReader;
3839
import java.io.IOException;
3940
import java.io.LineNumberReader;
41+
import java.lang.reflect.Field;
4042
import java.net.InetSocketAddress;
4143
import java.net.URI;
4244
import java.net.URISyntaxException;
@@ -345,7 +347,8 @@ public void testStartLocalizer() throws IOException {
345347

346348
@Test
347349
public void testContainerLaunchError()
348-
throws IOException, ContainerExecutionException, URISyntaxException {
350+
throws IOException, ContainerExecutionException, URISyntaxException, IllegalAccessException,
351+
NoSuchFieldException {
349352

350353
final String[] expecetedMessage = {"badcommand", "Exit code: 24"};
351354
final String[] executor = {
@@ -387,6 +390,14 @@ public Object answer(InvocationOnMock invocationOnMock)
387390
dirsHandler.init(conf);
388391
mockExec.setConf(conf);
389392

393+
//set the private nmContext field without initing the LinuxContainerExecutor
394+
NodeManager nodeManager = new NodeManager();
395+
NodeManager.NMContext nmContext =
396+
nodeManager.createNMContext(null, null, null, false, conf);
397+
Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext");
398+
lceNmContext.setAccessible(true);
399+
lceNmContext.set(mockExec, nmContext);
400+
390401
String appSubmitter = "nobody";
391402
String cmd = String
392403
.valueOf(PrivilegedOperation.RunAsUserCommand.LAUNCH_CONTAINER.
@@ -601,15 +612,30 @@ public void testNoExitCodeFromPrivilegedOperation() throws Exception {
601612
LinuxContainerRuntime runtime = new DefaultLinuxContainerRuntime(
602613
spyPrivilegedExecutor);
603614
runtime.initialize(conf, null);
604-
mockExec = new LinuxContainerExecutor(runtime);
605-
mockExec.setConf(conf);
606615
LinuxContainerExecutor lce = new LinuxContainerExecutor(runtime) {
607616
@Override
608617
protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
609618
return spyPrivilegedExecutor;
610619
}
611620
};
612621
lce.setConf(conf);
622+
623+
//set the private nmContext field without initing the LinuxContainerExecutor
624+
NodeManager nodeManager = new NodeManager();
625+
NodeManager.NMContext nmContext =
626+
nodeManager.createNMContext(null, null, null, false, conf);
627+
NodeManager.NMContext spyNmContext = spy(nmContext);
628+
629+
//initialize a mock NodeStatusUpdater
630+
NodeStatusUpdaterImpl nodeStatusUpdater = mock(NodeStatusUpdaterImpl.class);
631+
nmContext.setNodeStatusUpdater(nodeStatusUpdater);
632+
//imitate a void method call on the NodeStatusUpdater when setting NM unhealthy.
633+
doNothing().when(nodeStatusUpdater).reportException(any());
634+
635+
Field lceNmContext = LinuxContainerExecutor.class.getDeclaredField("nmContext");
636+
lceNmContext.setAccessible(true);
637+
lceNmContext.set(lce, nmContext);
638+
613639
InetSocketAddress address = InetSocketAddress.createUnresolved(
614640
"localhost", 8040);
615641
Path nmPrivateCTokensPath= new Path("file:///bin/nmPrivateCTokensPath");
@@ -672,6 +698,9 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
672698
assertTrue("Unexpected exception " + e,
673699
e.getMessage().contains("exit code"));
674700
}
701+
702+
//verify that the NM was set unhealthy on PrivilegedOperationException
703+
verify(nodeStatusUpdater, times(1)).reportException(any());
675704
}
676705

677706
@Test

0 commit comments

Comments
 (0)