From 5d847ebe7908ed9a1b5e79e6a6a82121c7bb860d Mon Sep 17 00:00:00 2001 From: slfan1989 Date: Sat, 10 Jun 2023 20:13:50 +0800 Subject: [PATCH 1/3] YARN-11510. [Federation] Fix NodeManager#TestFederationInterceptor Flaky Unit Test. --- .../hadoop/yarn/conf/YarnConfiguration.java | 4 ++++ .../yarn/server/MockResourceManagerFacade.java | 8 +++++++- .../amrmproxy/FederationInterceptor.java | 16 ++++++++++++++++ .../amrmproxy/TestFederationInterceptor.java | 13 ++++++++++++- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 56bbe8843d414..8db03eb63968a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -3075,6 +3075,10 @@ public static boolean isAclEnabled(Configuration conf) { + "amrmproxy.enabled"; public static final boolean DEFAULT_AMRM_PROXY_ENABLED = false; + public static final String AMRM_PROXY_WAIT_UAM_REGISTER_DONE = + NM_PREFIX + "amrmproxy.wait.uam-register.done"; + public static final boolean DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE = false; + public static final String AMRM_PROXY_ADDRESS = NM_PREFIX + "amrmproxy.address"; public static final int DEFAULT_AMRM_PROXY_PORT = 8049; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java index 999e66a040d8b..5d612d31cd83f 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server; +import java.io.Closeable; import java.io.IOException; import java.net.ConnectException; import java.util.ArrayList; @@ -183,7 +184,7 @@ * change the implementation with care. */ public class MockResourceManagerFacade implements ApplicationClientProtocol, - ApplicationMasterProtocol, ResourceManagerAdministrationProtocol { + ApplicationMasterProtocol, ResourceManagerAdministrationProtocol, Closeable { private static final Logger LOG = LoggerFactory.getLogger(MockResourceManagerFacade.class); @@ -967,4 +968,9 @@ public DeregisterSubClusterResponse deregisterSubCluster(DeregisterSubClusterReq public HashMap> getApplicationContainerIdMap() { return applicationContainerIdMap; } + + @Override + public void close() throws IOException { + isRunning = false; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/FederationInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/FederationInterceptor.java index ae6765cfb479c..14a2d60c2b5bc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/FederationInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/FederationInterceptor.java @@ -251,6 +251,8 @@ public class FederationInterceptor extends AbstractRequestInterceptor { // 
the maximum wait time for the first async heart beat response private long heartbeatMaxWaitTimeMs; + private boolean waitUamRegisterDone; + private MonotonicClock clock = new MonotonicClock(); /** @@ -353,6 +355,8 @@ public void init(AMRMProxyApplicationContext appContext) { this.subClusterTimeOut = YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT; } + this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE, + YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE); } @Override @@ -1332,6 +1336,18 @@ public void run() { }); this.uamRegisterFutures.put(scId, future); } + + if (this.waitUamRegisterDone) { + for (Map.Entry> entry : this.uamRegisterFutures.entrySet()) { + SubClusterId subClusterId = entry.getKey(); + Future future = entry.getValue(); + while (!future.isDone()) { + LOG.info("subClusterId {} Wait Uam Register done.", subClusterId); + } + } + } + + return newSubClusters; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java index 8661990ed72d1..abb5a45e93649 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java @@ -175,6 +175,9 @@ protected YarnConfiguration createConfiguration() { conf.setLong(YarnConfiguration.FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT, 500); + // Wait UAM Register Done + conf.setBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE, true); +
return conf; } @@ -590,6 +593,10 @@ public Object run() throws Exception { interceptor.recover(recoveredDataMap); Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); + + // Waiting for SC-1 to time out. + Thread.sleep(800); + // SC1 should be initialized to be timed out Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size()); @@ -848,7 +855,7 @@ public Object run() throws Exception { List containers = getContainersAndAssert(numberOfContainers, numberOfContainers * 2); for (Container c : containers) { - LOG.info("Allocated container " + c.getId()); + LOG.info("Allocated container {}", c.getId()); } Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); @@ -882,6 +889,10 @@ public Object run() throws Exception { int numberOfContainers = 3; // Should re-attach secondaries and get the three running containers Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); + + // Waiting for SC-1 to time out. + Thread.sleep(800); + // SC1 should be initialized to be timed out Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size()); Assert.assertEquals(numberOfContainers, From d839772b3536453ef3958fa4d9823cae4c018fde Mon Sep 17 00:00:00 2001 From: slfan1989 Date: Wed, 14 Jun 2023 23:55:06 +0800 Subject: [PATCH 2/3] YARN-11510. Fix CheckStyle. 
--- .../src/main/resources/yarn-default.xml | 10 ++++++++++ .../hadoop/yarn/server/MockResourceManagerFacade.java | 2 +- .../amrmproxy/TestFederationInterceptor.java | 5 +++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 0069e9ef360c9..0aa13bd72e381 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -5328,4 +5328,14 @@ + + + Whether we wait for UAM registration to complete. + The default value is false. If we set it to true, + the UAM needs to be registered before attempting to allocate a container. + + yarn.nodemanager.amrmproxy.wait.uam-register.done + false + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java index 5d612d31cd83f..c0ca3b5d8a5ea 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/test/java/org/apache/hadoop/yarn/server/MockResourceManagerFacade.java @@ -971,6 +971,6 @@ public HashMap> getApplicationContainerIdMap() @Override public void close() throws IOException { - isRunning = false; + LOG.info("MockResourceManagerFacade Close."); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java index abb5a45e93649..15cf39efa3d17 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/TestFederationInterceptor.java @@ -38,6 +38,7 @@ import org.apache.hadoop.registry.client.api.RegistryOperations; import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService; import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.LambdaTestUtils; import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; @@ -595,7 +596,7 @@ public Object run() throws Exception { Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); // Waiting for SC-1 to time out. - Thread.sleep(800); + GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000); // SC1 should be initialized to be timed out Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size()); @@ -891,7 +892,7 @@ public Object run() throws Exception { Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize()); // Waiting for SC-1 to time out. - Thread.sleep(800); + GenericTestUtils.waitFor(() -> interceptor.getTimedOutSCs(true).size() == 1, 100, 1000); // SC1 should be initialized to be timed out Assert.assertEquals(1, interceptor.getTimedOutSCs(true).size()); From a357f9be9da29e8bf3e6662c447ca496d7d7c49a Mon Sep 17 00:00:00 2001 From: slfan1989 Date: Thu, 15 Jun 2023 06:53:07 +0800 Subject: [PATCH 3/3] YARN-11510. Fix CheckStyle. 
--- .../hadoop-yarn-common/src/main/resources/yarn-default.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index ab62f646c4602..395984e530247 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -5363,7 +5363,7 @@ yarn.nodemanager.amrmproxy.wait.uam-register.done false - + YARN Federation supports Non-HA mode.