Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2267,9 +2267,11 @@ public synchronized void restartNameNode(int nnIndex, boolean waitActive,
info.nameNode = nn;
info.setStartOpt(startOpt);
if (waitActive) {
waitClusterUp();
if (numDataNodes > 0) {
waitNameNodeUp(nnIndex);
}
LOG.info("Restarted the namenode");
waitActive();
waitActive(nnIndex);
}
}

Expand Down Expand Up @@ -2775,11 +2777,25 @@ public void waitActive(int nnIndex) throws IOException {
DFSClient client = new DFSClient(addr, conf);

// ensure all datanodes have registered and sent heartbeat to the namenode
while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE), addr)) {
int failedCount = 0;
while (true) {
try {
LOG.info("Waiting for cluster to become active");
Thread.sleep(100);
while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE), addr)) {
LOG.info("Waiting for cluster to become active");
Thread.sleep(100);
}
break;
} catch (IOException e) {
failedCount++;
// Cached RPC connection to namenode, if any, is expected to fail once
if (failedCount > 1) {
LOG.warn("Tried waitActive() " + failedCount
+ " time(s) and failed, giving up. " + StringUtils
.stringifyException(e));
throw e;
}
} catch (InterruptedException e) {
throw new IOException(e);
}
}

Expand Down Expand Up @@ -2815,22 +2831,7 @@ public Boolean get() {
*/
public void waitActive() throws IOException {
for (int index = 0; index < namenodes.size(); index++) {
int failedCount = 0;
while (true) {
try {
waitActive(index);
break;
} catch (IOException e) {
failedCount++;
// Cached RPC connection to namenode, if any, is expected to fail once
if (failedCount > 1) {
LOG.warn("Tried waitActive() " + failedCount
+ " time(s) and failed, giving up. "
+ StringUtils.stringifyException(e));
throw e;
}
}
}
waitActive(index);
}
LOG.info("Cluster is active");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,14 @@ public void testSetUpFederatedCluster() throws Exception {
DFSUtil.addKeySuffixes(
DFS_NAMENODE_HTTP_ADDRESS_KEY, "ns1", "nn1")));
}

// Shutdown namenodes individually.
cluster.shutdownNameNode(0);
cluster.shutdownNameNode(1);
Comment on lines +314 to +315
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we just do restarts rather than shutdown + restart?

Copy link
Member Author

@ayushtkn ayushtkn Dec 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that will repro the issue. A single restart will simply pass, since all the other namenodes are up. Maybe shutting down just one and restarting the other would repro it...
But that won't look good. My original use case was from an RBF patch, where I was restarting the namenodes exactly like this, so I kept the logic the same.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that will repro the issue. A single restart will simply pass, since all the other namenodes are up. Maybe shutting down just one and restarting the other would repro it...
But that won't look good. My original use case was from an RBF patch, where I was restarting the namenodes exactly like this, so I kept the logic the same.

Ah I see, yeah makes sense 👍


// Restart namenodes individually with wait active, both should be successful.
cluster.restartNameNode(0);
cluster.restartNameNode(1);
}
}
}